mirror of https://github.com/Jittor/Jittor
commit 5a30cd334f
@@ -500,300 +500,302 @@ def change_function():
self.padding, self.dilation, self.groups)
|
||||
return ret
|
||||
|
||||
class PoolACL(Function):
|
||||
# class PoolACL(Function):
|
||||
|
||||
def __init__(self,
|
||||
kernel_size,
|
||||
stride=None,
|
||||
padding=0,
|
||||
dilation=None,
|
||||
return_indices=None,
|
||||
ceil_mode=False,
|
||||
count_include_pad=True,
|
||||
op='maximum'):
|
||||
self.kernel_size = kernel_size if isinstance(
|
||||
kernel_size, tuple) else (kernel_size, kernel_size)
|
||||
stride = stride if stride else kernel_size
|
||||
self.stride = stride if isinstance(stride, tuple) else (stride,
|
||||
stride)
|
||||
self.padding = padding if isinstance(padding, tuple) else (padding,
|
||||
padding)
|
||||
dilation = dilation if dilation else 1
|
||||
assert dilation == 1
|
||||
self.dilation = dilation if isinstance(
|
||||
dilation, tuple) else (dilation, dilation)
|
||||
for item in self.kernel_size:
|
||||
if item <= 0:
|
||||
raise RuntimeError(
|
||||
f"kernel_size must be greater than zero, but got {item}"
|
||||
)
|
||||
for item in self.stride:
|
||||
if item <= 0:
|
||||
raise RuntimeError(
|
||||
f"stride must be greater than zero, but got {item}")
|
||||
for item in self.padding:
|
||||
if item < 0:
|
||||
raise RuntimeError(
|
||||
f"padding must be non-negative, but got {item}")
|
||||
self.op = op
|
||||
self.return_indices = return_indices
|
||||
self.ceil_mode = ceil_mode
|
||||
self.count_include_pad = count_include_pad
|
||||
# def __init__(self,
|
||||
# kernel_size,
|
||||
# stride=None,
|
||||
# padding=0,
|
||||
# dilation=None,
|
||||
# return_indices=None,
|
||||
# ceil_mode=False,
|
||||
# count_include_pad=True,
|
||||
# op='maximum'):
|
||||
# self.kernel_size = kernel_size if isinstance(
|
||||
# kernel_size, tuple) else (kernel_size, kernel_size)
|
||||
# stride = stride if stride else kernel_size
|
||||
# self.stride = stride if isinstance(stride, tuple) else (stride,
|
||||
# stride)
|
||||
# self.padding = padding if isinstance(padding, tuple) else (padding,
|
||||
# padding)
|
||||
# dilation = dilation if dilation else 1
|
||||
# assert dilation == 1
|
||||
# self.dilation = dilation if isinstance(
|
||||
# dilation, tuple) else (dilation, dilation)
|
||||
# for item in self.kernel_size:
|
||||
# if item <= 0:
|
||||
# raise RuntimeError(
|
||||
# f"kernel_size must be greater than zero, but got {item}"
|
||||
# )
|
||||
# for item in self.stride:
|
||||
# if item <= 0:
|
||||
# raise RuntimeError(
|
||||
# f"stride must be greater than zero, but got {item}")
|
||||
# for item in self.padding:
|
||||
# if item < 0:
|
||||
# raise RuntimeError(
|
||||
# f"padding must be non-negative, but got {item}")
|
||||
# self.op = op
|
||||
# self.return_indices = return_indices
|
||||
# self.ceil_mode = ceil_mode
|
||||
# self.count_include_pad = count_include_pad
|
||||
|
||||
def execute(self, input):
|
||||
self.input = input
|
||||
attr_code = f"""
|
||||
op.jt_name = "{"avgpool" if self.op == 'mean' else "maxpool"}";
|
||||
PoolAttr *attr = new PoolAttr();
|
||||
attr->kernel_size = {{ {self.kernel_size[0]}, {self.kernel_size[1]} }};
|
||||
attr->poolStrides = {{ {self.stride[0]}, {self.stride[1]} }};
|
||||
attr->poolPads = {{ {self.padding[0]}, {self.padding[1]} }};
|
||||
attr->poolDilations = {{ {self.dilation[0]}, {self.dilation[1]} }};
|
||||
attr->poolCeil = {"true" if self.ceil_mode else "false"};
|
||||
attr->countIncludePad = {"true" if self.count_include_pad else "false"};
|
||||
op.op_attr.reset(attr);
|
||||
"""
|
||||
input_height, input_width = input.shape[-2:]
|
||||
kernel_height, kernel_width = self.kernel_size[-2:]
|
||||
# def execute(self, input):
|
||||
# self.input = input
|
||||
# attr_code = f"""
|
||||
# op.jt_name = "{"avgpool" if self.op == 'mean' else "maxpool"}";
|
||||
# PoolAttr *attr = new PoolAttr();
|
||||
# attr->kernel_size = {{ {self.kernel_size[0]}, {self.kernel_size[1]} }};
|
||||
# attr->poolStrides = {{ {self.stride[0]}, {self.stride[1]} }};
|
||||
# attr->poolPads = {{ {self.padding[0]}, {self.padding[1]} }};
|
||||
# attr->poolDilations = {{ {self.dilation[0]}, {self.dilation[1]} }};
|
||||
# attr->poolCeil = {"true" if self.ceil_mode else "false"};
|
||||
# attr->countIncludePad = {"true" if self.count_include_pad else "false"};
|
||||
# op.op_attr.reset(attr);
|
||||
# """
|
||||
# input_height, input_width = input.shape[-2:]
|
||||
# kernel_height, kernel_width = self.kernel_size[-2:]
|
||||
|
||||
output_height = (input_height + 2 * self.padding[0] -
|
||||
(kernel_height - 1) - 1) // self.stride[0] + 1
|
||||
output_width = (input_width + 2 * self.padding[1] -
|
||||
(kernel_width - 1) - 1) // self.stride[1] + 1
|
||||
# output_height = (input_height + 2 * self.padding[0] -
|
||||
# (kernel_height - 1) - 1) // self.stride[0] + 1
|
||||
# output_width = (input_width + 2 * self.padding[1] -
|
||||
# (kernel_width - 1) - 1) // self.stride[1] + 1
|
||||
|
||||
output_shape = (input.shape[0], input.shape[1], output_height,
|
||||
output_width)
|
||||
# output_shape = (input.shape[0], input.shape[1], output_height,
|
||||
# output_width)
|
||||
|
||||
inputs = [input]
|
||||
# inputs = [input]
|
||||
|
||||
if self.op == 'maximum':
|
||||
result = acl_cmd(
|
||||
"Maxpool",
|
||||
inputs,
|
||||
output_dtypes=[input.dtype, 'int32'],
|
||||
output_shapes=[output_shape, output_shape],
|
||||
attr_code=attr_code,
|
||||
)
|
||||
elif self.op == 'mean':
|
||||
result = acl_cmd(
|
||||
"Avgpool",
|
||||
inputs,
|
||||
output_dtypes=[input.dtype],
|
||||
output_shapes=[output_shape],
|
||||
attr_code=attr_code,
|
||||
)
|
||||
else:
|
||||
raise ValueError('no this type pool')
|
||||
# if self.op == 'maximum':
|
||||
# result = acl_cmd(
|
||||
# "Maxpool",
|
||||
# inputs,
|
||||
# output_dtypes=[input.dtype, 'int32'],
|
||||
# output_shapes=[output_shape, output_shape],
|
||||
# attr_code=attr_code,
|
||||
# )
|
||||
# elif self.op == 'mean':
|
||||
# result = acl_cmd(
|
||||
# "Avgpool",
|
||||
# inputs,
|
||||
# output_dtypes=[input.dtype],
|
||||
# output_shapes=[output_shape],
|
||||
# attr_code=attr_code,
|
||||
# )
|
||||
# else:
|
||||
# raise ValueError('no this type pool')
|
||||
|
||||
if self.op == 'maximum':
|
||||
self.index = result[1]
|
||||
# if self.op == 'maximum':
|
||||
# self.index = result[1]
|
||||
|
||||
if self.return_indices:
|
||||
return result[0], result[1]
|
||||
else:
|
||||
return result[0]
|
||||
# if self.return_indices:
|
||||
# return result[0], result[1]
|
||||
# else:
|
||||
# return result[0]
|
||||
|
||||
def grad(self, grad_output):
|
||||
input = self.input
|
||||
attr_code = f"""
|
||||
op.jt_name = "{"avgpoolbackward" if self.op == 'mean' else "maxpoolbackward"}";
|
||||
PoolAttr *attr = new PoolAttr();
|
||||
attr->kernel_size = {{ {self.kernel_size[0]}, {self.kernel_size[1]} }};
|
||||
attr->poolStrides = {{ {self.stride[0]}, {self.stride[1]} }};
|
||||
attr->poolPads = {{ {self.padding[0]}, {self.padding[1]} }};
|
||||
attr->poolDilations = {{ {self.dilation[0]}, {self.dilation[1]} }};
|
||||
attr->poolCeil = {"true" if self.ceil_mode else "false"};
|
||||
attr->countIncludePad = {"true" if self.count_include_pad else "false"};
|
||||
op.op_attr.reset(attr);
|
||||
"""
|
||||
output_shapes = [input.shape]
|
||||
output_dtypes = [input.dtype]
|
||||
if self.op == 'maximum':
|
||||
result = acl_cmd("MaxpoolBackward",
|
||||
inputs=[grad_output, input, self.index],
|
||||
output_dtypes=output_dtypes,
|
||||
output_shapes=output_shapes,
|
||||
attr_code=attr_code)[0]
|
||||
elif self.op == 'mean':
|
||||
result = acl_cmd("AvgpoolBackward",
|
||||
inputs=[grad_output, input],
|
||||
output_dtypes=output_dtypes,
|
||||
output_shapes=output_shapes,
|
||||
attr_code=attr_code)[0]
|
||||
else:
|
||||
raise ValueError('no this type pool')
|
||||
return result
|
||||
# def grad(self, grad_output):
|
||||
# input = self.input
|
||||
# attr_code = f"""
|
||||
# op.jt_name = "{"avgpoolbackward" if self.op == 'mean' else "maxpoolbackward"}";
|
||||
# PoolAttr *attr = new PoolAttr();
|
||||
# attr->kernel_size = {{ {self.kernel_size[0]}, {self.kernel_size[1]} }};
|
||||
# attr->poolStrides = {{ {self.stride[0]}, {self.stride[1]} }};
|
||||
# attr->poolPads = {{ {self.padding[0]}, {self.padding[1]} }};
|
||||
# attr->poolDilations = {{ {self.dilation[0]}, {self.dilation[1]} }};
|
||||
# attr->poolCeil = {"true" if self.ceil_mode else "false"};
|
||||
# attr->countIncludePad = {"true" if self.count_include_pad else "false"};
|
||||
# op.op_attr.reset(attr);
|
||||
# """
|
||||
# output_shapes = [input.shape]
|
||||
# output_dtypes = [input.dtype]
|
||||
# if self.op == 'maximum':
|
||||
# result = acl_cmd("MaxpoolBackward",
|
||||
# inputs=[grad_output, input, self.index],
|
||||
# output_dtypes=output_dtypes,
|
||||
# output_shapes=output_shapes,
|
||||
# attr_code=attr_code)[0]
|
||||
# elif self.op == 'mean':
|
||||
# result = acl_cmd("AvgpoolBackward",
|
||||
# inputs=[grad_output, input],
|
||||
# output_dtypes=output_dtypes,
|
||||
# output_shapes=output_shapes,
|
||||
# attr_code=attr_code)[0]
|
||||
# else:
|
||||
# raise ValueError('no this type pool')
|
||||
# return result
|
||||
|
||||
class FlipACL(Function):
|
||||
# class FlipACL(Function):
|
||||
|
||||
def __init__(self):
|
||||
super(FlipACL, self).__init__()
|
||||
# def __init__(self):
|
||||
# super(FlipACL, self).__init__()
|
||||
|
||||
def execute(self, input, dim):
|
||||
if type(dim) is tuple:
|
||||
dim = list(dim)
|
||||
if type(dim) is not list:
|
||||
dim = [dim]
|
||||
attr_code = f"""
|
||||
op.jt_name = "flip";
|
||||
ReduceAttr *attr = new ReduceAttr();
|
||||
attr->axes = {{{', '.join(map(str, (list(dim))))}}};
|
||||
attr->prod_dim = {len(dim)};
|
||||
op.op_attr.reset(attr);
|
||||
"""
|
||||
self.attr_code = attr_code
|
||||
result = acl_cmd("Flip", [input],
|
||||
output_dtypes=[input.dtype],
|
||||
output_shapes=[input.shape],
|
||||
attr_code=self.attr_code)[0]
|
||||
return result
|
||||
# def execute(self, input, dim):
|
||||
# if type(dim) is tuple:
|
||||
# dim = list(dim)
|
||||
# if type(dim) is not list:
|
||||
# dim = [dim]
|
||||
# attr_code = f"""
|
||||
# op.jt_name = "flip";
|
||||
# ReduceAttr *attr = new ReduceAttr();
|
||||
# attr->axes = {{{', '.join(map(str, (list(dim))))}}};
|
||||
# attr->prod_dim = {len(dim)};
|
||||
# op.op_attr.reset(attr);
|
||||
# """
|
||||
# self.attr_code = attr_code
|
||||
# result = acl_cmd("Flip", [input],
|
||||
# output_dtypes=[input.dtype],
|
||||
# output_shapes=[input.shape],
|
||||
# attr_code=self.attr_code)[0]
|
||||
# return result
|
||||
|
||||
def grad(self, grad_output):
|
||||
grad_input = acl_cmd("Flip", [grad_output],
|
||||
output_dtypes=[grad_output.dtype],
|
||||
output_shapes=[grad_output.shape],
|
||||
attr_code=self.attr_code)[0]
|
||||
return grad_input
|
||||
# def grad(self, grad_output):
|
||||
# grad_input = acl_cmd("Flip", [grad_output],
|
||||
# output_dtypes=[grad_output.dtype],
|
||||
# output_shapes=[grad_output.shape],
|
||||
# attr_code=self.attr_code)[0]
|
||||
# return grad_input
|
||||
|
||||
from .aclops.flip_op import FlipACL
|
||||
def flip_acl(x, dim):
|
||||
return FlipACL()(x, dim)
|
||||
|
||||
class ConcatACL(Function):
|
||||
# class ConcatACL(Function):
|
||||
|
||||
def __init__(self):
|
||||
super(ConcatACL, self).__init__()
|
||||
# def __init__(self):
|
||||
# super(ConcatACL, self).__init__()
|
||||
|
||||
def __call__(self, *args):
|
||||
assert isinstance(args[0], list)
|
||||
assert isinstance(args[1], int)
|
||||
if jt.flags.no_grad:
|
||||
return self.execute(*args)
|
||||
backup = args
|
||||
args = list(args)
|
||||
taped_inputs = []
|
||||
taped_outputs = []
|
||||
input_mask = [-1] * (len(args[0]) + 1)
|
||||
newargs = [list(), args[1]]
|
||||
for i, v in enumerate(args[0]):
|
||||
if isinstance(v, jt.Var):
|
||||
if v.is_stop_grad():
|
||||
# -2 in input_mask represents it is stop_grad
|
||||
input_mask[i] = -2
|
||||
newargs[0].append(v)
|
||||
continue
|
||||
v = v.tape()
|
||||
newargs[0].append(v)
|
||||
input_mask[i] = len(taped_inputs)
|
||||
taped_inputs.append(v)
|
||||
# def __call__(self, *args):
|
||||
# assert isinstance(args[0], list)
|
||||
# assert isinstance(args[1], int)
|
||||
# if jt.flags.no_grad:
|
||||
# return self.execute(*args)
|
||||
# backup = args
|
||||
# args = list(args)
|
||||
# taped_inputs = []
|
||||
# taped_outputs = []
|
||||
# input_mask = [-1] * (len(args[0]) + 1)
|
||||
# newargs = [list(), args[1]]
|
||||
# for i, v in enumerate(args[0]):
|
||||
# if isinstance(v, jt.Var):
|
||||
# if v.is_stop_grad():
|
||||
# # -2 in input_mask represents it is stop_grad
|
||||
# input_mask[i] = -2
|
||||
# newargs[0].append(v)
|
||||
# continue
|
||||
# v = v.tape()
|
||||
# newargs[0].append(v)
|
||||
# input_mask[i] = len(taped_inputs)
|
||||
# taped_inputs.append(v)
|
||||
|
||||
ori_res = self.execute(*newargs)
|
||||
if not isinstance(ori_res, Sequence):
|
||||
res = [ori_res]
|
||||
else:
|
||||
res = list(ori_res)
|
||||
output_mask = [-1] * len(res)
|
||||
for i, v in enumerate(res):
|
||||
if isinstance(v, jt.Var):
|
||||
v = v.tape()
|
||||
output_mask[i] = len(taped_outputs)
|
||||
res[i] = v
|
||||
taped_outputs.append(v)
|
||||
self.input_mask = input_mask
|
||||
self.output_mask = output_mask
|
||||
# tape output and input together so
|
||||
# backward treat them as one operator
|
||||
jt.tape_together(taped_inputs, taped_outputs, self._grad)
|
||||
if isinstance(ori_res, Sequence):
|
||||
return res
|
||||
else:
|
||||
return res[0]
|
||||
# ori_res = self.execute(*newargs)
|
||||
# if not isinstance(ori_res, Sequence):
|
||||
# res = [ori_res]
|
||||
# else:
|
||||
# res = list(ori_res)
|
||||
# output_mask = [-1] * len(res)
|
||||
# for i, v in enumerate(res):
|
||||
# if isinstance(v, jt.Var):
|
||||
# v = v.tape()
|
||||
# output_mask[i] = len(taped_outputs)
|
||||
# res[i] = v
|
||||
# taped_outputs.append(v)
|
||||
# self.input_mask = input_mask
|
||||
# self.output_mask = output_mask
|
||||
# # tape output and input together so
|
||||
# # backward treat them as one operator
|
||||
# jt.tape_together(taped_inputs, taped_outputs, self._grad)
|
||||
# if isinstance(ori_res, Sequence):
|
||||
# return res
|
||||
# else:
|
||||
# return res[0]
|
||||
|
||||
def execute(self, input_tensors, dim=0):
|
||||
for _ in input_tensors:
|
||||
if not (-_.ndim <= dim < _.ndim):
|
||||
print(_.shape, dim)
|
||||
raise ValueError("dim out of range")
|
||||
# def execute(self, input_tensors, dim=0):
|
||||
# for _ in input_tensors:
|
||||
# if not (-_.ndim <= dim < _.ndim):
|
||||
# print(_.shape, dim)
|
||||
# raise ValueError("dim out of range")
|
||||
|
||||
if dim < 0:
|
||||
dim += input_tensors[0].ndim
|
||||
# if dim < 0:
|
||||
# dim += input_tensors[0].ndim
|
||||
|
||||
self.input = input_tensors
|
||||
self.dim = dim
|
||||
for i in range(len(input_tensors)):
|
||||
if input_tensors[i].dtype != input_tensors[0].dtype:
|
||||
raise ValueError(
|
||||
"All input tensors must have the same dtype")
|
||||
if input_tensors[i].shape[:dim] != input_tensors[
|
||||
0].shape[:dim] or input_tensors[i].shape[
|
||||
dim + 1:] != input_tensors[0].shape[dim + 1:]:
|
||||
raise ValueError(
|
||||
"All input tensors must have the same shape")
|
||||
attr_code = f"""
|
||||
op.jt_name = "concat";
|
||||
ConcatAttr *attr = new ConcatAttr();
|
||||
attr->tensorNum = {len(input_tensors)};
|
||||
attr->dim = {dim};
|
||||
op.op_attr.reset(attr);
|
||||
"""
|
||||
result = acl_cmd(
|
||||
"Concat",
|
||||
input_tensors,
|
||||
output_dtypes=[input_tensors[0].dtype],
|
||||
output_shapes=[
|
||||
jt.empty(self.calculate_output_shape(input_tensors,
|
||||
dim)).shape
|
||||
],
|
||||
attr_code=attr_code)[0]
|
||||
return result
|
||||
# self.input = input_tensors
|
||||
# self.dim = dim
|
||||
# for i in range(len(input_tensors)):
|
||||
# if input_tensors[i].dtype != input_tensors[0].dtype:
|
||||
# raise ValueError(
|
||||
# "All input tensors must have the same dtype")
|
||||
# if input_tensors[i].shape[:dim] != input_tensors[
|
||||
# 0].shape[:dim] or input_tensors[i].shape[
|
||||
# dim + 1:] != input_tensors[0].shape[dim + 1:]:
|
||||
# raise ValueError(
|
||||
# "All input tensors must have the same shape")
|
||||
# attr_code = f"""
|
||||
# op.jt_name = "concat";
|
||||
# ConcatAttr *attr = new ConcatAttr();
|
||||
# attr->tensorNum = {len(input_tensors)};
|
||||
# attr->dim = {dim};
|
||||
# op.op_attr.reset(attr);
|
||||
# """
|
||||
# result = acl_cmd(
|
||||
# "Concat",
|
||||
# input_tensors,
|
||||
# output_dtypes=[input_tensors[0].dtype],
|
||||
# output_shapes=[
|
||||
# jt.empty(self.calculate_output_shape(input_tensors,
|
||||
# dim)).shape
|
||||
# ],
|
||||
# attr_code=attr_code)[0]
|
||||
# return result
|
||||
|
||||
def _grad(self, *args):
|
||||
new_args = ((args[i] if i >= 0 else None)
|
||||
for i in self.output_mask)
|
||||
ret = self.grad(*new_args)
|
||||
new_ret = []
|
||||
for i, r in enumerate(ret):
|
||||
j = self.input_mask[i]
|
||||
if j < 0:
|
||||
# -2 in input_mask represents it is stop_grad
|
||||
assert r is None or j==-2, f"{type(self)}'s {i}-th returned grad should be None, "\
|
||||
"because the input value is not jittor variable."
|
||||
else:
|
||||
new_ret.append(r)
|
||||
return new_ret
|
||||
# def _grad(self, *args):
|
||||
# new_args = ((args[i] if i >= 0 else None)
|
||||
# for i in self.output_mask)
|
||||
# ret = self.grad(*new_args)
|
||||
# new_ret = []
|
||||
# for i, r in enumerate(ret):
|
||||
# j = self.input_mask[i]
|
||||
# if j < 0:
|
||||
# # -2 in input_mask represents it is stop_grad
|
||||
# assert r is None or j==-2, f"{type(self)}'s {i}-th returned grad should be None, "\
|
||||
# "because the input value is not jittor variable."
|
||||
# else:
|
||||
# new_ret.append(r)
|
||||
# return new_ret
|
||||
|
||||
def grad(self, grad_output):
|
||||
grad_inputs = self.split_grad(grad_output, self.input, self.dim)
|
||||
return grad_inputs
|
||||
# def grad(self, grad_output):
|
||||
# grad_inputs = self.split_grad(grad_output, self.input, self.dim)
|
||||
# return grad_inputs
|
||||
|
||||
def calculate_output_shape(self, input_tensors, axis):
|
||||
shape = list(input_tensors[0].shape)
|
||||
for tensor in input_tensors[1:]:
|
||||
shape[axis] += tensor.shape[axis]
|
||||
return tuple(shape)
|
||||
# def calculate_output_shape(self, input_tensors, axis):
|
||||
# shape = list(input_tensors[0].shape)
|
||||
# for tensor in input_tensors[1:]:
|
||||
# shape[axis] += tensor.shape[axis]
|
||||
# return tuple(shape)
|
||||
|
||||
def split_grad(self, grad_output, input_tensors, axis):
|
||||
offset = []
|
||||
shapeVec = []
|
||||
dtypeVec = []
|
||||
for tensor in input_tensors:
|
||||
offset.append(tensor.shape[axis])
|
||||
dtypeVec.append(tensor.dtype)
|
||||
shapeVec.append(tensor.shape)
|
||||
# def split_grad(self, grad_output, input_tensors, axis):
|
||||
# offset = []
|
||||
# shapeVec = []
|
||||
# dtypeVec = []
|
||||
# for tensor in input_tensors:
|
||||
# offset.append(tensor.shape[axis])
|
||||
# dtypeVec.append(tensor.dtype)
|
||||
# shapeVec.append(tensor.shape)
|
||||
|
||||
attr_code = f"""
|
||||
op.jt_name = "splitwithsize";
|
||||
auto *attr = new SplitWithSizeAttr();
|
||||
attr->splitSize = {{ {", ".join(map(str, offset))} }};
|
||||
attr->dim = {axis};
|
||||
op.op_attr.reset(attr);
|
||||
"""
|
||||
# attr_code = f"""
|
||||
# op.jt_name = "splitwithsize";
|
||||
# auto *attr = new SplitWithSizeAttr();
|
||||
# attr->splitSize = {{ {", ".join(map(str, offset))} }};
|
||||
# attr->dim = {axis};
|
||||
# op.op_attr.reset(attr);
|
||||
# """
|
||||
|
||||
result = acl_cmd("SplitWithSize", [grad_output],
|
||||
output_dtypes=dtypeVec,
|
||||
output_shapes=shapeVec,
|
||||
attr_code=attr_code)
|
||||
return result
|
||||
# result = acl_cmd("SplitWithSize", [grad_output],
|
||||
# output_dtypes=dtypeVec,
|
||||
# output_shapes=shapeVec,
|
||||
# attr_code=attr_code)
|
||||
# return result
|
||||
|
||||
from .aclops.concat_op import ConcatACL
|
||||
def concat(x, dim=0):
|
||||
return ConcatACL()(x, dim)
|
||||
|
||||
|
@@ -2692,6 +2694,8 @@ def change_function():
jt.nn.conv2d = warp(jt.nn.conv2d, conv_acl)
|
||||
jt.nn.Conv2d = warp(jt.nn.Conv2d, Conv2D)
|
||||
jt.nn.Conv = warp(jt.nn.Conv, Conv2D)
|
||||
|
||||
from .aclops.pool_op import PoolACL
|
||||
jt.nn.Pool = warp(jt.nn.Pool, PoolACL)
|
||||
|
||||
jt.flip = warp(jt.flip, flip_acl)
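For context, a minimal usage sketch of the entry points patched above, assuming an Ascend/ACL-enabled Jittor build; the flag name and shapes below are illustrative assumptions, not taken from this diff:

# --- illustrative usage, not part of the diff ---
import jittor as jt
jt.flags.use_acl = 1                           # assumption: ACL backend selected via this flag

x = jt.rand(2, 3, 8, 8)
pool = jt.nn.Pool(2, stride=2, op='maximum')   # now dispatches to PoolACL
y = pool(x)                                    # expected shape (2, 3, 4, 4)
z = jt.flip(x, (2, 3))                         # routed through flip_acl / FlipACL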
|
||||
|
|
|
@@ -309,63 +309,63 @@ namespace jittor
ret = it->second.getWorkspaceSizeFuncConvBackward(inputTensors[0], inputTensors[1], inputTensors[2], biasSizes, strides, pads, dilations, false, outPads, attr->group, outMask, 0, outputTensors[0], outputTensors[1], outputTensors[2], &workspaceSize, &executor);
|
||||
break;
|
||||
}
|
||||
case 22:
|
||||
{
|
||||
auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
|
||||
kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
|
||||
strides = aclCreateIntArray(attr->poolStrides.data(), 2);
|
||||
pads = aclCreateIntArray(attr->poolPads.data(), 2);
|
||||
dilations = aclCreateIntArray(attr->poolDilations.data(), 2);
|
||||
ret = it->second.getWorkspaceSizeFuncMaxPool(inputTensors[0], kernel_size, strides, pads, dilations, attr->poolCeil, outputTensors[0], outputTensors[1], &workspaceSize, &executor);
|
||||
break;
|
||||
}
|
||||
case 23:
|
||||
{
|
||||
auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
|
||||
kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
|
||||
strides = aclCreateIntArray(attr->poolStrides.data(), 2);
|
||||
pads = aclCreateIntArray(attr->poolPads.data(), 2);
|
||||
dilations = aclCreateIntArray(attr->poolDilations.data(), 2);
|
||||
ret = it->second.getWorkspaceSizeFuncMaxPoolBackward(inputTensors[0], inputTensors[1], inputTensors[2], kernel_size, strides, pads, dilations, attr->poolCeil, outputTensors[0], &workspaceSize, &executor);
|
||||
break;
|
||||
}
|
||||
case 24:
|
||||
{
|
||||
auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
|
||||
kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
|
||||
strides = aclCreateIntArray(attr->poolStrides.data(), 2);
|
||||
pads = aclCreateIntArray(attr->poolPads.data(), 2);
|
||||
ret = it->second.getWorkspaceSizeFuncAvgPool(inputTensors[0], kernel_size, strides, pads, attr->poolCeil, attr->countIncludePad, attr->divisorOverride, attr->divisorOverride, outputTensors[0], &workspaceSize, &executor);
|
||||
break;
|
||||
}
|
||||
case 25:
|
||||
{
|
||||
auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
|
||||
kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
|
||||
strides = aclCreateIntArray(attr->poolStrides.data(), 2);
|
||||
pads = aclCreateIntArray(attr->poolPads.data(), 2);
|
||||
ret = it->second.getWorkspaceSizeFuncAvgPoolBackward(inputTensors[0], inputTensors[1], kernel_size, strides, pads, attr->countIncludePad, attr->divisorOverride, attr->divisorOverride, attr->poolCeil, outputTensors[0], &workspaceSize, &executor);
|
||||
break;
|
||||
}
|
||||
case 26:
|
||||
{
|
||||
auto attr = dynamic_cast<ReduceAttr *>(op_attr.get());
|
||||
dim = aclCreateIntArray(attr->axes.data(), attr->axes.size());
|
||||
ret = it->second.getWorkspaceSizeFuncExpand(inputTensors[0], dim, outputTensors[0], &workspaceSize, &executor);
|
||||
break;
|
||||
}
|
||||
case 27:
|
||||
{
|
||||
std::vector<aclTensor *> concatTensorList = {};
|
||||
for (int i = 0; i < input_num; i++)
|
||||
{
|
||||
concatTensorList.push_back(inputTensors[i]);
|
||||
}
|
||||
auto concatTensorListInput = aclCreateTensorList(&concatTensorList[0], input_num);
|
||||
auto attr = dynamic_cast<ConcatAttr *>(op_attr.get());
|
||||
ret = it->second.getWorkspaceSizeFuncConcat(concatTensorListInput, attr->dim, outputTensors[0], &workspaceSize, &executor);
|
||||
break;
|
||||
}
|
||||
// case 22:
|
||||
// {
|
||||
// auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
|
||||
// kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
|
||||
// strides = aclCreateIntArray(attr->poolStrides.data(), 2);
|
||||
// pads = aclCreateIntArray(attr->poolPads.data(), 2);
|
||||
// dilations = aclCreateIntArray(attr->poolDilations.data(), 2);
|
||||
// ret = it->second.getWorkspaceSizeFuncMaxPool(inputTensors[0], kernel_size, strides, pads, dilations, attr->poolCeil, outputTensors[0], outputTensors[1], &workspaceSize, &executor);
|
||||
// break;
|
||||
// }
|
||||
// case 23:
|
||||
// {
|
||||
// auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
|
||||
// kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
|
||||
// strides = aclCreateIntArray(attr->poolStrides.data(), 2);
|
||||
// pads = aclCreateIntArray(attr->poolPads.data(), 2);
|
||||
// dilations = aclCreateIntArray(attr->poolDilations.data(), 2);
|
||||
// ret = it->second.getWorkspaceSizeFuncMaxPoolBackward(inputTensors[0], inputTensors[1], inputTensors[2], kernel_size, strides, pads, dilations, attr->poolCeil, outputTensors[0], &workspaceSize, &executor);
|
||||
// break;
|
||||
// }
|
||||
// case 24:
|
||||
// {
|
||||
// auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
|
||||
// kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
|
||||
// strides = aclCreateIntArray(attr->poolStrides.data(), 2);
|
||||
// pads = aclCreateIntArray(attr->poolPads.data(), 2);
|
||||
// ret = it->second.getWorkspaceSizeFuncAvgPool(inputTensors[0], kernel_size, strides, pads, attr->poolCeil, attr->countIncludePad, attr->divisorOverride, attr->divisorOverride, outputTensors[0], &workspaceSize, &executor);
|
||||
// break;
|
||||
// }
|
||||
// case 25:
|
||||
// {
|
||||
// auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
|
||||
// kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
|
||||
// strides = aclCreateIntArray(attr->poolStrides.data(), 2);
|
||||
// pads = aclCreateIntArray(attr->poolPads.data(), 2);
|
||||
// ret = it->second.getWorkspaceSizeFuncAvgPoolBackward(inputTensors[0], inputTensors[1], kernel_size, strides, pads, attr->countIncludePad, attr->divisorOverride, attr->divisorOverride, attr->poolCeil, outputTensors[0], &workspaceSize, &executor);
|
||||
// break;
|
||||
// }
|
||||
// case 26:
|
||||
// {
|
||||
// auto attr = dynamic_cast<ReduceAttr *>(op_attr.get());
|
||||
// dim = aclCreateIntArray(attr->axes.data(), attr->axes.size());
|
||||
// ret = it->second.getWorkspaceSizeFuncExpand(inputTensors[0], dim, outputTensors[0], &workspaceSize, &executor);
|
||||
// break;
|
||||
// }
|
||||
// case 27:
|
||||
// {
|
||||
// std::vector<aclTensor *> concatTensorList = {};
|
||||
// for (int i = 0; i < input_num; i++)
|
||||
// {
|
||||
// concatTensorList.push_back(inputTensors[i]);
|
||||
// }
|
||||
// auto concatTensorListInput = aclCreateTensorList(&concatTensorList[0], input_num);
|
||||
// auto attr = dynamic_cast<ConcatAttr *>(op_attr.get());
|
||||
// ret = it->second.getWorkspaceSizeFuncConcat(concatTensorListInput, attr->dim, outputTensors[0], &workspaceSize, &executor);
|
||||
// break;
|
||||
// }
|
||||
case 28:
|
||||
{
|
||||
auto attr = dynamic_cast<GatherAttr *>(op_attr.get());
|
||||
|
@@ -510,14 +510,14 @@ namespace jittor
// ret = it->second.getWorkspaceSizeFuncBinary(inputTensors[0], inputTensors[1], outputTensors[0], &workspaceSize, &executor);
|
||||
// break;
|
||||
// }
|
||||
case 50:
|
||||
{
|
||||
auto attr = dynamic_cast<SplitWithSizeAttr *>(op_attr.get());
|
||||
auto splitSize = aclCreateIntArray(attr->splitSize.data(), attr->splitSize.size());
|
||||
auto tensorList = aclCreateTensorList(&outputTensors[0], output_num);
|
||||
ret = it->second.getWorkspaceSizeFuncSplitWithSize(inputTensors[0], splitSize, attr->dim, tensorList, &workspaceSize, &executor);
|
||||
break;
|
||||
}
|
||||
// case 50:
|
||||
// {
|
||||
// auto attr = dynamic_cast<SplitWithSizeAttr *>(op_attr.get());
|
||||
// auto splitSize = aclCreateIntArray(attr->splitSize.data(), attr->splitSize.size());
|
||||
// auto tensorList = aclCreateTensorList(&outputTensors[0], output_num);
|
||||
// ret = it->second.getWorkspaceSizeFuncSplitWithSize(inputTensors[0], splitSize, attr->dim, tensorList, &workspaceSize, &executor);
|
||||
// break;
|
||||
// }
|
||||
case 51:
|
||||
{
|
||||
auto attr = dynamic_cast<FlashAttentionAttr *>(op_attr.get());
|
||||
|
|
|
@@ -10,3 +10,6 @@
#include <acl/aclops/matmul_op_acl.h>
|
||||
#include <acl/aclops/random_op_acl.h>
|
||||
#include <acl/aclops/bmm_op_acl.h>
|
||||
#include <acl/aclops/pool_op_acl.h>
|
||||
#include <acl/aclops/flip_op_acl.h>
|
||||
#include <acl/aclops/concat_op_acl.h>
|
||||
|
|
|
@@ -0,0 +1,188 @@
import os
|
||||
from jittor_utils import env_or_try_find
|
||||
import jittor_utils
|
||||
import ctypes
|
||||
import glob
|
||||
import jittor.compiler as compiler
|
||||
import jittor as jt
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
from typing import Union
|
||||
from collections.abc import Sequence, Iterable
|
||||
|
||||
def concat_cmd(name: str,
|
||||
inputs: list,
|
||||
output_dtypes: list = None,
|
||||
output_shapes: list = None,
|
||||
attr_code: str = "",
|
||||
attr_header: str = "",
|
||||
outputs: list = None):
|
||||
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
|
||||
|
||||
cuda_header = '''
|
||||
#include "acl/aclops/aclops.h"
|
||||
'''
|
||||
outputs_ = []
|
||||
if outputs is not None:
|
||||
outputs_ = outputs
|
||||
else:
|
||||
assert output_dtypes is not None
|
||||
assert output_shapes is not None
|
||||
assert len(output_dtypes) == len(output_shapes)
|
||||
for i in range(len(output_shapes)):
|
||||
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
|
||||
input_code = ''
|
||||
for i in range(len(inputs)):
|
||||
input_code += f"op.add(in{i}, true);\n"
|
||||
|
||||
output_code = ''
|
||||
for i in range(len(outputs_)):
|
||||
output_code += f"op.add(out{i}, false);\n"
|
||||
return jt.code(outputs=outputs_,
|
||||
inputs=inputs,
|
||||
cuda_header=attr_header + cuda_header,
|
||||
cuda_src=f"""
|
||||
|
||||
// aclop
|
||||
{name}OpRunner op;
|
||||
{input_code}
|
||||
{output_code}
|
||||
{attr_code}
|
||||
op.run();""")
|
||||
|
||||
class ConcatACL(jt.Function):
|
||||
|
||||
def __init__(self):
|
||||
super(ConcatACL, self).__init__()
|
||||
|
||||
def __call__(self, *args):
|
||||
assert isinstance(args[0], list)
|
||||
assert isinstance(args[1], int)
|
||||
if jt.flags.no_grad:
|
||||
return self.execute(*args)
|
||||
backup = args
|
||||
args = list(args)
|
||||
taped_inputs = []
|
||||
taped_outputs = []
|
||||
input_mask = [-1] * (len(args[0]) + 1)
|
||||
newargs = [list(), args[1]]
|
||||
for i, v in enumerate(args[0]):
|
||||
if isinstance(v, jt.Var):
|
||||
if v.is_stop_grad():
|
||||
# -2 in input_mask represents it is stop_grad
|
||||
input_mask[i] = -2
|
||||
newargs[0].append(v)
|
||||
continue
|
||||
v = v.tape()
|
||||
newargs[0].append(v)
|
||||
input_mask[i] = len(taped_inputs)
|
||||
taped_inputs.append(v)
|
||||
|
||||
ori_res = self.execute(*newargs)
|
||||
if not isinstance(ori_res, Sequence):
|
||||
res = [ori_res]
|
||||
else:
|
||||
res = list(ori_res)
|
||||
output_mask = [-1] * len(res)
|
||||
for i, v in enumerate(res):
|
||||
if isinstance(v, jt.Var):
|
||||
v = v.tape()
|
||||
output_mask[i] = len(taped_outputs)
|
||||
res[i] = v
|
||||
taped_outputs.append(v)
|
||||
self.input_mask = input_mask
|
||||
self.output_mask = output_mask
|
||||
# tape output and input together so
|
||||
# backward treat them as one operator
|
||||
jt.tape_together(taped_inputs, taped_outputs, self._grad)
|
||||
if isinstance(ori_res, Sequence):
|
||||
return res
|
||||
else:
|
||||
return res[0]
|
||||
|
||||
def execute(self, input_tensors, dim=0):
|
||||
for _ in input_tensors:
|
||||
if not (-_.ndim <= dim < _.ndim):
|
||||
raise ValueError(f"dim {dim} out of range for input of shape {_.shape}")
|
||||
|
||||
if dim < 0:
|
||||
dim += input_tensors[0].ndim
|
||||
|
||||
self.input = input_tensors
|
||||
self.dim = dim
|
||||
for i in range(len(input_tensors)):
|
||||
if input_tensors[i].dtype != input_tensors[0].dtype:
|
||||
raise ValueError(
|
||||
"All input tensors must have the same dtype")
|
||||
if input_tensors[i].shape[:dim] != input_tensors[
|
||||
0].shape[:dim] or input_tensors[i].shape[
|
||||
dim + 1:] != input_tensors[0].shape[dim + 1:]:
|
||||
raise ValueError(
|
||||
"All input tensors must have the same shape")
|
||||
attr_code = f"""
|
||||
op.jt_name = "concat";
|
||||
ConcatAttr *attr = new ConcatAttr();
|
||||
attr->tensorNum = {len(input_tensors)};
|
||||
attr->dim = {dim};
|
||||
op.op_attr.reset(attr);
|
||||
"""
|
||||
result = concat_cmd(
|
||||
"Concat",
|
||||
input_tensors,
|
||||
output_dtypes=[input_tensors[0].dtype],
|
||||
output_shapes=[
|
||||
jt.empty(self.calculate_output_shape(input_tensors,
|
||||
dim)).shape
|
||||
],
|
||||
attr_code=attr_code)[0]
|
||||
return result
|
||||
|
||||
def _grad(self, *args):
|
||||
new_args = ((args[i] if i >= 0 else None)
|
||||
for i in self.output_mask)
|
||||
ret = self.grad(*new_args)
|
||||
new_ret = []
|
||||
for i, r in enumerate(ret):
|
||||
j = self.input_mask[i]
|
||||
if j < 0:
|
||||
# -2 in input_mask represents it is stop_grad
|
||||
assert r is None or j==-2, f"{type(self)}'s {i}-th returned grad should be None, "\
|
||||
"because the input value is not jittor variable."
|
||||
else:
|
||||
new_ret.append(r)
|
||||
return new_ret
|
||||
|
||||
def grad(self, grad_output):
|
||||
grad_inputs = self.split_grad(grad_output, self.input, self.dim)
|
||||
return grad_inputs
|
||||
|
||||
def calculate_output_shape(self, input_tensors, axis):
|
||||
shape = list(input_tensors[0].shape)
|
||||
for tensor in input_tensors[1:]:
|
||||
shape[axis] += tensor.shape[axis]
|
||||
return tuple(shape)
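A quick illustration of the shape bookkeeping above (the values are made up):

# inputs of shape (2, 3, 4), (2, 5, 4) and (2, 1, 4), concatenated on axis=1,
# give calculate_output_shape(...) == (2, 3 + 5 + 1, 4) == (2, 9, 4):
# only the concat axis grows, every other axis must already match.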
|
||||
|
||||
def split_grad(self, grad_output, input_tensors, axis):
|
||||
offset = []
|
||||
shapeVec = []
|
||||
dtypeVec = []
|
||||
for tensor in input_tensors:
|
||||
offset.append(tensor.shape[axis])
|
||||
dtypeVec.append(tensor.dtype)
|
||||
shapeVec.append(tensor.shape)
|
||||
|
||||
attr_code = f"""
|
||||
op.jt_name = "splitwithsize";
|
||||
auto *attr = new SplitWithSizeAttr();
|
||||
attr->splitSize = {{ {", ".join(map(str, offset))} }};
|
||||
attr->dim = {axis};
|
||||
op.op_attr.reset(attr);
|
||||
"""
|
||||
|
||||
result = concat_cmd("SplitWithSize", [grad_output],
|
||||
output_dtypes=dtypeVec,
|
||||
output_shapes=shapeVec,
|
||||
attr_code=attr_code)
|
||||
return result
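A minimal, hypothetical usage sketch for this new module; the absolute import path is an assumption (inside the repo it is imported relatively as .aclops.concat_op):

# --- illustrative usage, not part of the diff ---
import jittor as jt
from jittor.extern.acl.aclops.concat_op import ConcatACL   # hypothetical path

a = jt.rand(2, 3, 4)
b = jt.rand(2, 5, 4)
out = ConcatACL()([a, b], 1)           # forward runs aclnnCat, expected shape (2, 8, 4)
ga, gb = jt.grad(out.sum(), [a, b])    # backward splits the gradient via aclnnSplitWithSize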
|
|
@@ -0,0 +1,89 @@
#pragma once
|
||||
#include <acl/acl.h>
|
||||
#include <acl/acl_op_compiler.h>
|
||||
#include <Python.h>
|
||||
#include <pystate.h>
|
||||
#include <algorithm>
|
||||
#include <queue>
|
||||
#include <set>
|
||||
#include "common.h"
|
||||
#include "op.h"
|
||||
#include "acl_jittor.h"
|
||||
#include "ops/random_op.h"
|
||||
#include "ops/reduce_op.h"
|
||||
#include "ops/binary_op.h"
|
||||
#include "ops/broadcast_to_op.h"
|
||||
#include "ops/transpose_op.h"
|
||||
#include "ops/array_op.h"
|
||||
#include "ops/code_op.h"
|
||||
#include "fused_op.h"
|
||||
#include "ops/unary_op.h"
|
||||
#include "ops/ternary_op.h"
|
||||
#include "executor.h"
|
||||
#include "misc/cuda_flags.h"
|
||||
#include "mem/allocator.h"
|
||||
#include "op_compiler.h"
|
||||
#include "ops/op_register.h"
|
||||
#include "opt/tuner_manager.h"
|
||||
#include "utils/str_utils.h"
|
||||
#include "aclnn/aclnn.h"
|
||||
#include "concat_op_acl.h"
|
||||
|
||||
namespace jittor
|
||||
{
|
||||
ConcatOpRunner::ConcatOpRunner() : BaseOpRunner("Concat")
|
||||
{
|
||||
}
|
||||
|
||||
void ConcatOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
|
||||
{
|
||||
auto input_num = in_.size();
|
||||
std::vector<aclTensor *> concatTensorList = {};
|
||||
for (int i = 0; i < input_num; i++)
|
||||
{
|
||||
concatTensorList.push_back(inputTensors[i]);
|
||||
}
|
||||
auto concatTensorListInput = aclCreateTensorList(&concatTensorList[0], input_num);
|
||||
auto attr = dynamic_cast<ConcatAttr *>(op_attr.get());
|
||||
ret = aclnnCatGetWorkspaceSize(concatTensorListInput, attr->dim, outputTensors[0], &workspaceSize, &executor);
|
||||
checkRet(ret);
|
||||
|
||||
if (workspaceSize > 0)
|
||||
{
|
||||
mallocWorkSpace(workspaceSize);
|
||||
}
|
||||
|
||||
ret = aclnnCat(workspaceAddr, workspaceSize, executor, aclstream);
|
||||
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnCat failed. ERROR: %d\n", name.c_str(), ret); return);
|
||||
|
||||
syncRun();
|
||||
return;
|
||||
}
|
||||
|
||||
SplitWithSizeOpRunner::SplitWithSizeOpRunner() : BaseOpRunner("SplitWithSize")
|
||||
{
|
||||
}
|
||||
|
||||
void SplitWithSizeOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
|
||||
{
|
||||
auto output_num = out_.size();
|
||||
auto attr = dynamic_cast<SplitWithSizeAttr *>(op_attr.get());
|
||||
auto splitSize = aclCreateIntArray(attr->splitSize.data(), attr->splitSize.size());
|
||||
auto tensorList = aclCreateTensorList(&outputTensors[0], output_num);
|
||||
ret = aclnnSplitWithSizeGetWorkspaceSize(inputTensors[0], splitSize, attr->dim, tensorList, &workspaceSize, &executor);
|
||||
|
||||
checkRet(ret);
|
||||
|
||||
if (workspaceSize > 0)
|
||||
{
|
||||
mallocWorkSpace(workspaceSize);
|
||||
}
|
||||
|
||||
ret = aclnnSplitWithSize(workspaceAddr, workspaceSize, executor, aclstream);
|
||||
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSplitWithSize failed. ERROR: %d\n", name.c_str(), ret); return);
|
||||
|
||||
syncRun();
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
|
@@ -0,0 +1,24 @@
#pragma once
|
||||
#include "utils.h"
|
||||
#include "base_op.h"
|
||||
|
||||
namespace jittor
|
||||
{
|
||||
class ConcatOpRunner : public BaseOpRunner
|
||||
{
|
||||
|
||||
protected:
|
||||
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
|
||||
public:
|
||||
ConcatOpRunner();
|
||||
};
|
||||
|
||||
class SplitWithSizeOpRunner : public BaseOpRunner
|
||||
{
|
||||
|
||||
protected:
|
||||
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
|
||||
public:
|
||||
SplitWithSizeOpRunner();
|
||||
};
|
||||
}
|
|
@@ -0,0 +1,83 @@
import os
|
||||
from jittor_utils import env_or_try_find
|
||||
import jittor_utils
|
||||
import ctypes
|
||||
import glob
|
||||
import jittor.compiler as compiler
|
||||
import jittor as jt
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
from typing import Union
|
||||
from collections.abc import Sequence, Iterable
|
||||
|
||||
def flip_cmd(name: str,
|
||||
inputs: list,
|
||||
output_dtypes: list = None,
|
||||
output_shapes: list = None,
|
||||
attr_code: str = "",
|
||||
attr_header: str = "",
|
||||
outputs: list = None):
|
||||
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
|
||||
|
||||
cuda_header = '''
|
||||
#include "acl/aclops/aclops.h"
|
||||
'''
|
||||
outputs_ = []
|
||||
if outputs is not None:
|
||||
outputs_ = outputs
|
||||
else:
|
||||
assert output_dtypes is not None
|
||||
assert output_shapes is not None
|
||||
assert len(output_dtypes) == len(output_shapes)
|
||||
for i in range(len(output_shapes)):
|
||||
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
|
||||
input_code = ''
|
||||
for i in range(len(inputs)):
|
||||
input_code += f"op.add(in{i}, true);\n"
|
||||
|
||||
output_code = ''
|
||||
for i in range(len(outputs_)):
|
||||
output_code += f"op.add(out{i}, false);\n"
|
||||
return jt.code(outputs=outputs_,
|
||||
inputs=inputs,
|
||||
cuda_header=attr_header + cuda_header,
|
||||
cuda_src=f"""
|
||||
|
||||
// aclop
|
||||
{name}OpRunner op;
|
||||
{input_code}
|
||||
{output_code}
|
||||
{attr_code}
|
||||
op.run();""")
|
||||
|
||||
class FlipACL(jt.Function):
|
||||
|
||||
def __init__(self):
|
||||
super(FlipACL, self).__init__()
|
||||
|
||||
def execute(self, input, dim):
|
||||
if type(dim) is tuple:
|
||||
dim = list(dim)
|
||||
if type(dim) is not list:
|
||||
dim = [dim]
|
||||
attr_code = f"""
|
||||
op.jt_name = "flip";
|
||||
ReduceAttr *attr = new ReduceAttr();
|
||||
attr->axes = {{{', '.join(map(str, (list(dim))))}}};
|
||||
attr->prod_dim = {len(dim)};
|
||||
op.op_attr.reset(attr);
|
||||
"""
|
||||
self.attr_code = attr_code
|
||||
result = flip_cmd("Flip", [input],
|
||||
output_dtypes=[input.dtype],
|
||||
output_shapes=[input.shape],
|
||||
attr_code=self.attr_code)[0]
|
||||
return result
|
||||
|
||||
def grad(self, grad_output):
|
||||
grad_input = flip_cmd("Flip", [grad_output],
|
||||
output_dtypes=[grad_output.dtype],
|
||||
output_shapes=[grad_output.shape],
|
||||
attr_code=self.attr_code)[0]
|
||||
return grad_input
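A minimal, hypothetical usage sketch; the import path is an assumption mirroring the relative import .aclops.flip_op used elsewhere in this commit:

# --- illustrative usage, not part of the diff ---
import jittor as jt
from jittor.extern.acl.aclops.flip_op import FlipACL   # hypothetical path

x = jt.array([[1., 2., 3.], [4., 5., 6.]])
y = FlipACL()(x, (0, 1))          # reverses both axes via aclnnFlip
# expected: [[6., 5., 4.], [3., 2., 1.]]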
|
|
@@ -0,0 +1,58 @@
#pragma once
|
||||
#include <acl/acl.h>
|
||||
#include <acl/acl_op_compiler.h>
|
||||
#include <Python.h>
|
||||
#include <pystate.h>
|
||||
#include <algorithm>
|
||||
#include <queue>
|
||||
#include <set>
|
||||
#include "common.h"
|
||||
#include "op.h"
|
||||
#include "acl_jittor.h"
|
||||
#include "ops/random_op.h"
|
||||
#include "ops/reduce_op.h"
|
||||
#include "ops/binary_op.h"
|
||||
#include "ops/broadcast_to_op.h"
|
||||
#include "ops/transpose_op.h"
|
||||
#include "ops/array_op.h"
|
||||
#include "ops/code_op.h"
|
||||
#include "fused_op.h"
|
||||
#include "ops/unary_op.h"
|
||||
#include "ops/ternary_op.h"
|
||||
#include "executor.h"
|
||||
#include "misc/cuda_flags.h"
|
||||
#include "mem/allocator.h"
|
||||
#include "op_compiler.h"
|
||||
#include "ops/op_register.h"
|
||||
#include "opt/tuner_manager.h"
|
||||
#include "utils/str_utils.h"
|
||||
#include "aclnn/aclnn.h"
|
||||
#include "flip_op_acl.h"
|
||||
|
||||
namespace jittor
|
||||
{
|
||||
FlipOpRunner::FlipOpRunner() : BaseOpRunner("Flip")
|
||||
{
|
||||
}
|
||||
|
||||
void FlipOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
|
||||
{
|
||||
auto attr = dynamic_cast<ReduceAttr *>(op_attr.get());
|
||||
auto dim = aclCreateIntArray(attr->axes.data(), attr->axes.size());
|
||||
ret = aclnnFlipGetWorkspaceSize(inputTensors[0], dim, outputTensors[0], &workspaceSize, &executor);
|
||||
|
||||
checkRet(ret);
|
||||
|
||||
if (workspaceSize > 0)
|
||||
{
|
||||
mallocWorkSpace(workspaceSize);
|
||||
}
|
||||
|
||||
ret = aclnnFlip(workspaceAddr, workspaceSize, executor, aclstream);
|
||||
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnFlip failed. ERROR: %d\n", name.c_str(), ret); return);
|
||||
|
||||
syncRun();
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
|
@@ -0,0 +1,15 @@
#pragma once
|
||||
#include "utils.h"
|
||||
#include "base_op.h"
|
||||
|
||||
namespace jittor
|
||||
{
|
||||
class FlipOpRunner : public BaseOpRunner
|
||||
{
|
||||
|
||||
protected:
|
||||
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
|
||||
public:
|
||||
FlipOpRunner();
|
||||
};
|
||||
}
|
|
@@ -0,0 +1,176 @@
import os
|
||||
from jittor_utils import env_or_try_find
|
||||
import jittor_utils
|
||||
import ctypes
|
||||
import glob
|
||||
import jittor.compiler as compiler
|
||||
import jittor as jt
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
from typing import Union
|
||||
from collections.abc import Sequence, Iterable
|
||||
|
||||
def pool_cmd(name: str,
|
||||
inputs: list,
|
||||
output_dtypes: list = None,
|
||||
output_shapes: list = None,
|
||||
attr_code: str = "",
|
||||
attr_header: str = "",
|
||||
outputs: list = None):
|
||||
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
|
||||
|
||||
cuda_header = '''
|
||||
#include "acl/aclops/aclops.h"
|
||||
'''
|
||||
outputs_ = []
|
||||
if outputs is not None:
|
||||
outputs_ = outputs
|
||||
else:
|
||||
assert output_dtypes is not None
|
||||
assert output_shapes is not None
|
||||
assert len(output_dtypes) == len(output_shapes)
|
||||
for i in range(len(output_shapes)):
|
||||
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
|
||||
input_code = ''
|
||||
for i in range(len(inputs)):
|
||||
input_code += f"op.add(in{i}, true);\n"
|
||||
|
||||
output_code = ''
|
||||
for i in range(len(outputs_)):
|
||||
output_code += f"op.add(out{i}, false);\n"
|
||||
return jt.code(outputs=outputs_,
|
||||
inputs=inputs,
|
||||
cuda_header=attr_header + cuda_header,
|
||||
cuda_src=f"""
|
||||
|
||||
// aclop
|
||||
{name}OpRunner op;
|
||||
{input_code}
|
||||
{output_code}
|
||||
{attr_code}
|
||||
op.run();""")
|
||||
|
||||
class PoolACL(jt.Function):
|
||||
|
||||
def __init__(self,
|
||||
kernel_size,
|
||||
stride=None,
|
||||
padding=0,
|
||||
dilation=None,
|
||||
return_indices=None,
|
||||
ceil_mode=False,
|
||||
count_include_pad=True,
|
||||
op='maximum'):
|
||||
self.kernel_size = kernel_size if isinstance(
|
||||
kernel_size, tuple) else (kernel_size, kernel_size)
|
||||
stride = stride if stride else kernel_size
|
||||
self.stride = stride if isinstance(stride, tuple) else (stride,
|
||||
stride)
|
||||
self.padding = padding if isinstance(padding, tuple) else (padding,
|
||||
padding)
|
||||
dilation = dilation if dilation else 1
|
||||
assert dilation == 1
|
||||
self.dilation = dilation if isinstance(
|
||||
dilation, tuple) else (dilation, dilation)
|
||||
for item in self.kernel_size:
|
||||
if item <= 0:
|
||||
raise RuntimeError(
|
||||
f"kernel_size must be greater than zero, but got {item}"
|
||||
)
|
||||
for item in self.stride:
|
||||
if item <= 0:
|
||||
raise RuntimeError(
|
||||
f"stride must be greater than zero, but got {item}")
|
||||
for item in self.padding:
|
||||
if item < 0:
|
||||
raise RuntimeError(
|
||||
f"padding must be non-negative, but got {item}")
|
||||
self.op = op
|
||||
self.return_indices = return_indices
|
||||
self.ceil_mode = ceil_mode
|
||||
self.count_include_pad = count_include_pad
|
||||
|
||||
def execute(self, input):
|
||||
self.input = input
|
||||
attr_code = f"""
|
||||
op.jt_name = "{"avgpool" if self.op == 'mean' else "maxpool"}";
|
||||
PoolAttr *attr = new PoolAttr();
|
||||
attr->kernel_size = {{ {self.kernel_size[0]}, {self.kernel_size[1]} }};
|
||||
attr->poolStrides = {{ {self.stride[0]}, {self.stride[1]} }};
|
||||
attr->poolPads = {{ {self.padding[0]}, {self.padding[1]} }};
|
||||
attr->poolDilations = {{ {self.dilation[0]}, {self.dilation[1]} }};
|
||||
attr->poolCeil = {"true" if self.ceil_mode else "false"};
|
||||
attr->countIncludePad = {"true" if self.count_include_pad else "false"};
|
||||
op.op_attr.reset(attr);
|
||||
"""
|
||||
input_height, input_width = input.shape[-2:]
|
||||
kernel_height, kernel_width = self.kernel_size[-2:]
|
||||
|
||||
output_height = (input_height + 2 * self.padding[0] -
|
||||
(kernel_height - 1) - 1) // self.stride[0] + 1
|
||||
output_width = (input_width + 2 * self.padding[1] -
|
||||
(kernel_width - 1) - 1) // self.stride[1] + 1
|
||||
|
||||
output_shape = (input.shape[0], input.shape[1], output_height,
|
||||
output_width)
|
||||
|
||||
inputs = [input]
|
||||
|
||||
if self.op == 'maximum':
|
||||
result = pool_cmd(
|
||||
"Maxpool",
|
||||
inputs,
|
||||
output_dtypes=[input.dtype, 'int32'],
|
||||
output_shapes=[output_shape, output_shape],
|
||||
attr_code=attr_code,
|
||||
)
|
||||
elif self.op == 'mean':
|
||||
result = pool_cmd(
|
||||
"Avgpool",
|
||||
inputs,
|
||||
output_dtypes=[input.dtype],
|
||||
output_shapes=[output_shape],
|
||||
attr_code=attr_code,
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"unsupported pool op: {self.op}")
|
||||
|
||||
if self.op == 'maximum':
|
||||
self.index = result[1]
|
||||
|
||||
if self.return_indices:
|
||||
return result[0], result[1]
|
||||
else:
|
||||
return result[0]
|
||||
|
||||
def grad(self, grad_output):
|
||||
input = self.input
|
||||
attr_code = f"""
|
||||
op.jt_name = "{"avgpoolbackward" if self.op == 'mean' else "maxpoolbackward"}";
|
||||
PoolAttr *attr = new PoolAttr();
|
||||
attr->kernel_size = {{ {self.kernel_size[0]}, {self.kernel_size[1]} }};
|
||||
attr->poolStrides = {{ {self.stride[0]}, {self.stride[1]} }};
|
||||
attr->poolPads = {{ {self.padding[0]}, {self.padding[1]} }};
|
||||
attr->poolDilations = {{ {self.dilation[0]}, {self.dilation[1]} }};
|
||||
attr->poolCeil = {"true" if self.ceil_mode else "false"};
|
||||
attr->countIncludePad = {"true" if self.count_include_pad else "false"};
|
||||
op.op_attr.reset(attr);
|
||||
"""
|
||||
output_shapes = [input.shape]
|
||||
output_dtypes = [input.dtype]
|
||||
if self.op == 'maximum':
|
||||
result = pool_cmd("MaxpoolBackward",
|
||||
inputs=[grad_output, input, self.index],
|
||||
output_dtypes=output_dtypes,
|
||||
output_shapes=output_shapes,
|
||||
attr_code=attr_code)[0]
|
||||
elif self.op == 'mean':
|
||||
result = pool_cmd("AvgpoolBackward",
|
||||
inputs=[grad_output, input],
|
||||
output_dtypes=output_dtypes,
|
||||
output_shapes=output_shapes,
|
||||
attr_code=attr_code)[0]
|
||||
else:
|
||||
raise ValueError(f"unsupported pool op: {self.op}")
|
||||
return result
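A minimal, hypothetical usage sketch; the import path is an assumption, and the shapes follow the output-size formula in execute above:

# --- illustrative usage, not part of the diff ---
import jittor as jt
from jittor.extern.acl.aclops.pool_op import PoolACL   # hypothetical path

x = jt.rand(1, 3, 32, 32)
maxpool = PoolACL(3, stride=2, padding=1, return_indices=True, op='maximum')
out, idx = maxpool(x)             # (32 + 2*1 - 2 - 1)//2 + 1 = 16 -> (1, 3, 16, 16)
avgpool = PoolACL(2, op='mean')   # stride defaults to kernel_size
y = avgpool(x)                    # (1, 3, 16, 16)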
|
|
@@ -0,0 +1,191 @@
#pragma once
|
||||
#include <acl/acl.h>
|
||||
#include <acl/acl_op_compiler.h>
|
||||
#include <Python.h>
|
||||
#include <pystate.h>
|
||||
#include <algorithm>
|
||||
#include <queue>
|
||||
#include <set>
|
||||
#include "common.h"
|
||||
#include "op.h"
|
||||
#include "acl_jittor.h"
|
||||
#include "ops/random_op.h"
|
||||
#include "ops/reduce_op.h"
|
||||
#include "ops/binary_op.h"
|
||||
#include "ops/broadcast_to_op.h"
|
||||
#include "ops/transpose_op.h"
|
||||
#include "ops/array_op.h"
|
||||
#include "ops/code_op.h"
|
||||
#include "fused_op.h"
|
||||
#include "ops/unary_op.h"
|
||||
#include "ops/ternary_op.h"
|
||||
#include "executor.h"
|
||||
#include "misc/cuda_flags.h"
|
||||
#include "mem/allocator.h"
|
||||
#include "op_compiler.h"
|
||||
#include "ops/op_register.h"
|
||||
#include "opt/tuner_manager.h"
|
||||
#include "utils/str_utils.h"
|
||||
#include "aclnn/aclnn.h"
|
||||
#include "pool_op_acl.h"
|
||||
|
||||
namespace jittor
|
||||
{
|
||||
MaxpoolOpRunner::MaxpoolOpRunner() : BaseOpRunner("Maxpool")
|
||||
{
|
||||
use_nchw = true;
|
||||
}
|
||||
|
||||
void MaxpoolOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
|
||||
{
|
||||
|
||||
aclIntArray *strides = nullptr;
|
||||
aclIntArray *pads = nullptr;
|
||||
aclIntArray *dilations = nullptr;
|
||||
aclIntArray *kernel_size = nullptr;
|
||||
|
||||
auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
|
||||
kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
|
||||
strides = aclCreateIntArray(attr->poolStrides.data(), 2);
|
||||
pads = aclCreateIntArray(attr->poolPads.data(), 2);
|
||||
dilations = aclCreateIntArray(attr->poolDilations.data(), 2);
|
||||
ret = aclnnMaxPool2dWithIndicesGetWorkspaceSize(inputTensors[0], kernel_size, strides, pads, dilations, attr->poolCeil, outputTensors[0], outputTensors[1], &workspaceSize, &executor);
|
||||
|
||||
checkRet(ret);
|
||||
|
||||
if (workspaceSize > 0)
|
||||
{
|
||||
mallocWorkSpace(workspaceSize);
|
||||
}
|
||||
|
||||
ret = aclnnMaxPool2dWithIndices(workspaceAddr, workspaceSize, executor, aclstream);
|
||||
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMaxPool2dWithIndices failed. ERROR: %d\n", name.c_str(), ret); return);
|
||||
|
||||
syncRun();
|
||||
|
||||
aclDestroyIntArray(strides);
|
||||
aclDestroyIntArray(pads);
|
||||
aclDestroyIntArray(dilations);
|
||||
aclDestroyIntArray(kernel_size);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
AvgpoolOpRunner::AvgpoolOpRunner() : BaseOpRunner("Avgpool")
|
||||
{
|
||||
use_nchw = true;
|
||||
}
|
||||
|
||||
void AvgpoolOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
|
||||
{
|
||||
|
||||
aclIntArray *strides = nullptr;
|
||||
aclIntArray *pads = nullptr;
|
||||
aclIntArray *kernel_size = nullptr;
|
||||
|
||||
auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
|
||||
kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
|
||||
strides = aclCreateIntArray(attr->poolStrides.data(), 2);
|
||||
pads = aclCreateIntArray(attr->poolPads.data(), 2);
|
||||
ret = aclnnAvgPool2dGetWorkspaceSize(inputTensors[0], kernel_size, strides, pads, attr->poolCeil, attr->countIncludePad, attr->divisorOverride, attr->divisorOverride, outputTensors[0], &workspaceSize, &executor);
|
||||
|
||||
checkRet(ret);
|
||||
|
||||
if (workspaceSize > 0)
|
||||
{
|
||||
mallocWorkSpace(workspaceSize);
|
||||
}
|
||||
|
||||
ret = aclnnAvgPool2d(workspaceAddr, workspaceSize, executor, aclstream);
|
||||
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnAvgPool2d failed. ERROR: %d\n", name.c_str(), ret); return);
|
||||
|
||||
syncRun();
|
||||
|
||||
aclDestroyIntArray(strides);
|
||||
aclDestroyIntArray(pads);
|
||||
aclDestroyIntArray(kernel_size);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
MaxpoolBackwardOpRunner::MaxpoolBackwardOpRunner() : BaseOpRunner("MaxpoolBackward")
|
||||
{
|
||||
use_nchw = true;
|
||||
}
|
||||
|
||||
void MaxpoolBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
|
||||
{
|
||||
|
||||
aclIntArray *strides = nullptr;
|
||||
aclIntArray *pads = nullptr;
|
||||
aclIntArray *dilations = nullptr;
|
||||
aclIntArray *kernel_size = nullptr;
|
||||
|
||||
auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
|
||||
kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
|
||||
strides = aclCreateIntArray(attr->poolStrides.data(), 2);
|
||||
pads = aclCreateIntArray(attr->poolPads.data(), 2);
|
||||
dilations = aclCreateIntArray(attr->poolDilations.data(), 2);
|
||||
ret = aclnnMaxPool2dWithIndicesBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], kernel_size, strides, pads, dilations, attr->poolCeil, outputTensors[0], &workspaceSize, &executor);
|
||||
|
||||
checkRet(ret);
|
||||
|
||||
if (workspaceSize > 0)
|
||||
{
|
||||
mallocWorkSpace(workspaceSize);
|
||||
}
|
||||
|
||||
ret = aclnnMaxPool2dWithIndicesBackward(workspaceAddr, workspaceSize, executor, aclstream);
|
||||
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMaxPool2dWithIndicesBackward failed. ERROR: %d\n", name.c_str(), ret); return);
|
||||
|
||||
syncRun();
|
||||
|
||||
aclDestroyIntArray(strides);
|
||||
aclDestroyIntArray(pads);
|
||||
aclDestroyIntArray(dilations);
|
||||
aclDestroyIntArray(kernel_size);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
AvgpoolBackwardOpRunner::AvgpoolBackwardOpRunner() : BaseOpRunner("AvgpoolBackward")
|
||||
{
|
||||
use_nchw = true;
|
||||
}
|
||||
|
||||
void AvgpoolBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
|
||||
{
|
||||
aclIntArray *strides = nullptr;
|
||||
aclIntArray *pads = nullptr;
|
||||
aclIntArray *kernel_size = nullptr;
|
||||
|
||||
auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
|
||||
kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
|
||||
strides = aclCreateIntArray(attr->poolStrides.data(), 2);
|
||||
pads = aclCreateIntArray(attr->poolPads.data(), 2);
|
||||
ret = aclnnAvgPool2dBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], kernel_size, strides, pads, attr->countIncludePad, attr->divisorOverride, attr->divisorOverride, attr->poolCeil, outputTensors[0], &workspaceSize, &executor);
|
||||
|
||||
checkRet(ret);
|
||||
|
||||
if (workspaceSize > 0)
|
||||
{
|
||||
mallocWorkSpace(workspaceSize);
|
||||
}
|
||||
|
||||
ret = aclnnAvgPool2dBackward(workspaceAddr, workspaceSize, executor, aclstream);
|
||||
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnAvgPool2dBackward failed. ERROR: %d\n", name.c_str(), ret); return);
|
||||
|
||||
syncRun();
|
||||
|
||||
aclDestroyIntArray(strides);
|
||||
aclDestroyIntArray(pads);
|
||||
aclDestroyIntArray(kernel_size);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
|
@@ -0,0 +1,43 @@
#pragma once
|
||||
#include "utils.h"
|
||||
#include "base_op.h"
|
||||
|
||||
namespace jittor
|
||||
{
|
||||
class MaxpoolOpRunner : public BaseOpRunner
|
||||
{
|
||||
|
||||
protected:
|
||||
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
|
||||
public:
|
||||
MaxpoolOpRunner();
|
||||
};
|
||||
|
||||
class AvgpoolOpRunner : public BaseOpRunner
|
||||
{
|
||||
|
||||
protected:
|
||||
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
|
||||
public:
|
||||
AvgpoolOpRunner();
|
||||
};
|
||||
|
||||
|
||||
class MaxpoolBackwardOpRunner : public BaseOpRunner
|
||||
{
|
||||
|
||||
protected:
|
||||
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
|
||||
public:
|
||||
MaxpoolBackwardOpRunner();
|
||||
};
|
||||
|
||||
class AvgpoolBackwardOpRunner : public BaseOpRunner
|
||||
{
|
||||
|
||||
protected:
|
||||
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
|
||||
public:
|
||||
AvgpoolBackwardOpRunner();
|
||||
};
|
||||
}