Merge pull request #8 from CSCG-Lab/splits

split maxpool,flip,concat
Yuxuan Han 2024-12-12 19:52:31 +08:00 committed by GitHub
commit 5a30cd334f
12 changed files with 1203 additions and 329 deletions
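The change splits the maxpool, flip, and concat ACL paths out of the inline change_function() body into dedicated aclops modules. A minimal forward-only smoke test of the three wrapped entry points, sketched under the assumption that a Jittor build with the ACL (Ascend) backend is active; the shapes are illustrative and not part of the diff:

import jittor as jt

x = jt.rand(2, 3, 8, 8)
pool = jt.nn.Pool(kernel_size=2, stride=2, op="maximum")
y = pool(x)                   # routed to PoolACL -> (2, 3, 4, 4)
f = jt.flip(x, dim=(2, 3))    # routed to FlipACL, spatial flip
c = jt.concat([x, x], dim=1)  # routed to ConcatACL -> (2, 6, 8, 8)
print(y.shape, f.shape, c.shape)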

View File

@@ -500,300 +500,302 @@ def change_function():
self.padding, self.dilation, self.groups)
return ret
class PoolACL(Function):
# class PoolACL(Function):
def __init__(self,
kernel_size,
stride=None,
padding=0,
dilation=None,
return_indices=None,
ceil_mode=False,
count_include_pad=True,
op='maximum'):
self.kernel_size = kernel_size if isinstance(
kernel_size, tuple) else (kernel_size, kernel_size)
stride = stride if stride else kernel_size
self.stride = stride if isinstance(stride, tuple) else (stride,
stride)
self.padding = padding if isinstance(padding, tuple) else (padding,
padding)
dilation = dilation if dilation else 1
assert dilation == 1
self.dilation = dilation if isinstance(
dilation, tuple) else (dilation, dilation)
for item in self.kernel_size:
if item <= 0:
raise RuntimeError(
f"kernel_size must be greater than zero, but got {item}"
)
for item in self.stride:
if item <= 0:
raise RuntimeError(
f"stride must be greater than zero, but got {item}")
for item in self.padding:
if item < 0:
raise RuntimeError(
f"padding must be non-negative, but got {item}")
self.op = op
self.return_indices = return_indices
self.ceil_mode = ceil_mode
self.count_include_pad = count_include_pad
# def __init__(self,
# kernel_size,
# stride=None,
# padding=0,
# dilation=None,
# return_indices=None,
# ceil_mode=False,
# count_include_pad=True,
# op='maximum'):
# self.kernel_size = kernel_size if isinstance(
# kernel_size, tuple) else (kernel_size, kernel_size)
# stride = stride if stride else kernel_size
# self.stride = stride if isinstance(stride, tuple) else (stride,
# stride)
# self.padding = padding if isinstance(padding, tuple) else (padding,
# padding)
# dilation = dilation if dilation else 1
# assert dilation == 1
# self.dilation = dilation if isinstance(
# dilation, tuple) else (dilation, dilation)
# for item in self.kernel_size:
# if item <= 0:
# raise RuntimeError(
# f"kernel_size must be greater than zero, but got {item}"
# )
# for item in self.stride:
# if item <= 0:
# raise RuntimeError(
# f"stride must be greater than zero, but got {item}")
# for item in self.padding:
# if item < 0:
# raise RuntimeError(
# f"padding must be non-negative, but got {item}")
# self.op = op
# self.return_indices = return_indices
# self.ceil_mode = ceil_mode
# self.count_include_pad = count_include_pad
def execute(self, input):
self.input = input
attr_code = f"""
op.jt_name = "{"avgpool" if self.op == 'mean' else "maxpool"}";
PoolAttr *attr = new PoolAttr();
attr->kernel_size = {{ {self.kernel_size[0]}, {self.kernel_size[1]} }};
attr->poolStrides = {{ {self.stride[0]}, {self.stride[1]} }};
attr->poolPads = {{ {self.padding[0]}, {self.padding[1]} }};
attr->poolDilations = {{ {self.dilation[0]}, {self.dilation[1]} }};
attr->poolCeil = {"true" if self.ceil_mode else "false"};
attr->countIncludePad = {"true" if self.count_include_pad else "false"};
op.op_attr.reset(attr);
"""
input_height, input_width = input.shape[-2:]
kernel_height, kernel_width = self.kernel_size[-2:]
# def execute(self, input):
# self.input = input
# attr_code = f"""
# op.jt_name = "{"avgpool" if self.op == 'mean' else "maxpool"}";
# PoolAttr *attr = new PoolAttr();
# attr->kernel_size = {{ {self.kernel_size[0]}, {self.kernel_size[1]} }};
# attr->poolStrides = {{ {self.stride[0]}, {self.stride[1]} }};
# attr->poolPads = {{ {self.padding[0]}, {self.padding[1]} }};
# attr->poolDilations = {{ {self.dilation[0]}, {self.dilation[1]} }};
# attr->poolCeil = {"true" if self.ceil_mode else "false"};
# attr->countIncludePad = {"true" if self.count_include_pad else "false"};
# op.op_attr.reset(attr);
# """
# input_height, input_width = input.shape[-2:]
# kernel_height, kernel_width = self.kernel_size[-2:]
output_height = (input_height + 2 * self.padding[0] -
(kernel_height - 1) - 1) // self.stride[0] + 1
output_width = (input_width + 2 * self.padding[1] -
(kernel_width - 1) - 1) // self.stride[1] + 1
# output_height = (input_height + 2 * self.padding[0] -
# (kernel_height - 1) - 1) // self.stride[0] + 1
# output_width = (input_width + 2 * self.padding[1] -
# (kernel_width - 1) - 1) // self.stride[1] + 1
output_shape = (input.shape[0], input.shape[1], output_height,
output_width)
# output_shape = (input.shape[0], input.shape[1], output_height,
# output_width)
inputs = [input]
# inputs = [input]
if self.op == 'maximum':
result = acl_cmd(
"Maxpool",
inputs,
output_dtypes=[input.dtype, 'int32'],
output_shapes=[output_shape, output_shape],
attr_code=attr_code,
)
elif self.op == 'mean':
result = acl_cmd(
"Avgpool",
inputs,
output_dtypes=[input.dtype],
output_shapes=[output_shape],
attr_code=attr_code,
)
else:
raise ValueError('no this type pool')
# if self.op == 'maximum':
# result = acl_cmd(
# "Maxpool",
# inputs,
# output_dtypes=[input.dtype, 'int32'],
# output_shapes=[output_shape, output_shape],
# attr_code=attr_code,
# )
# elif self.op == 'mean':
# result = acl_cmd(
# "Avgpool",
# inputs,
# output_dtypes=[input.dtype],
# output_shapes=[output_shape],
# attr_code=attr_code,
# )
# else:
# raise ValueError('no this type pool')
if self.op == 'maximum':
self.index = result[1]
# if self.op == 'maximum':
# self.index = result[1]
if self.return_indices:
return result[0], result[1]
else:
return result[0]
# if self.return_indices:
# return result[0], result[1]
# else:
# return result[0]
def grad(self, grad_output):
input = self.input
attr_code = f"""
op.jt_name = "{"avgpoolbackward" if self.op == 'mean' else "maxpoolbackward"}";
PoolAttr *attr = new PoolAttr();
attr->kernel_size = {{ {self.kernel_size[0]}, {self.kernel_size[1]} }};
attr->poolStrides = {{ {self.stride[0]}, {self.stride[1]} }};
attr->poolPads = {{ {self.padding[0]}, {self.padding[1]} }};
attr->poolDilations = {{ {self.dilation[0]}, {self.dilation[1]} }};
attr->poolCeil = {"true" if self.ceil_mode else "false"};
attr->countIncludePad = {"true" if self.count_include_pad else "false"};
op.op_attr.reset(attr);
"""
output_shapes = [input.shape]
output_dtypes = [input.dtype]
if self.op == 'maximum':
result = acl_cmd("MaxpoolBackward",
inputs=[grad_output, input, self.index],
output_dtypes=output_dtypes,
output_shapes=output_shapes,
attr_code=attr_code)[0]
elif self.op == 'mean':
result = acl_cmd("AvgpoolBackward",
inputs=[grad_output, input],
output_dtypes=output_dtypes,
output_shapes=output_shapes,
attr_code=attr_code)[0]
else:
raise ValueError('no this type pool')
return result
# def grad(self, grad_output):
# input = self.input
# attr_code = f"""
# op.jt_name = "{"avgpoolbackward" if self.op == 'mean' else "maxpoolbackward"}";
# PoolAttr *attr = new PoolAttr();
# attr->kernel_size = {{ {self.kernel_size[0]}, {self.kernel_size[1]} }};
# attr->poolStrides = {{ {self.stride[0]}, {self.stride[1]} }};
# attr->poolPads = {{ {self.padding[0]}, {self.padding[1]} }};
# attr->poolDilations = {{ {self.dilation[0]}, {self.dilation[1]} }};
# attr->poolCeil = {"true" if self.ceil_mode else "false"};
# attr->countIncludePad = {"true" if self.count_include_pad else "false"};
# op.op_attr.reset(attr);
# """
# output_shapes = [input.shape]
# output_dtypes = [input.dtype]
# if self.op == 'maximum':
# result = acl_cmd("MaxpoolBackward",
# inputs=[grad_output, input, self.index],
# output_dtypes=output_dtypes,
# output_shapes=output_shapes,
# attr_code=attr_code)[0]
# elif self.op == 'mean':
# result = acl_cmd("AvgpoolBackward",
# inputs=[grad_output, input],
# output_dtypes=output_dtypes,
# output_shapes=output_shapes,
# attr_code=attr_code)[0]
# else:
# raise ValueError('no this type pool')
# return result
class FlipACL(Function):
# class FlipACL(Function):
def __init__(self):
super(FlipACL, self).__init__()
# def __init__(self):
# super(FlipACL, self).__init__()
def execute(self, input, dim):
if type(dim) is tuple:
dim = list(dim)
if type(dim) is not list:
dim = [dim]
attr_code = f"""
op.jt_name = "flip";
ReduceAttr *attr = new ReduceAttr();
attr->axes = {{{', '.join(map(str, (list(dim))))}}};
attr->prod_dim = {len(dim)};
op.op_attr.reset(attr);
"""
self.attr_code = attr_code
result = acl_cmd("Flip", [input],
output_dtypes=[input.dtype],
output_shapes=[input.shape],
attr_code=self.attr_code)[0]
return result
# def execute(self, input, dim):
# if type(dim) is tuple:
# dim = list(dim)
# if type(dim) is not list:
# dim = [dim]
# attr_code = f"""
# op.jt_name = "flip";
# ReduceAttr *attr = new ReduceAttr();
# attr->axes = {{{', '.join(map(str, (list(dim))))}}};
# attr->prod_dim = {len(dim)};
# op.op_attr.reset(attr);
# """
# self.attr_code = attr_code
# result = acl_cmd("Flip", [input],
# output_dtypes=[input.dtype],
# output_shapes=[input.shape],
# attr_code=self.attr_code)[0]
# return result
def grad(self, grad_output):
grad_input = acl_cmd("Flip", [grad_output],
output_dtypes=[grad_output.dtype],
output_shapes=[grad_output.shape],
attr_code=self.attr_code)[0]
return grad_input
# def grad(self, grad_output):
# grad_input = acl_cmd("Flip", [grad_output],
# output_dtypes=[grad_output.dtype],
# output_shapes=[grad_output.shape],
# attr_code=self.attr_code)[0]
# return grad_input
from .aclops.flip_op import FlipACL
def flip_acl(x, dim):
return FlipACL()(x, dim)
class ConcatACL(Function):
# class ConcatACL(Function):
def __init__(self):
super(ConcatACL, self).__init__()
# def __init__(self):
# super(ConcatACL, self).__init__()
def __call__(self, *args):
assert isinstance(args[0], list)
assert isinstance(args[1], int)
if jt.flags.no_grad:
return self.execute(*args)
backup = args
args = list(args)
taped_inputs = []
taped_outputs = []
input_mask = [-1] * (len(args[0]) + 1)
newargs = [list(), args[1]]
for i, v in enumerate(args[0]):
if isinstance(v, jt.Var):
if v.is_stop_grad():
# -2 in input_mask represents it is stop_grad
input_mask[i] = -2
newargs[0].append(v)
continue
v = v.tape()
newargs[0].append(v)
input_mask[i] = len(taped_inputs)
taped_inputs.append(v)
# def __call__(self, *args):
# assert isinstance(args[0], list)
# assert isinstance(args[1], int)
# if jt.flags.no_grad:
# return self.execute(*args)
# backup = args
# args = list(args)
# taped_inputs = []
# taped_outputs = []
# input_mask = [-1] * (len(args[0]) + 1)
# newargs = [list(), args[1]]
# for i, v in enumerate(args[0]):
# if isinstance(v, jt.Var):
# if v.is_stop_grad():
# # -2 in input_mask represents it is stop_grad
# input_mask[i] = -2
# newargs[0].append(v)
# continue
# v = v.tape()
# newargs[0].append(v)
# input_mask[i] = len(taped_inputs)
# taped_inputs.append(v)
ori_res = self.execute(*newargs)
if not isinstance(ori_res, Sequence):
res = [ori_res]
else:
res = list(ori_res)
output_mask = [-1] * len(res)
for i, v in enumerate(res):
if isinstance(v, jt.Var):
v = v.tape()
output_mask[i] = len(taped_outputs)
res[i] = v
taped_outputs.append(v)
self.input_mask = input_mask
self.output_mask = output_mask
# tape output and input together so
# backward treat them as one operator
jt.tape_together(taped_inputs, taped_outputs, self._grad)
if isinstance(ori_res, Sequence):
return res
else:
return res[0]
# ori_res = self.execute(*newargs)
# if not isinstance(ori_res, Sequence):
# res = [ori_res]
# else:
# res = list(ori_res)
# output_mask = [-1] * len(res)
# for i, v in enumerate(res):
# if isinstance(v, jt.Var):
# v = v.tape()
# output_mask[i] = len(taped_outputs)
# res[i] = v
# taped_outputs.append(v)
# self.input_mask = input_mask
# self.output_mask = output_mask
# # tape output and input together so
# # backward treat them as one operator
# jt.tape_together(taped_inputs, taped_outputs, self._grad)
# if isinstance(ori_res, Sequence):
# return res
# else:
# return res[0]
def execute(self, input_tensors, dim=0):
for _ in input_tensors:
if not (-_.ndim <= dim < _.ndim):
print(_.shape, dim)
raise ValueError("dim out of range")
# def execute(self, input_tensors, dim=0):
# for _ in input_tensors:
# if not (-_.ndim <= dim < _.ndim):
# print(_.shape, dim)
# raise ValueError("dim out of range")
if dim < 0:
dim += input_tensors[0].ndim
# if dim < 0:
# dim += input_tensors[0].ndim
self.input = input_tensors
self.dim = dim
for i in range(len(input_tensors)):
if input_tensors[i].dtype != input_tensors[0].dtype:
raise ValueError(
"All input tensors must have the same dtype")
if input_tensors[i].shape[:dim] != input_tensors[
0].shape[:dim] or input_tensors[i].shape[
dim + 1:] != input_tensors[0].shape[dim + 1:]:
raise ValueError(
"All input tensors must have the same shape")
attr_code = f"""
op.jt_name = "concat";
ConcatAttr *attr = new ConcatAttr();
attr->tensorNum = {len(input_tensors)};
attr->dim = {dim};
op.op_attr.reset(attr);
"""
result = acl_cmd(
"Concat",
input_tensors,
output_dtypes=[input_tensors[0].dtype],
output_shapes=[
jt.empty(self.calculate_output_shape(input_tensors,
dim)).shape
],
attr_code=attr_code)[0]
return result
# self.input = input_tensors
# self.dim = dim
# for i in range(len(input_tensors)):
# if input_tensors[i].dtype != input_tensors[0].dtype:
# raise ValueError(
# "All input tensors must have the same dtype")
# if input_tensors[i].shape[:dim] != input_tensors[
# 0].shape[:dim] or input_tensors[i].shape[
# dim + 1:] != input_tensors[0].shape[dim + 1:]:
# raise ValueError(
# "All input tensors must have the same shape")
# attr_code = f"""
# op.jt_name = "concat";
# ConcatAttr *attr = new ConcatAttr();
# attr->tensorNum = {len(input_tensors)};
# attr->dim = {dim};
# op.op_attr.reset(attr);
# """
# result = acl_cmd(
# "Concat",
# input_tensors,
# output_dtypes=[input_tensors[0].dtype],
# output_shapes=[
# jt.empty(self.calculate_output_shape(input_tensors,
# dim)).shape
# ],
# attr_code=attr_code)[0]
# return result
def _grad(self, *args):
new_args = ((args[i] if i >= 0 else None)
for i in self.output_mask)
ret = self.grad(*new_args)
new_ret = []
for i, r in enumerate(ret):
j = self.input_mask[i]
if j < 0:
# -2 in input_mask represents it is stop_grad
assert r is None or j==-2, f"{type(self)}'s {i}-th returned grad should be None, "\
"because the input value is not jittor variable."
else:
new_ret.append(r)
return new_ret
# def _grad(self, *args):
# new_args = ((args[i] if i >= 0 else None)
# for i in self.output_mask)
# ret = self.grad(*new_args)
# new_ret = []
# for i, r in enumerate(ret):
# j = self.input_mask[i]
# if j < 0:
# # -2 in input_mask represents it is stop_grad
# assert r is None or j==-2, f"{type(self)}'s {i}-th returned grad should be None, "\
# "because the input value is not jittor variable."
# else:
# new_ret.append(r)
# return new_ret
def grad(self, grad_output):
grad_inputs = self.split_grad(grad_output, self.input, self.dim)
return grad_inputs
# def grad(self, grad_output):
# grad_inputs = self.split_grad(grad_output, self.input, self.dim)
# return grad_inputs
def calculate_output_shape(self, input_tensors, axis):
shape = list(input_tensors[0].shape)
for tensor in input_tensors[1:]:
shape[axis] += tensor.shape[axis]
return tuple(shape)
# def calculate_output_shape(self, input_tensors, axis):
# shape = list(input_tensors[0].shape)
# for tensor in input_tensors[1:]:
# shape[axis] += tensor.shape[axis]
# return tuple(shape)
def split_grad(self, grad_output, input_tensors, axis):
offset = []
shapeVec = []
dtypeVec = []
for tensor in input_tensors:
offset.append(tensor.shape[axis])
dtypeVec.append(tensor.dtype)
shapeVec.append(tensor.shape)
# def split_grad(self, grad_output, input_tensors, axis):
# offset = []
# shapeVec = []
# dtypeVec = []
# for tensor in input_tensors:
# offset.append(tensor.shape[axis])
# dtypeVec.append(tensor.dtype)
# shapeVec.append(tensor.shape)
attr_code = f"""
op.jt_name = "splitwithsize";
auto *attr = new SplitWithSizeAttr();
attr->splitSize = {{ {", ".join(map(str, offset))} }};
attr->dim = {axis};
op.op_attr.reset(attr);
"""
# attr_code = f"""
# op.jt_name = "splitwithsize";
# auto *attr = new SplitWithSizeAttr();
# attr->splitSize = {{ {", ".join(map(str, offset))} }};
# attr->dim = {axis};
# op.op_attr.reset(attr);
# """
result = acl_cmd("SplitWithSize", [grad_output],
output_dtypes=dtypeVec,
output_shapes=shapeVec,
attr_code=attr_code)
return result
# result = acl_cmd("SplitWithSize", [grad_output],
# output_dtypes=dtypeVec,
# output_shapes=shapeVec,
# attr_code=attr_code)
# return result
from .aclops.concat_op import ConcatACL
def concat(x, dim=0):
return ConcatACL()(x, dim)
@@ -2692,6 +2694,8 @@ def change_function():
jt.nn.conv2d = warp(jt.nn.conv2d, conv_acl)
jt.nn.Conv2d = warp(jt.nn.Conv2d, Conv2D)
jt.nn.Conv = warp(jt.nn.Conv, Conv2D)
from .aclops.pool_op import PoolACL
jt.nn.Pool = warp(jt.nn.Pool, PoolACL)
jt.flip = warp(jt.flip, flip_acl)
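With these registrations in place, the backward pass also runs through the new runners (PoolACL.grad dispatches to MaxpoolBackward/AvgpoolBackward). A minimal gradient-path sketch, assuming an ACL device is available; values are illustrative:

x = jt.rand(1, 1, 4, 4)
pool = jt.nn.Pool(2, stride=2, op="mean")
y = pool(x)
g = jt.grad(y.sum(), x)   # AvgpoolBackward via PoolACL.grad
print(y.shape, g.shape)   # (1, 1, 2, 2) and (1, 1, 4, 4)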

View File

@@ -309,63 +309,63 @@ namespace jittor
ret = it->second.getWorkspaceSizeFuncConvBackward(inputTensors[0], inputTensors[1], inputTensors[2], biasSizes, strides, pads, dilations, false, outPads, attr->group, outMask, 0, outputTensors[0], outputTensors[1], outputTensors[2], &workspaceSize, &executor);
break;
}
case 22:
{
auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
strides = aclCreateIntArray(attr->poolStrides.data(), 2);
pads = aclCreateIntArray(attr->poolPads.data(), 2);
dilations = aclCreateIntArray(attr->poolDilations.data(), 2);
ret = it->second.getWorkspaceSizeFuncMaxPool(inputTensors[0], kernel_size, strides, pads, dilations, attr->poolCeil, outputTensors[0], outputTensors[1], &workspaceSize, &executor);
break;
}
case 23:
{
auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
strides = aclCreateIntArray(attr->poolStrides.data(), 2);
pads = aclCreateIntArray(attr->poolPads.data(), 2);
dilations = aclCreateIntArray(attr->poolDilations.data(), 2);
ret = it->second.getWorkspaceSizeFuncMaxPoolBackward(inputTensors[0], inputTensors[1], inputTensors[2], kernel_size, strides, pads, dilations, attr->poolCeil, outputTensors[0], &workspaceSize, &executor);
break;
}
case 24:
{
auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
strides = aclCreateIntArray(attr->poolStrides.data(), 2);
pads = aclCreateIntArray(attr->poolPads.data(), 2);
ret = it->second.getWorkspaceSizeFuncAvgPool(inputTensors[0], kernel_size, strides, pads, attr->poolCeil, attr->countIncludePad, attr->divisorOverride, attr->divisorOverride, outputTensors[0], &workspaceSize, &executor);
break;
}
case 25:
{
auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
strides = aclCreateIntArray(attr->poolStrides.data(), 2);
pads = aclCreateIntArray(attr->poolPads.data(), 2);
ret = it->second.getWorkspaceSizeFuncAvgPoolBackward(inputTensors[0], inputTensors[1], kernel_size, strides, pads, attr->countIncludePad, attr->divisorOverride, attr->divisorOverride, attr->poolCeil, outputTensors[0], &workspaceSize, &executor);
break;
}
case 26:
{
auto attr = dynamic_cast<ReduceAttr *>(op_attr.get());
dim = aclCreateIntArray(attr->axes.data(), attr->axes.size());
ret = it->second.getWorkspaceSizeFuncExpand(inputTensors[0], dim, outputTensors[0], &workspaceSize, &executor);
break;
}
case 27:
{
std::vector<aclTensor *> concatTensorList = {};
for (int i = 0; i < input_num; i++)
{
concatTensorList.push_back(inputTensors[i]);
}
auto concatTensorListInput = aclCreateTensorList(&concatTensorList[0], input_num);
auto attr = dynamic_cast<ConcatAttr *>(op_attr.get());
ret = it->second.getWorkspaceSizeFuncConcat(concatTensorListInput, attr->dim, outputTensors[0], &workspaceSize, &executor);
break;
}
// case 22:
// {
// auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
// kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
// strides = aclCreateIntArray(attr->poolStrides.data(), 2);
// pads = aclCreateIntArray(attr->poolPads.data(), 2);
// dilations = aclCreateIntArray(attr->poolDilations.data(), 2);
// ret = it->second.getWorkspaceSizeFuncMaxPool(inputTensors[0], kernel_size, strides, pads, dilations, attr->poolCeil, outputTensors[0], outputTensors[1], &workspaceSize, &executor);
// break;
// }
// case 23:
// {
// auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
// kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
// strides = aclCreateIntArray(attr->poolStrides.data(), 2);
// pads = aclCreateIntArray(attr->poolPads.data(), 2);
// dilations = aclCreateIntArray(attr->poolDilations.data(), 2);
// ret = it->second.getWorkspaceSizeFuncMaxPoolBackward(inputTensors[0], inputTensors[1], inputTensors[2], kernel_size, strides, pads, dilations, attr->poolCeil, outputTensors[0], &workspaceSize, &executor);
// break;
// }
// case 24:
// {
// auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
// kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
// strides = aclCreateIntArray(attr->poolStrides.data(), 2);
// pads = aclCreateIntArray(attr->poolPads.data(), 2);
// ret = it->second.getWorkspaceSizeFuncAvgPool(inputTensors[0], kernel_size, strides, pads, attr->poolCeil, attr->countIncludePad, attr->divisorOverride, attr->divisorOverride, outputTensors[0], &workspaceSize, &executor);
// break;
// }
// case 25:
// {
// auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
// kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
// strides = aclCreateIntArray(attr->poolStrides.data(), 2);
// pads = aclCreateIntArray(attr->poolPads.data(), 2);
// ret = it->second.getWorkspaceSizeFuncAvgPoolBackward(inputTensors[0], inputTensors[1], kernel_size, strides, pads, attr->countIncludePad, attr->divisorOverride, attr->divisorOverride, attr->poolCeil, outputTensors[0], &workspaceSize, &executor);
// break;
// }
// case 26:
// {
// auto attr = dynamic_cast<ReduceAttr *>(op_attr.get());
// dim = aclCreateIntArray(attr->axes.data(), attr->axes.size());
// ret = it->second.getWorkspaceSizeFuncExpand(inputTensors[0], dim, outputTensors[0], &workspaceSize, &executor);
// break;
// }
// case 27:
// {
// std::vector<aclTensor *> concatTensorList = {};
// for (int i = 0; i < input_num; i++)
// {
// concatTensorList.push_back(inputTensors[i]);
// }
// auto concatTensorListInput = aclCreateTensorList(&concatTensorList[0], input_num);
// auto attr = dynamic_cast<ConcatAttr *>(op_attr.get());
// ret = it->second.getWorkspaceSizeFuncConcat(concatTensorListInput, attr->dim, outputTensors[0], &workspaceSize, &executor);
// break;
// }
case 28:
{
auto attr = dynamic_cast<GatherAttr *>(op_attr.get());
@@ -510,14 +510,14 @@ namespace jittor
// ret = it->second.getWorkspaceSizeFuncBinary(inputTensors[0], inputTensors[1], outputTensors[0], &workspaceSize, &executor);
// break;
// }
case 50:
{
auto attr = dynamic_cast<SplitWithSizeAttr *>(op_attr.get());
auto splitSize = aclCreateIntArray(attr->splitSize.data(), attr->splitSize.size());
auto tensorList = aclCreateTensorList(&outputTensors[0], output_num);
ret = it->second.getWorkspaceSizeFuncSplitWithSize(inputTensors[0], splitSize, attr->dim, tensorList, &workspaceSize, &executor);
break;
}
// case 50:
// {
// auto attr = dynamic_cast<SplitWithSizeAttr *>(op_attr.get());
// auto splitSize = aclCreateIntArray(attr->splitSize.data(), attr->splitSize.size());
// auto tensorList = aclCreateTensorList(&outputTensors[0], output_num);
// ret = it->second.getWorkspaceSizeFuncSplitWithSize(inputTensors[0], splitSize, attr->dim, tensorList, &workspaceSize, &executor);
// break;
// }
case 51:
{
auto attr = dynamic_cast<FlashAttentionAttr *>(op_attr.get());

View File

@@ -10,3 +10,6 @@
#include <acl/aclops/matmul_op_acl.h>
#include <acl/aclops/random_op_acl.h>
#include <acl/aclops/bmm_op_acl.h>
#include <acl/aclops/pool_op_acl.h>
#include <acl/aclops/flip_op_acl.h>
#include <acl/aclops/concat_op_acl.h>

View File

@@ -0,0 +1,188 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def concat_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class ConcatACL(jt.Function):
def __init__(self):
super(ConcatACL, self).__init__()
def __call__(self, *args):
assert isinstance(args[0], list)
assert isinstance(args[1], int)
if jt.flags.no_grad:
return self.execute(*args)
backup = args
args = list(args)
taped_inputs = []
taped_outputs = []
input_mask = [-1] * (len(args[0]) + 1)
newargs = [list(), args[1]]
for i, v in enumerate(args[0]):
if isinstance(v, jt.Var):
if v.is_stop_grad():
# -2 in input_mask represents it is stop_grad
input_mask[i] = -2
newargs[0].append(v)
continue
v = v.tape()
newargs[0].append(v)
input_mask[i] = len(taped_inputs)
taped_inputs.append(v)
ori_res = self.execute(*newargs)
if not isinstance(ori_res, Sequence):
res = [ori_res]
else:
res = list(ori_res)
output_mask = [-1] * len(res)
for i, v in enumerate(res):
if isinstance(v, jt.Var):
v = v.tape()
output_mask[i] = len(taped_outputs)
res[i] = v
taped_outputs.append(v)
self.input_mask = input_mask
self.output_mask = output_mask
# tape output and input together so
# backward treat them as one operator
jt.tape_together(taped_inputs, taped_outputs, self._grad)
if isinstance(ori_res, Sequence):
return res
else:
return res[0]
def execute(self, input_tensors, dim=0):
for tensor in input_tensors:
if not (-tensor.ndim <= dim < tensor.ndim):
raise ValueError(
f"dim {dim} is out of range for input of shape {tensor.shape}")
if dim < 0:
dim += input_tensors[0].ndim
self.input = input_tensors
self.dim = dim
for i in range(len(input_tensors)):
if input_tensors[i].dtype != input_tensors[0].dtype:
raise ValueError(
"All input tensors must have the same dtype")
if input_tensors[i].shape[:dim] != input_tensors[
0].shape[:dim] or input_tensors[i].shape[
dim + 1:] != input_tensors[0].shape[dim + 1:]:
raise ValueError(
"All input tensors must have the same shape")
attr_code = f"""
op.jt_name = "concat";
ConcatAttr *attr = new ConcatAttr();
attr->tensorNum = {len(input_tensors)};
attr->dim = {dim};
op.op_attr.reset(attr);
"""
result = concat_cmd(
"Concat",
input_tensors,
output_dtypes=[input_tensors[0].dtype],
output_shapes=[
jt.empty(self.calculate_output_shape(input_tensors,
dim)).shape
],
attr_code=attr_code)[0]
return result
def _grad(self, *args):
new_args = ((args[i] if i >= 0 else None)
for i in self.output_mask)
ret = self.grad(*new_args)
new_ret = []
for i, r in enumerate(ret):
j = self.input_mask[i]
if j < 0:
# -2 in input_mask represents it is stop_grad
assert r is None or j==-2, f"{type(self)}'s {i}-th returned grad should be None, "\
"because the input value is not jittor variable."
else:
new_ret.append(r)
return new_ret
def grad(self, grad_output):
grad_inputs = self.split_grad(grad_output, self.input, self.dim)
return grad_inputs
def calculate_output_shape(self, input_tensors, axis):
shape = list(input_tensors[0].shape)
for tensor in input_tensors[1:]:
shape[axis] += tensor.shape[axis]
return tuple(shape)
def split_grad(self, grad_output, input_tensors, axis):
offset = []
shapeVec = []
dtypeVec = []
for tensor in input_tensors:
offset.append(tensor.shape[axis])
dtypeVec.append(tensor.dtype)
shapeVec.append(tensor.shape)
attr_code = f"""
op.jt_name = "splitwithsize";
auto *attr = new SplitWithSizeAttr();
attr->splitSize = {{ {", ".join(map(str, offset))} }};
attr->dim = {axis};
op.op_attr.reset(attr);
"""
result = concat_cmd("SplitWithSize", [grad_output],
output_dtypes=dtypeVec,
output_shapes=shapeVec,
attr_code=attr_code)
return result
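A minimal round-trip sketch of the class above, assuming an ACL device so concat_cmd can execute: the forward pass concatenates along dim 0 via aclnnCat, and the gradient is split back to the original shapes via SplitWithSize.

a = jt.rand(2, 3)
b = jt.rand(4, 3)
c = ConcatACL()([a, b], 0)          # forward -> shape (6, 3)
ga, gb = jt.grad(c.sum(), [a, b])   # backward -> shapes (2, 3) and (4, 3)
print(c.shape, ga.shape, gb.shape)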

View File

@@ -0,0 +1,89 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "concat_op_acl.h"
namespace jittor
{
ConcatOpRunner::ConcatOpRunner() : BaseOpRunner("Concat")
{
}
void ConcatOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto input_num = in_.size();
std::vector<aclTensor *> concatTensorList = {};
for (int i = 0; i < input_num; i++)
{
concatTensorList.push_back(inputTensors[i]);
}
auto concatTensorListInput = aclCreateTensorList(&concatTensorList[0], input_num);
auto attr = dynamic_cast<ConcatAttr *>(op_attr.get());
ret = aclnnCatGetWorkspaceSize(concatTensorListInput, attr->dim, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnCat(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnCat failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
SplitWithSizeOpRunner::SplitWithSizeOpRunner() : BaseOpRunner("SplitWithSize")
{
}
void SplitWithSizeOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto output_num = out_.size();
auto attr = dynamic_cast<SplitWithSizeAttr *>(op_attr.get());
auto splitSize = aclCreateIntArray(attr->splitSize.data(), attr->splitSize.size());
auto tensorList = aclCreateTensorList(&outputTensors[0], output_num);
ret = aclnnSplitWithSizeGetWorkspaceSize(inputTensors[0], splitSize, attr->dim, tensorList, &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnSplitWithSize(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSplitWithSize failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}

View File

@@ -0,0 +1,24 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class ConcatOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
ConcatOpRunner();
};
class SplitWithSizeOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
SplitWithSizeOpRunner();
};
}

View File

@@ -0,0 +1,83 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def flip_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class FlipACL(jt.Function):
def __init__(self):
super(FlipACL, self).__init__()
def execute(self, input, dim):
if type(dim) is tuple:
dim = list(dim)
if type(dim) is not list:
dim = [dim]
attr_code = f"""
op.jt_name = "flip";
ReduceAttr *attr = new ReduceAttr();
attr->axes = {{{', '.join(map(str, (list(dim))))}}};
attr->prod_dim = {len(dim)};
op.op_attr.reset(attr);
"""
self.attr_code = attr_code
result = flip_cmd("Flip", [input],
output_dtypes=[input.dtype],
output_shapes=[input.shape],
attr_code=self.attr_code)[0]
return result
def grad(self, grad_output):
grad_input = flip_cmd("Flip", [grad_output],
output_dtypes=[grad_output.dtype],
output_shapes=[grad_output.shape],
attr_code=self.attr_code)[0]
return grad_input
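A minimal usage sketch of FlipACL, assuming an ACL device: flipping the same dims twice restores the input, and grad reuses the saved attr_code, so the upstream gradient is simply flipped back onto the input layout.

x = jt.rand(2, 3, 4)
y = FlipACL()(x, (1, 2))   # flip along dims 1 and 2
z = FlipACL()(y, (1, 2))   # flipping again restores x
g = jt.grad(y.sum(), x)    # flipped all-ones, i.e. all-ones
print(y.shape, z.shape, g.shape)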

View File

@@ -0,0 +1,58 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "flip_op_acl.h"
namespace jittor
{
FlipOpRunner::FlipOpRunner() : BaseOpRunner("Flip")
{
}
void FlipOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<ReduceAttr *>(op_attr.get());
auto dim = aclCreateIntArray(attr->axes.data(), attr->axes.size());
ret = aclnnFlipGetWorkspaceSize(inputTensors[0], dim, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnFlip(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnFlip failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}

View File

@@ -0,0 +1,15 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class FlipOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
FlipOpRunner();
};
}

View File

@@ -0,0 +1,176 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def pool_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class PoolACL(jt.Function):
def __init__(self,
kernel_size,
stride=None,
padding=0,
dilation=None,
return_indices=None,
ceil_mode=False,
count_include_pad=True,
op='maximum'):
self.kernel_size = kernel_size if isinstance(
kernel_size, tuple) else (kernel_size, kernel_size)
stride = stride if stride else kernel_size
self.stride = stride if isinstance(stride, tuple) else (stride,
stride)
self.padding = padding if isinstance(padding, tuple) else (padding,
padding)
dilation = dilation if dilation else 1
assert dilation == 1
self.dilation = dilation if isinstance(
dilation, tuple) else (dilation, dilation)
for item in self.kernel_size:
if item <= 0:
raise RuntimeError(
f"kernel_size must be greater than zero, but got {item}"
)
for item in self.stride:
if item <= 0:
raise RuntimeError(
f"stride must be greater than zero, but got {item}")
for item in self.padding:
if item < 0:
raise RuntimeError(
f"padding must be non-negative, but got {item}")
self.op = op
self.return_indices = return_indices
self.ceil_mode = ceil_mode
self.count_include_pad = count_include_pad
def execute(self, input):
self.input = input
attr_code = f"""
op.jt_name = "{"avgpool" if self.op == 'mean' else "maxpool"}";
PoolAttr *attr = new PoolAttr();
attr->kernel_size = {{ {self.kernel_size[0]}, {self.kernel_size[1]} }};
attr->poolStrides = {{ {self.stride[0]}, {self.stride[1]} }};
attr->poolPads = {{ {self.padding[0]}, {self.padding[1]} }};
attr->poolDilations = {{ {self.dilation[0]}, {self.dilation[1]} }};
attr->poolCeil = {"true" if self.ceil_mode else "false"};
attr->countIncludePad = {"true" if self.count_include_pad else "false"};
op.op_attr.reset(attr);
"""
input_height, input_width = input.shape[-2:]
kernel_height, kernel_width = self.kernel_size[-2:]
output_height = (input_height + 2 * self.padding[0] -
(kernel_height - 1) - 1) // self.stride[0] + 1
output_width = (input_width + 2 * self.padding[1] -
(kernel_width - 1) - 1) // self.stride[1] + 1
output_shape = (input.shape[0], input.shape[1], output_height,
output_width)
inputs = [input]
if self.op == 'maximum':
result = pool_cmd(
"Maxpool",
inputs,
output_dtypes=[input.dtype, 'int32'],
output_shapes=[output_shape, output_shape],
attr_code=attr_code,
)
elif self.op == 'mean':
result = pool_cmd(
"Avgpool",
inputs,
output_dtypes=[input.dtype],
output_shapes=[output_shape],
attr_code=attr_code,
)
else:
raise ValueError(f"unsupported pool op: {self.op}")
if self.op == 'maximum':
self.index = result[1]
if self.return_indices:
return result[0], result[1]
else:
return result[0]
def grad(self, grad_output):
input = self.input
attr_code = f"""
op.jt_name = "{"avgpoolbackward" if self.op == 'mean' else "maxpoolbackward"}";
PoolAttr *attr = new PoolAttr();
attr->kernel_size = {{ {self.kernel_size[0]}, {self.kernel_size[1]} }};
attr->poolStrides = {{ {self.stride[0]}, {self.stride[1]} }};
attr->poolPads = {{ {self.padding[0]}, {self.padding[1]} }};
attr->poolDilations = {{ {self.dilation[0]}, {self.dilation[1]} }};
attr->poolCeil = {"true" if self.ceil_mode else "false"};
attr->countIncludePad = {"true" if self.count_include_pad else "false"};
op.op_attr.reset(attr);
"""
output_shapes = [input.shape]
output_dtypes = [input.dtype]
if self.op == 'maximum':
result = pool_cmd("MaxpoolBackward",
inputs=[grad_output, input, self.index],
output_dtypes=output_dtypes,
output_shapes=output_shapes,
attr_code=attr_code)[0]
elif self.op == 'mean':
result = pool_cmd("AvgpoolBackward",
inputs=[grad_output, input],
output_dtypes=output_dtypes,
output_shapes=output_shapes,
attr_code=attr_code)[0]
else:
raise ValueError(f"unsupported pool op: {self.op}")
return result
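A worked check of the output-shape arithmetic in execute above, using assumed numbers (32x32 input, kernel 3, stride 2, padding 1, dilation 1); plain Python, no ACL device needed:

H = W = 32
k, s, p = 3, 2, 1
out_h = (H + 2 * p - (k - 1) - 1) // s + 1   # (32 + 2 - 2 - 1) // 2 + 1 = 16
out_w = (W + 2 * p - (k - 1) - 1) // s + 1
print(out_h, out_w)   # 16 16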

View File

@@ -0,0 +1,191 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "pool_op_acl.h"
namespace jittor
{
MaxpoolOpRunner::MaxpoolOpRunner() : BaseOpRunner("Maxpool")
{
use_nchw = true;
}
void MaxpoolOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
aclIntArray *strides = nullptr;
aclIntArray *pads = nullptr;
aclIntArray *dilations = nullptr;
aclIntArray *kernel_size = nullptr;
auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
strides = aclCreateIntArray(attr->poolStrides.data(), 2);
pads = aclCreateIntArray(attr->poolPads.data(), 2);
dilations = aclCreateIntArray(attr->poolDilations.data(), 2);
ret = aclnnMaxPool2dWithIndicesGetWorkspaceSize(inputTensors[0], kernel_size, strides, pads, dilations, attr->poolCeil, outputTensors[0], outputTensors[1], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnMaxPool2dWithIndices(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMaxPool2dWithIndices failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
aclDestroyIntArray(strides);
aclDestroyIntArray(pads);
aclDestroyIntArray(dilations);
aclDestroyIntArray(kernel_size);
return;
}
AvgpoolOpRunner::AvgpoolOpRunner() : BaseOpRunner("Avgpool")
{
use_nchw = true;
}
void AvgpoolOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
aclIntArray *strides = nullptr;
aclIntArray *pads = nullptr;
aclIntArray *kernel_size = nullptr;
auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
strides = aclCreateIntArray(attr->poolStrides.data(), 2);
pads = aclCreateIntArray(attr->poolPads.data(), 2);
ret = aclnnAvgPool2dGetWorkspaceSize(inputTensors[0], kernel_size, strides, pads, attr->poolCeil, attr->countIncludePad, attr->divisorOverride, attr->divisorOverride, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnAvgPool2d(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnAvgPool2d failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
aclDestroyIntArray(strides);
aclDestroyIntArray(pads);
aclDestroyIntArray(kernel_size);
return;
}
MaxpoolBackwardOpRunner::MaxpoolBackwardOpRunner() : BaseOpRunner("MaxpoolBackward")
{
use_nchw = true;
}
void MaxpoolBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
aclIntArray *strides = nullptr;
aclIntArray *pads = nullptr;
aclIntArray *dilations = nullptr;
aclIntArray *kernel_size = nullptr;
auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
strides = aclCreateIntArray(attr->poolStrides.data(), 2);
pads = aclCreateIntArray(attr->poolPads.data(), 2);
dilations = aclCreateIntArray(attr->poolDilations.data(), 2);
ret = aclnnMaxPool2dWithIndicesBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], kernel_size, strides, pads, dilations, attr->poolCeil, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnMaxPool2dWithIndicesBackward(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMaxPool2dWithIndicesBackward failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
aclDestroyIntArray(strides);
aclDestroyIntArray(pads);
aclDestroyIntArray(dilations);
aclDestroyIntArray(kernel_size);
return;
}
AvgpoolBackwardOpRunner::AvgpoolBackwardOpRunner() : BaseOpRunner("AvgpoolBackward")
{
use_nchw = true;
}
void AvgpoolBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
aclIntArray *strides = nullptr;
aclIntArray *pads = nullptr;
aclIntArray *kernel_size = nullptr;
auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
strides = aclCreateIntArray(attr->poolStrides.data(), 2);
pads = aclCreateIntArray(attr->poolPads.data(), 2);
ret = aclnnAvgPool2dBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], kernel_size, strides, pads, attr->countIncludePad, attr->divisorOverride, attr->divisorOverride, attr->poolCeil, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnAvgPool2dBackward(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnAvgPool2dBackward failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
aclDestroyIntArray(strides);
aclDestroyIntArray(pads);
aclDestroyIntArray(kernel_size);
return;
}
}

View File

@@ -0,0 +1,43 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class MaxpoolOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
MaxpoolOpRunner();
};
class AvgpoolOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
AvgpoolOpRunner();
};
class MaxpoolBackwardOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
MaxpoolBackwardOpRunner();
};
class AvgpoolBackwardOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
AvgpoolBackwardOpRunner();
};
}