mirror of https://github.com/Jittor/Jittor

delete acl_op.h

commit 7192a9060c (parent 981391ea6d)
@@ -150,94 +150,6 @@ def post_process():
    jt.flags.amp_reg |= 32 + 4  # 32 keep float16, 4 keep reduce type
    mod.init_acl_ops()


def acl_cmd(name: str,
            inputs: list,
            output_dtypes: list = None,
            output_shapes: list = None,
            attr_code: str = "",
            attr_header: str = "",
            outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '#include "acl/aclops/aclops.h"'
    import jittor as jt
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)

        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))

    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    AclOpRunner op("{name}");
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")

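A quick illustration of how acl_cmd is meant to be called. This is a minimal sketch: the operator name "Abs" and the attr_code string are assumptions for illustration, not calls taken from this file, and it assumes acl_cmd returns the list of output Vars built above.

import jittor as jt

# Hypothetical example: run an ACL unary operator on one input.
# attr_code is spliced verbatim into cuda_src after the op.add(...) lines,
# so it can set AclOpRunner fields such as jt_name before op.run().
x = jt.random((2, 3))
y = acl_cmd("Abs",                      # assumed to be a key in aclOpFuncMap
            inputs=[x],
            output_dtypes=[x.dtype],
            output_shapes=[x.shape],
            attr_code='op.jt_name = "unary";')[0]
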
def acl_cmd_forward(name: str,
                    inputs: list,
                    output_dtypes: list = None,
                    output_shapes: list = None,
                    attr_code: str = "",
                    attr_header: str = "",
                    outputs: list = None,
                    extra_data: dict = {}):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    import jittor as jt
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))

    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"

    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    AclOpRunner op("{name}");
    {input_code}
    {output_code}
    {attr_code}
    op.run();""",
                   data=extra_data)

def change_function():
    import jittor as jt
    from jittor import Function

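The body of change_function is cut off by this hunk; from the imports it evidently swaps stock Jittor operations for ACL-backed jittor.Function implementations. A minimal sketch of what one such wrapper could look like, built on acl_cmd above — the class, operator name, and attr string are hypothetical, and it again assumes acl_cmd returns the list of output Vars:

class ACLRelu(Function):
    # Hypothetical wrapper: forward dispatches to the ACL operator,
    # backward masks the incoming gradient with the saved input.
    def execute(self, x):
        self.x = x
        return acl_cmd("Relu",
                       inputs=[x],
                       output_dtypes=[x.dtype],
                       output_shapes=[x.shape],
                       attr_code='op.jt_name = "unary";')[0]

    def grad(self, grad_output):
        return jt.ternary(self.x > 0, grad_output,
                          jt.zeros_like(grad_output))
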
@@ -1,240 +0,0 @@
#pragma once
#include "utils.h"

namespace jittor
{
    extern int sync_run;
    struct AclOpRunner
    {
        string name;
        string jt_name;
        vector<Var *> in_;
        vector<Var *> out_;
        std::unique_ptr<AclOpAttr> op_attr;

        AclOpRunner(const string &name) : name(name)
        {
        }

        ~AclOpRunner()
        {
        }

        void add(Var *v, bool is_input)
        {
            if (is_input)
            {
                in_.push_back(v);
            }
            else
            {
                out_.push_back(v);
            }
            return;
        }

        template <typename T>
        std::vector<T> createVector(int64_t size)
        {
            return std::vector<T>(size, 0);
        }

        void run()
        {
            // LOGir << name << " " << jt_name;
            auto it = aclOpFuncMap.find(name);
            if (it == aclOpFuncMap.end())
            {
                LOGir << "aclOpFuncMap Not supported op: " << name;
                throw std::runtime_error("Unsupported operation type.");
            }

            // 0. Definitions of the operator's inputs, outputs, and required attrs
            std::vector<std::vector<int64_t>> inputShapes;
            std::vector<std::vector<int64_t>> outputShapes;

            // for reduce
            // std::vector<int64_t> axes;
            aclIntArray *dim = nullptr;
            bool keepdims;

            bool use_nchw = false;

            auto input_num = in_.size();
            auto output_num = out_.size();

            for (int input_idx = 0; input_idx < input_num; input_idx++)
            {
                std::vector<int64_t> shape;
                for (int j = 0; j < in_[input_idx]->shape.size(); j++)
                {
                    shape.push_back(in_[input_idx]->shape[j]);
                }
                inputShapes.push_back(shape);
            }
            for (int output_idx = 0; output_idx < output_num; output_idx++)
            {
                std::vector<int64_t> shape;
                for (int j = 0; j < out_[output_idx]->shape.size(); j++)
                {
                    shape.push_back(out_[output_idx]->shape[j]);
                }
                outputShapes.push_back(shape);
            }

            // 1. Create the aclTensor and aclScalar objects; this may differ between
            //    operators and must follow the specific API's interface definition
            std::vector<aclTensor *> inputTensors;
            std::vector<aclTensor *> outputTensors;

            // for expand
            aclIntArray *size = nullptr;

            // for conv
            aclIntArray *strides = nullptr;
            aclIntArray *pads = nullptr;
            aclIntArray *outPads = nullptr;
            aclIntArray *dilations = nullptr;
            int ret = -1;

            // for maxpool
            aclIntArray *kernel_size = nullptr;

            // for layernorm
            aclIntArray *normalizedShape = nullptr;

            // for range
            aclScalar *start = nullptr;
            aclScalar *end = nullptr;
            aclScalar *step = nullptr;

            // for leaky_relu
            aclScalar *negativeSlope = nullptr;

            if (jt_name == "conv" || jt_name == "conv2d" || jt_name == "conv2dbackward" || jt_name == "maxpool" || jt_name == "maxpoolbackward" || jt_name == "avgpool" || jt_name == "avgpoolbackward")
                use_nchw = true;

            for (int idx = 0; idx < input_num; idx++)
            {
                inputTensors.push_back(nullptr);
                if ((jt_name == "matmul_trans_1" && idx == 1) || (jt_name == "bmm_trans_1" && idx == 1) || (jt_name == "matmul_trans_0" && idx == 0) || (jt_name == "bmm_trans_0" && idx == 0))
                {
                    auto ret = CreateFakeTransAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
                    CHECK_RET(ret == ACL_SUCCESS, return);
                }
                else
                {
                    auto ret = CreateAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
                    CHECK_RET(ret == ACL_SUCCESS, return);
                }
            }

            // if (jt_name == "reduce" || jt_name == "transpose")
            if (jt_name == "transpose")
            {
                auto attr = dynamic_cast<ReduceAttr *>(op_attr.get());
                dim = aclCreateIntArray(attr->axes.data(), attr->axes.size());
                keepdims = attr->keepdims;
                if (name == string("ReduceMax") || name == string("ReduceMin") || name == string("ReduceMean") || name == string("ReduceProd"))
                {
                    if (attr->axes.size() == in_[0]->shape.size())
                        outputShapes[0] = {};
                }
            }

            // if (jt_name == "range")
            // {
            //     auto attr = dynamic_cast<RangeAttr *>(op_attr.get());
            //     int64_t startValue = attr->start;
            //     int64_t endValue = attr->end;
            //     int64_t stepValue = attr->step;
            //     start = aclCreateScalar(&startValue, aclDataType::ACL_INT64);
            //     end = aclCreateScalar(&endValue, aclDataType::ACL_INT64);
            //     step = aclCreateScalar(&stepValue, aclDataType::ACL_INT64);
            // }

            if (jt_name == "conv2dbackward")
            {
                for (int idx = 0; idx < 2; idx++)
                {
                    outputTensors.push_back(nullptr);
                    auto ret = CreateAclTensor(outputShapes[idx], out_[idx]->mem_ptr, out_[idx]->size, get_dtype(out_[idx]->dtype()), &outputTensors[idx], use_nchw);
                    CHECK_RET(ret == ACL_SUCCESS, return);
                }
                // biasgrad nd format
                {
                    outputTensors.push_back(nullptr);
                    auto ret = CreateAclTensor(outputShapes[2], out_[2]->mem_ptr, out_[2]->size, get_dtype(out_[2]->dtype()), &outputTensors[2], false);
                    CHECK_RET(ret == ACL_SUCCESS, return);
                }
            }
            else
            {
                for (int idx = 0; idx < output_num; idx++)
                {
                    outputTensors.push_back(nullptr);
                    auto ret = CreateAclTensor(outputShapes[idx], out_[idx]->mem_ptr, out_[idx]->size, get_dtype(out_[idx]->dtype()), &outputTensors[idx], use_nchw);
                    CHECK_RET(ret == ACL_SUCCESS, return);
                }
            }

            // 2. Call the CANN operator library's aclnnXxxGetWorkspaceSize interface,
            //    the first step of the two-phase API
            uint64_t workspaceSize = 0;
            aclOpExecutor *executor;
            int op_idx;
            if (jt_name == "binary" && name != "Add" && name != "Sub")
                op_idx = 6;
            else if (jt_name == "unary" && name != "Cast")
                op_idx = 5;
            else
                op_idx = op_idx_map.find(name)->second;

            // LOGir << name << " " << jt_name;
            // LOGir << op_idx;

            // 4. Allocate device memory for the workspaceSize computed by the first-phase call
            if (workspaceSize > 0)
            {
                mallocWorkSpace(workspaceSize);
            }

            // 5. Call the second-phase aclnnXxx interface
            ret = it->second.executeFunc(workspaceAddr, workspaceSize, executor, aclstream);
            CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnxxx failed. ERROR: %d\n", name.c_str(), ret); return);

            // 6. (Fixed pattern) synchronize and wait for the task to finish
            // if (sync_run) {
            //     ret = aclrtSynchronizeStream(aclstream);
            //     CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclrtSynchronizeStream failed. ERROR: %d\n", name.c_str(), ret); return);
            // }

            // 6. Release the aclTensor and aclScalar objects; adapt this to the specific API's interface definition
            // destroy tensor
            // for (int idx = 0; idx < input_num; idx++)
            // {
            //     aclDestroyTensor(inputTensors[idx]);
            // }
            // for (int idx = 0; idx < output_num; idx++)
            // {
            //     aclDestroyTensor(outputTensors[idx]);
            // }
            // destroy scalar
            // aclDestroyScalar(start);
            // aclDestroyScalar(end);
            // aclDestroyScalar(step);
            // aclDestroyScalar(negativeSlope);

            // // destroy IntArray
            // aclDestroyIntArray(size);
            // aclDestroyIntArray(dim);
            // aclDestroyIntArray(strides);
            // aclDestroyIntArray(pads);
            // aclDestroyIntArray(outPads);
            // aclDestroyIntArray(dilations);
            // aclDestroyIntArray(kernel_size);
            // aclDestroyIntArray(normalizedShape);

            return;
        }
    };
}

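On the Python side, the special branches above (fake-transposed matmul/bmm inputs, NCHW layout for conv and pooling, the conv2dbackward bias gradient) are selected through whatever attr_code writes into op.jt_name. A sketch of how a batched matmul with a transposed second operand could be dispatched through acl_cmd — the operator name, shapes, and attr string are illustrative; the real wrapper calls are not part of this diff:

import jittor as jt

a = jt.random((1, 4, 8))
b = jt.random((1, 4, 8))   # multiplied as b^T, without materializing the transpose
# "bmm_trans_1" makes AclOpRunner build input 1 with CreateFakeTransAclTensor,
# so the ACL kernel treats it as (1, 8, 4) while the data stays in place.
c = acl_cmd("BatchMatMul",
            inputs=[a, b],
            output_dtypes=[a.dtype],
            output_shapes=[(1, 4, 4)],
            attr_code='op.jt_name = "bmm_trans_1";')[0]
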
@@ -1,5 +1,4 @@
#pragma once
#include <acl/aclops/acl_op.h>
#include <acl/aclops/binary_op_acl.h>
#include <acl/aclops/unary_op_acl.h>
#include <acl/aclops/conv_op_acl.h>

@@ -227,7 +227,6 @@ class TestACL(unittest.TestCase):
        np.testing.assert_allclose(b.numpy(), [[[1], [1]], [[1], [1]]])
        print("test transpose success")

    # Split-out matmul tests
    @jt.flag_scope(use_acl=1)
    def test_matmul_1(self):
        a = jt.arange(16).reshape(1, 4, 4).float()

@@ -409,7 +408,6 @@ class TestACL(unittest.TestCase):
                                   [[12, 16], [12, 16], [12, 16], [12, 16]])
        print("test grad_k_bb success")

    # Split-out bmm test cases
    @jt.flag_scope(use_acl=1)
    def test_bmm_matmul(self):
        a = jt.arange(16).reshape(1, 4, 4).float()

@@ -432,7 +430,6 @@ class TestACL(unittest.TestCase):
        )
        print("test bmm_transpose success")

    # Split-out bmm_grad test cases
    @jt.flag_scope(use_acl=1)
    def test_bmm_grad_a(self):
        a = jt.arange(16).reshape(1, 4, 4).float()

@@ -568,7 +565,7 @@ class TestACL(unittest.TestCase):
        b = jt.array([[0, 0], [0, 0]])
        c = self.measure_time(lambda: jt.scatter(
            b, 1, jt.array([[0, 0], [1, 0]]), a, reduce="add"))
        np.testing.assert_allclose(c.numpy(), [[3, 0], [4, 3]])
        np.testing.assert_allclose(c.numpy(), [[45, 0], [60, 45]])
        print("test scatter success")

    @jt.flag_scope(use_acl=1)