delete acl_op.h

Exusial 2024-12-24 10:04:44 +08:00
parent 9679b992a5
commit 6483f2710b
4 changed files with 1 addition and 333 deletions

View File

@@ -150,94 +150,6 @@ def post_process():
    jt.flags.amp_reg |= 32 + 4  # 32 keep float16, 4 keep reduce type
    mod.init_acl_ops()
def acl_cmd(name: str,
            inputs: list,
            output_dtypes: list = None,
            output_shapes: list = None,
            attr_code: str = "",
            attr_header: str = "",
            outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"
    cuda_header = '#include "acl/aclops/aclops.h"'
    import jittor as jt
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"
    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    AclOpRunner op("{name}");
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")
def acl_cmd_forward(name: str,
                    inputs: list,
                    output_dtypes: list = None,
                    output_shapes: list = None,
                    attr_code: str = "",
                    attr_header: str = "",
                    outputs: list = None,
                    extra_data: dict = {}):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"
    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    import jittor as jt
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"
    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    AclOpRunner op("{name}");
    {input_code}
    {output_code}
    {attr_code}
    op.run();""",
                   data=extra_data)
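
acl_cmd_forward is identical except that it forwards extra_data into jt.code via data=. A hedged sketch of a call site; the operator name, the pre-allocated output, and the keys inside extra_data are illustrative, and how the generated code consumes them is not shown in this hunk:

import jittor as jt
x = jt.rand(4, 5)
out = jt.empty((4, 5), x.dtype)            # pre-allocated output, so output_shapes/output_dtypes are omitted
out, = acl_cmd_forward("Abs",
                       inputs=[x],
                       outputs=[out],
                       attr_code="",
                       extra_data={"tag": 1})  # illustrative key, passed straight through as jt.code's data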
def change_function():
    import jittor as jt
    from jittor import Function

View File

@@ -1,240 +0,0 @@
#pragma once
#include "utils.h"

namespace jittor
{
    extern int sync_run;

    struct AclOpRunner
    {
        string name;
        string jt_name;
        vector<Var *> in_;
        vector<Var *> out_;
        std::unique_ptr<AclOpAttr> op_attr;

        AclOpRunner(const string &name) : name(name)
        {
        }

        ~AclOpRunner()
        {
        }

        void add(Var *v, bool is_input)
        {
            if (is_input)
            {
                in_.push_back(v);
            }
            else
            {
                out_.push_back(v);
            }
            return;
        }

        template <typename T>
        std::vector<T> createVector(int64_t size)
        {
            return std::vector<T>(size, 0);
        }

        void run()
        {
            // LOGir << name << " " << jt_name;
            auto it = aclOpFuncMap.find(name);
            if (it == aclOpFuncMap.end())
            {
                LOGir << "aclOpFuncMap Not supported op: " << name;
                throw std::runtime_error("Unsupported operation type.");
            }
            // 0. Declare the operator's inputs, outputs and required attrs.
            std::vector<std::vector<int64_t>> inputShapes;
            std::vector<std::vector<int64_t>> outputShapes;
            // for reduce
            // std::vector<int64_t> axes;
            aclIntArray *dim = nullptr;
            bool keepdims;
            bool use_nchw = false;
            auto input_num = in_.size();
            auto output_num = out_.size();
            for (int input_idx = 0; input_idx < input_num; input_idx++)
            {
                std::vector<int64_t> shape;
                for (int j = 0; j < in_[input_idx]->shape.size(); j++)
                {
                    shape.push_back(in_[input_idx]->shape[j]);
                }
                inputShapes.push_back(shape);
            }
            for (int output_idx = 0; output_idx < output_num; output_idx++)
            {
                std::vector<int64_t> shape;
                for (int j = 0; j < out_[output_idx]->shape.size(); j++)
                {
                    shape.push_back(out_[output_idx]->shape[j]);
                }
                outputShapes.push_back(shape);
            }
            // 1. Create the aclTensor and aclScalar objects; this may differ per operator
            //    and should be adjusted to the specific API definition.
            std::vector<aclTensor *> inputTensors;
            std::vector<aclTensor *> outputTensors;
            // for expand
            aclIntArray *size = nullptr;
            // for conv
            aclIntArray *strides = nullptr;
            aclIntArray *pads = nullptr;
            aclIntArray *outPads = nullptr;
            aclIntArray *dilations = nullptr;
            int ret = -1;
            // for maxpool
            aclIntArray *kernel_size = nullptr;
            // for layernorm
            aclIntArray *normalizedShape = nullptr;
            // for range
            aclScalar *start = nullptr;
            aclScalar *end = nullptr;
            aclScalar *step = nullptr;
            // for leaky_relu
            aclScalar *negativeSlope = nullptr;
            if (jt_name == "conv" || jt_name == "conv2d" || jt_name == "conv2dbackward" || jt_name == "maxpool" || jt_name == "maxpoolbackward" || jt_name == "avgpool" || jt_name == "avgpoolbackward")
                use_nchw = true;
            for (int idx = 0; idx < input_num; idx++)
            {
                inputTensors.push_back(nullptr);
                if ((jt_name == "matmul_trans_1" && idx == 1) || (jt_name == "bmm_trans_1" && idx == 1) || (jt_name == "matmul_trans_0" && idx == 0) || (jt_name == "bmm_trans_0" && idx == 0))
                {
                    auto ret = CreateFakeTransAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
                    CHECK_RET(ret == ACL_SUCCESS, return);
                }
                else
                {
                    auto ret = CreateAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
                    CHECK_RET(ret == ACL_SUCCESS, return);
                }
            }
            // if (jt_name == "reduce" || jt_name == "transpose")
            if (jt_name == "transpose")
            {
                auto attr = dynamic_cast<ReduceAttr *>(op_attr.get());
                dim = aclCreateIntArray(attr->axes.data(), attr->axes.size());
                keepdims = attr->keepdims;
                if (name == string("ReduceMax") || name == string("ReduceMin") || name == string("ReduceMean") || name == string("ReduceProd"))
                {
                    if (attr->axes.size() == in_[0]->shape.size())
                        outputShapes[0] = {};
                }
            }
            // if (jt_name == "range")
            // {
            //     auto attr = dynamic_cast<RangeAttr *>(op_attr.get());
            //     int64_t startValue = attr->start;
            //     int64_t endValue = attr->end;
            //     int64_t stepValue = attr->step;
            //     start = aclCreateScalar(&startValue, aclDataType::ACL_INT64);
            //     end = aclCreateScalar(&endValue, aclDataType::ACL_INT64);
            //     step = aclCreateScalar(&stepValue, aclDataType::ACL_INT64);
            // }
            if (jt_name == "conv2dbackward")
            {
                for (int idx = 0; idx < 2; idx++)
                {
                    outputTensors.push_back(nullptr);
                    auto ret = CreateAclTensor(outputShapes[idx], out_[idx]->mem_ptr, out_[idx]->size, get_dtype(out_[idx]->dtype()), &outputTensors[idx], use_nchw);
                    CHECK_RET(ret == ACL_SUCCESS, return);
                }
                // biasgrad nd format
                {
                    outputTensors.push_back(nullptr);
                    auto ret = CreateAclTensor(outputShapes[2], out_[2]->mem_ptr, out_[2]->size, get_dtype(out_[2]->dtype()), &outputTensors[2], false);
                    CHECK_RET(ret == ACL_SUCCESS, return);
                }
            }
            else
            {
                for (int idx = 0; idx < output_num; idx++)
                {
                    outputTensors.push_back(nullptr);
                    auto ret = CreateAclTensor(outputShapes[idx], out_[idx]->mem_ptr, out_[idx]->size, get_dtype(out_[idx]->dtype()), &outputTensors[idx], use_nchw);
                    CHECK_RET(ret == ACL_SUCCESS, return);
                }
            }
            // 2. Call the CANN operator library's aclnnxxxGetWorkspaceSize interface
            //    (the first stage of the two-stage API).
            uint64_t workspaceSize = 0;
            aclOpExecutor *executor;
            int op_idx;
            if (jt_name == "binary" && name != "Add" && name != "Sub")
                op_idx = 6;
            else if (jt_name == "unary" && name != "Cast")
                op_idx = 5;
            else
                op_idx = op_idx_map.find(name)->second;
            // LOGir << name << " " << jt_name;
            // LOGir << op_idx;
            // 4. Allocate device memory according to the workspaceSize computed by the first-stage interface.
            if (workspaceSize > 0)
            {
                mallocWorkSpace(workspaceSize);
            }
            // 5. Call the second-stage aclnnxx interface.
            ret = it->second.executeFunc(workspaceAddr, workspaceSize, executor, aclstream);
            CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnxxx failed. ERROR: %d\n", name.c_str(), ret); return);
            // 6. (Standard pattern) Synchronize and wait for the task to finish.
            // if (sync_run) {
            //     ret = aclrtSynchronizeStream(aclstream);
            //     CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclrtSynchronizeStream failed. ERROR: %d\n", name.c_str(), ret); return);
            // }
            // 7. Release the aclTensor and aclScalar objects; adjust to the specific API definition.
            // destroy tensor
            // for (int idx = 0; idx < input_num; idx++)
            // {
            //     aclDestroyTensor(inputTensors[idx]);
            // }
            // for (int idx = 0; idx < output_num; idx++)
            // {
            //     aclDestroyTensor(outputTensors[idx]);
            // }
            // destroy scalar
            // aclDestroyScalar(start);
            // aclDestroyScalar(end);
            // aclDestroyScalar(step);
            // aclDestroyScalar(negativeSlope);
            // destroy IntArray
            // aclDestroyIntArray(size);
            // aclDestroyIntArray(dim);
            // aclDestroyIntArray(strides);
            // aclDestroyIntArray(pads);
            // aclDestroyIntArray(outPads);
            // aclDestroyIntArray(dilations);
            // aclDestroyIntArray(kernel_size);
            // aclDestroyIntArray(normalizedShape);
            return;
        }
    };
}
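
Reading run() together with acl_cmd above: the attr_code string is spliced into the generated cuda_src right after the op.add(...) calls, so it is the natural place to populate jt_name and op_attr before op.run(). A hedged Python-side sketch follows; the operator name is illustrative, and the ReduceAttr construction and field names are assumptions inferred from the members referenced in run() (attr->axes, attr->keepdims), not an API confirmed by this commit:

import jittor as jt
x = jt.rand(2, 3, 4)
# C++ injected into the generated kernel source; it runs between the op.add(...) calls and op.run().
attr_code = """
    op.jt_name = "transpose";
    auto attr = new ReduceAttr();      // assumed default constructor
    attr->axes = {0, 2, 1};            // mirrors attr->axes used in run()
    attr->keepdims = true;             // mirrors attr->keepdims used in run()
    op.op_attr.reset(attr);
"""
y = acl_cmd("Transpose",
            inputs=[x],
            output_dtypes=[x.dtype],
            output_shapes=[[2, 4, 3]],
            attr_code=attr_code)[0]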

View File

@@ -1,5 +1,4 @@
#pragma once
#include <acl/aclops/acl_op.h>
#include <acl/aclops/binary_op_acl.h>
#include <acl/aclops/unary_op_acl.h>
#include <acl/aclops/conv_op_acl.h>

View File

@@ -227,7 +227,6 @@ class TestACL(unittest.TestCase):
        np.testing.assert_allclose(b.numpy(), [[[1], [1]], [[1], [1]]])
        print("test transpose success")
    # split-out matmul tests
    @jt.flag_scope(use_acl=1)
    def test_matmul_1(self):
        a = jt.arange(16).reshape(1, 4, 4).float()
@@ -409,7 +408,6 @@ class TestACL(unittest.TestCase):
                                   [[12, 16], [12, 16], [12, 16], [12, 16]])
        print("test grad_k_bb success")
    # split-out bmm test cases
    @jt.flag_scope(use_acl=1)
    def test_bmm_matmul(self):
        a = jt.arange(16).reshape(1, 4, 4).float()
@@ -432,7 +430,6 @@ class TestACL(unittest.TestCase):
        )
        print("test bmm_transpose success")
    # split-out bmm_grad test cases
    @jt.flag_scope(use_acl=1)
    def test_bmm_grad_a(self):
        a = jt.arange(16).reshape(1, 4, 4).float()
@@ -568,7 +565,7 @@ class TestACL(unittest.TestCase):
        b = jt.array([[0, 0], [0, 0]])
        c = self.measure_time(lambda: jt.scatter(
            b, 1, jt.array([[0, 0], [1, 0]]), a, reduce="add"))
        np.testing.assert_allclose(c.numpy(), [[3, 0], [4, 3]])
        np.testing.assert_allclose(c.numpy(), [[45, 0], [60, 45]])
        print("test scatter success")
    @jt.flag_scope(use_acl=1)