mirror of https://github.com/Jittor/Jittor
commit d892a83d1c
@@ -611,6 +611,26 @@ def setup_nccl():
    nccl_ops = nccl.ops
    LOG.vv("Get nccl_ops: "+str(dir(nccl_ops)))

def setup_hccl():
    global hccl_ops

    hccl_src_dir = os.path.join(jittor_path, "extern", "acl", "hccl")
    hccl_src_files = []
    for r, _, f in os.walk(hccl_src_dir):
        for fname in f:
            hccl_src_files.append(os.path.join(r, fname))

    hccl_include_path = os.path.join(os.environ.get("ASCEND_TOOLKIT_HOME"), "aarch64-linux/include/hccl")
    hccl_lib_name = os.path.join(os.environ.get("ASCEND_TOOLKIT_HOME"), "aarch64-linux/lib64/libhccl.so")
    ctypes.CDLL(hccl_lib_name, dlopen_flags)

    hccl = compile_custom_ops(hccl_src_files,
                              extra_flags=f" -I\"{hccl_include_path}\" {mpi_compile_flags} ",
                              return_module=True, dlopen_flags=os.RTLD_GLOBAL | os.RTLD_NOW,
                              gen_name_="jittor_hccl_core")
    hccl_ops = hccl.ops
    LOG.vv("Get hccl_ops: "+str(dir(hccl_ops)))

def manual_link(flags):
    lib_dirs = []
    libs = []
@@ -708,8 +728,14 @@ cudnn = cublas = curand = cufft = cusparse = None
setup_mpi()
rank = mpi.world_rank() if in_mpi else 0
world_size = mpi.world_size() if in_mpi else 1
# if has_acl:
#     setup_hccl()
# elif has_cuda:
#     setup_nccl()
#     setup_cutt()
#     setup_cutlass()

setup_nccl()

setup_cutt()
setup_cutlass()
@@ -1188,7 +1188,22 @@ make_cache_dir(ck_path)

# build cache_compile
cc_flags += f" -I\"{os.path.join(jittor_path, 'src')}\" "
cc_flags += f" -I\"{os.path.join(jittor_path, 'extern')}\" "

ascend_toolkit_home = os.getenv('ASCEND_TOOLKIT_HOME')

if ascend_toolkit_home:
    cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include')}\" "
    cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include/acl')}\" "
    cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include/aclnn')}\" "
    cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include/aclnnop')}\" "
    cc_flags += f" -L\"{os.path.join(ascend_toolkit_home, 'lib64')}\" "
    # the toolkit ships libascendcl.so etc., so the -l names must not repeat the "lib" prefix
    cc_flags += " -lascendcl "
    cc_flags += " -lnnopbase "
    cc_flags += " -lopapi "

cc_flags += py_include

check_cache_compile()
LOG.v(f"Get cache_compile: {jit_utils.cc}")
File diff suppressed because it is too large
@@ -1,6 +1,6 @@
// ***************************************************************
// Copyright (c) 2023 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
@@ -11,23 +11,27 @@ using std::unordered_map;

typedef int aclError;

static inline unordered_map<aclError, string> gen_map(string s)
{
    unordered_map<aclError, string> smap;
    for (int i = 0; i < s.size(); i++)
    {
        if (s[i] == ';')
        {
            int j = s.rfind(" ", i);
            int code = std::stoi(s.substr(j + 1, i - j - 1));
            int k = s.rfind(" ", j - 1);
            int l = s.rfind(" ACL_", k - 1);
            smap[code] = s.substr(l + 1, k - l - 1);
        }
    }
    return smap;
}

string acl_error_to_string(aclError error)
{
    static unordered_map<aclError, string> acl_error_map = gen_map(R"(
// from acl_base.h
static const int ACL_ERROR_INVALID_PARAM = 100000;
static const int ACL_ERROR_UNINITIALIZE = 100001;
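For reference, a self-contained sketch (not part of the commit) of the backward-scan parsing that gen_map performs: it pastes the error-code definitions from acl_base.h as raw text, then recovers each numeric code and its " ACL_..." symbol name by searching backwards from every semicolon.

#include <iostream>
#include <string>
#include <unordered_map>

int main() {
    std::string s = R"(
static const int ACL_ERROR_INVALID_PARAM = 100000;
static const int ACL_ERROR_UNINITIALIZE = 100001;
)";
    std::unordered_map<int, std::string> smap;
    for (size_t i = 0; i < s.size(); i++) {
        if (s[i] != ';') continue;
        size_t j = s.rfind(' ', i);                 // space before the numeric code
        int code = std::stoi(s.substr(j + 1, i - j - 1));
        size_t k = s.rfind(' ', j - 1);             // space before '='
        size_t l = s.rfind(" ACL_", k - 1);         // space before the symbol name
        smap[code] = s.substr(l + 1, k - l - 1);
    }
    for (auto &kv : smap)
        std::cout << kv.first << " -> " << kv.second << "\n";  // e.g. 100000 -> ACL_ERROR_INVALID_PARAM
    return 0;
}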
@@ -1,6 +1,6 @@
// ***************************************************************
// Copyright (c) 2023 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
@@ -10,267 +10,311 @@
#include "utils/str_utils.h"
#include <chrono>
#include <thread>
#include "aclnn/aclnn.h"

namespace jittor
{

uint64_t acl_jittor_tid;
int acl_jittor_thread_running = 0;
aclrtStream aclstream;
void *workspaceAddr = nullptr;
uint64_t nowWorkSpaceSize = 0;

#define CHECK_ACL(x) ASSERTop(x, ==, 0)

void mallocWorkSpace(uint64_t size)
{
    uint64_t alloc_size = size + 32;
    alloc_size = ((alloc_size - 1) / 32 + 1) * 32;
    if (alloc_size > nowWorkSpaceSize)
    {
        aclrtFree(workspaceAddr);
        nowWorkSpaceSize = alloc_size;
        auto ret = aclrtMalloc(&workspaceAddr, nowWorkSpaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("allocate workspace failed. ERROR: %d\n", ret); return);
    }
}

static void *acl_jittor_process_callback(void *)
{
    acl_jittor_thread_running = 1;
    while (acl_jittor_thread_running)
    {
        // LOGir << "acl_jittor_process_callback";
        auto ret = aclrtProcessReport(1000);
        if (ret)
        {
            if (acl_jittor_thread_running && ret != ACL_ERROR_RT_REPORT_TIMEOUT && ret != ACL_ERROR_RT_THREAD_SUBSCRIBE)
                LOGir << "aclrtProcessReport:" << ret << acl_error_to_string(ret);
            break;
        }
    }
    acl_jittor_thread_running = 0;
    return (void *)0;
}

struct acl_jittor_initer
{
    int32_t deviceId;
    acl_jittor_initer()
    {
        CHECK_ACL(aclInit(nullptr));
        uint device_count = 0;
        deviceId = 0;
        // query the number of available devices
        CHECK_ACL(aclrtGetDeviceCount(&device_count));
        LOGi << "Found ACL device number:" << device_count;
        CHECK_ACL(aclrtSetDevice(deviceId));
        CHECK_ACL(aclrtCreateStream(&aclstream));
        // pthread_create(&acl_jittor_tid, nullptr, acl_jittor_process_callback, 0);
    }

    ~acl_jittor_initer()
    {
        acl_jittor_thread_running = 0;
        // CHECK_ACL(aclrtUnSubscribeReport(acl_jittor_tid, 0));
        aclrtDestroyStream(aclstream);
        aclrtResetDevice(deviceId);
        CHECK_ACL(aclFinalize());
        if (nowWorkSpaceSize > 0)
        {
            aclrtFree(workspaceAddr);
        }
    }

} _acl_jittor_initer;

string process_acl(const string &src, const string &name, const map<string, string> &kargs)
{
    if (endswith(name, "_jittor.cc"))
        return src;
    // static vector<string> dont_compile = {"fp16_emu.cc"};
    // for (auto& s : dont_compile)
    //     if (endswith(name, s))
    //         return " ";
    static unordered_set<string> cuda_headers = {
        "cuda_runtime", "cudnn", "driver_types",
        "cuda_fp16", "cuda_runtime_api", "fp16_emu",
        "cudnn_rnn_descriptor", "cublas_v2", "cublas_wrapper",
        "curand", "curand_wrapper", "cufft", "cufftXt",
        "CudaUtils", "cutt", "cudnn_wrapper", "cuda_bf16"};
    static unordered_set<string> fake_class = {
        "cudnnHandle_t", "cudnnConvolutionBwdFilterAlgo_t",
        "cudnnConvolutionBwdDataAlgo_t", "cudnnConvolutionFwdAlgo_t",
        "cufftHandle"};
    try
    {
        auto tokens = token_split(src);
        int edit = 0;
        for (int i = 0; i < tokens.size(); i++)
        {
            auto &token = tokens[i];
            if (cuda_headers.count(token))
                token = "acl_jittor", edit++;
            else if (fake_class.count(token))
                token = "int", edit++;
            else if (token == "CUDA")
                token = "ACL", edit++;
            else if (startswith(token, "cuda"))
            {
                if (token.size() >= 5 && token[4] >= 'A' && token[4] <= 'Z')
                {
                    if (token == "cudaGetDeviceCount")
                    {
                        token_replace(tokens, i, "($1);", "((uint*)$1);");
                    }
                    else if (token == "cudaLaunchHostFunc")
                    {
                        // ACL_CALLBACK_BLOCK for 310
                        token_replace(tokens, i, "LaunchHostFunc($1,$2,$3)",
                                      "LaunchCallback($2,$3,ACL_CALLBACK_NO_BLOCK,$1)");
                    }
                    else if (token == "cudaMemcpy")
                        token_replace(tokens, i, "cudaMemcpy($1,$2,$3,",
                                      "aclrtMemcpy($1,$3,$2,$3,");
                    else if (token == "cudaMemcpyAsync")
                        token_replace(tokens, i, "cudaMemcpyAsync($1,$2,$3,",
                                      "aclrtMemcpyAsync($1,$3,$2,$3,");
                    else if (token == "cudaMemcpyDeviceToHost")
                        token = "ACL_MEMCPY_DEVICE_TO_HOST";
                    else if (token == "cudaMemcpyDefault")
                        token = "ACL_MEMCPY_HOST_TO_DEVICE";
                    else if (token == "cudaMemcpyHostToDevice")
                        token = "ACL_MEMCPY_HOST_TO_DEVICE";
                    else if (token == "cudaMemcpyDeviceToDevice")
                        token = "ACL_MEMCPY_DEVICE_TO_DEVICE";
                    else if (token == "cudaMallocManaged" || token == "cudaMalloc")
                    {
                        // unified address not supported
                        token = "aclrtMalloc";
                        token_replace(tokens, i, "($1,$2)",
                                      "($1,$2,ACL_MEM_MALLOC_HUGE_FIRST)");
                    }
                    else if (token == "cudaMemGetInfo")
                        token_replace(tokens, i, "cudaMemGetInfo($1,$2)",
                                      "aclrtGetMemInfo(ACL_DDR_MEM,$1,$2)");
                    else if (token == "cudaGetLastError")
                        token_replace(tokens, i, "cudaGetLastError()", "0");
                    else if (token == "cudaStreamCreateWithFlags")
                        token_replace(tokens, i - 1,
                                      "(cudaStreamCreateWithFlags($1,$2));",
                                      "(aclrtCreateStream($1)); checkAclErrors(aclrtSubscribeReport(acl_jittor_tid,*$1));");
                    else if (token == "cudaEventCreate")
                        token_replace(tokens, i,
                                      "cudaEventCreate($1,$2)",
                                      "aclrtCreateEvent($1)");
                    else if (token == "cudaDeviceSynchronize")
                        token = "aclrtSynchronizeDevice";
                    else if (token == "cudaStreamDestroy")
                        token_replace(tokens, i, "cudaStreamDestroy($1)",
                                      "(aclrtUnSubscribeReport(acl_jittor_tid,$1), aclrtDestroyStream($1))");
                    else if (token == "cudaEventDestroy")
                        token = "aclrtDestroyEvent";
                    else if (token == "cudaEventRecord")
                        token = "aclrtRecordEvent";
                    else if (token == "cudaStreamWaitEvent")
                        token_replace(tokens, i,
                                      "cudaStreamWaitEvent($1,$2,$3)",
                                      "aclrtStreamWaitEvent($1,$2)");

                    if (token.size() && token[0] == 'c')
                        token = "aclrt" + token.substr(4);
                    if (endswith(token, "_t"))
                        token = token.substr(0, token.size() - 2);
                    edit++;
                }
            }
            else if (token == "_cudaGetErrorEnum")
            {
                token_replace(tokens, i, "_cudaGetErrorEnum($1)", "(acl_error_to_string($1))");
                edit++;
            }
            else if (token == "checkCudaErrors")
                token = "checkAclErrors";
            else if (token == "JPU")
            {
                edit++;
                string new_code;
                if (tokens[i + 2] == "op_compiler")
                    token_replace(tokens, i,
                                  "JPU(op_compiler($1,$2,$3))",
                                  "acl_jittor_op_compiler($1,$2,$3)");
                else if (tokens[i + 2] == "header")
                    new_code = "#include \"acl_jittor.h\"";
                if (new_code.size())
                    token_replace(tokens, i, "JPU($1)", new_code);
            }
            else if (token == "use_cuda_managed_allocator" && tokens[i + 1][0] == ',')
            {
                tokens[i + 2] = "0"; // disable unified address
            }
        }
        if (!edit)
            return src;
        string new_src = join(tokens, "");
        // if (name == "executor.cc") {
        //     new_src = string("#include <Python.h>\n#include <pystate.h>\n#include <common.h>\n")+
        //         "namespace jittor { void acl_op_exec(Op*); }\n" +
        //         replace(new_src, "op->do_run_after_prepare(jkl);",
        //             R"({
        //                 acl_op_exec(op);
        //             })");
        // }
        if (name == "profiler.cc")
        {
            new_src = token_replace_all(new_src, ".cc", ".tikcc");
        }
        // LOGir << name << (name == "pass_manager.cc");
        if (name == "pass_manager.cc")
        {
            LOGir << "replace" << name;
            new_src = token_replace_all(new_src, "run_pass<FloatAtomicFixPass>();", "WTF");
        }
        // ????????
        return new_src;
    }
    catch (const std::exception &e)
    {
        LOGe << "process acl error:" << e.what();
        LOGe << "name:" << name;
        throw;
    }
}

void acl_jittor_op_compiler(string &filename, string &src, bool is_acl, string &extra_flags)
{
    if (!is_acl)
        return;
    string new_src = process_acl(src, "", {});
    new_src = replace(new_src, R"(#include "misc/cuda_atomic.h")", "");
    new_src = replace(new_src, R"(#include "misc/cuda_limits.h")", "");
    new_src = replace(new_src, "__global__", "__ai_device_entry__");
    new_src = token_replace_all(new_src, "__launch_bounds__($1)", "");
    new_src = token_replace_all(new_src, "int thread_num = $1;", "int thread_num = 1;");
    new_src = token_replace_all(new_src, "tn0=std::max(tn0, $1);", "");
    new_src = token_replace_all(new_src, "<<<$1>>>", "<<<1,0>>>");
    new_src = token_replace_all(new_src, "int thread_id = $1;", "int thread_id = 1;");
    // for inc error
    new_src = token_replace_all(new_src, "for ($1+=$2)", "for ($1++)");
    // bit op error
    new_src = token_replace_all(new_src, "int tnum$1;", "");
    new_src = token_replace_all(new_src, "int p1$1;", "");
    new_src = token_replace_all(new_src, "int p2$1;", "");
    new_src = token_replace_all(new_src, "int tn$1=$2;", "int tn$1=0;");
    new_src = token_replace_all(new_src, "int tid$1=$2;", "int tid$1=0;");
    src = new_src;

    new_src = token_replace_all(new_src, "atomicAdd(&$1,$2);", "$1=$1+$2;");
    // new_src = token_replace_all(new_src, "bool", "int8");
    new_src = token_replace_all(new_src, "::numeric_min<float32>()", "-1e30");
    new_src = token_replace_all(new_src, "::numeric_max<float32>()", "1e30");
    // TODO: support max
    unordered_map<string, string> opmap = {
        // {"::max","tikcc::scalar_max"},
        {"::sqrtf", "tikcc::scalar_sqrt"}};
    auto ss = split(new_src, ";");
    for (auto &s : ss)
    {
        if (s.find("?") != string::npos)
        {
            s = token_replace_all(s + ";", "auto $1=$2?$3:$4;", "auto $1=$3;if (!($2)) $1=$4;");
        }
        if (s.find("::max") != string::npos)
        {
            if (s.find("auto") == string::npos)
            {
                s = token_replace_all(s + ";", " $1=$4::max($2,$3);", " $1=$2;if ($2 < $3) $1=$3;");
            }
            else
            {
                s = token_replace_all(s + ";", "auto $1=$4::max($2,$3);", "auto $1=$2;if ($2 < $3) $1=$3;");
            }
        }
        for (auto &kv : opmap)
        {
            if (s.find(kv.first) != string::npos)
            {
                if (s.find("auto") == string::npos)
                {
                    // $1 = op($2) --> op($1, $2)
                    s = token_replace_all(s + ";", " $1= " + kv.first + "($2);", kv.second + "($1, $2);");
                }
                else
                {
                    // auto $1 = op($2) --> float32 $1; op($1, $2);
                    s = token_replace_all(s + ";", "auto $1= " + kv.first + "($2);", "float32 $1; " + kv.second + "($1, $2);");
                }
            }
        }
        // s = token_replace_all(s+";", "auto $1=$2?$3:$4;", "auto $1=$3;if (!($2)) $1=$4;");
        // if (s.find("::max") != string::npos) {
        //     s = token_replace_all(s+";", " $1= ::max($2);", "tikcc::scalar_max($1, $2);");
        // }
    }
    new_src = join(ss, ";");
    src = new_src;
}

}
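A standalone sketch (not from this commit) of the grow-only workspace policy in mallocWorkSpace above: the request is padded by 32 bytes, rounded up to a 32-byte multiple, and the buffer is reallocated only when it must grow; plain malloc/free stand in for aclrtMalloc/aclrtFree so the sketch runs without the ACL SDK.

#include <cstdint>
#include <cstdio>
#include <cstdlib>

static void *workspace = nullptr;
static uint64_t workspaceSize = 0;

void ensureWorkspace(uint64_t size) {
    uint64_t alloc_size = size + 32;
    alloc_size = ((alloc_size - 1) / 32 + 1) * 32;  // round up to a 32-byte multiple
    if (alloc_size > workspaceSize) {               // grow-only: never shrink
        free(workspace);                            // stand-in for aclrtFree
        workspaceSize = alloc_size;
        workspace = malloc(workspaceSize);          // stand-in for aclrtMalloc
    }
}

int main() {
    ensureWorkspace(100);  // 100 + 32 = 132, rounds up to 160
    printf("%llu\n", (unsigned long long)workspaceSize);
    ensureWorkspace(50);   // smaller request: no reallocation
    printf("%llu\n", (unsigned long long)workspaceSize);
    return 0;
}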
@@ -1,20 +1,700 @@
// ***************************************************************
// Copyright (c) 2023 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#pragma once
#include "common.h"
#include "aclnn/aclnn.h"
#include <acl/acl.h>

std::string acl_error_to_string(aclError error);

namespace jittor
{

EXTERN_LIB uint64_t acl_jittor_tid;
EXTERN_LIB aclrtStream aclstream;
EXTERN_LIB void *workspaceAddr;

void mallocWorkSpace(uint64_t size);
void acl_jittor_op_compiler(string &filename, string &src, bool is_acl, string &extra_flags);
struct AclOpFunctions
{
    // for Unary and Nonzero
    std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncUnaryNonzero;
    // for Cast
    std::function<aclnnStatus(aclTensor *, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncCast;
    // for Binary
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncBinary;
    // for Add and Sub
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncAdd;
    // for Expand, permute, flip
    std::function<aclnnStatus(aclTensor *, aclIntArray *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncExpand;
    // for bmm and matmul
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, int8_t, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncMatmul;
    // for conv
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclIntArray *, int64_t, aclTensor *, int8_t, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncConv;
    // for reducesum, mean
    std::function<aclnnStatus(aclTensor *, aclIntArray *, bool, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncReduceSum;
    // for amax and amin
    std::function<aclnnStatus(aclTensor *, aclIntArray *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncAmax;
    // for conv backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclIntArray *, int, aclBoolArray *, int8_t, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncConvBackward;
    // for proddim
    std::function<aclnnStatus(aclTensor *, float, float, float, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncProdDim;
    // for select, where
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSelect;
    // for random_uniform and random_normal
    std::function<aclnnStatus(aclTensor *, int64_t, int64_t, int64_t, int64_t, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncRandom;
    // for maxpool
    std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncMaxPool;
    // for maxpool backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncMaxPoolBackward;
    // for avgpool
    std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, bool, int64_t, int8_t, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncAvgPool;
    // for avgpool backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, bool, int64_t, int8_t, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncAvgPoolBackward;
    // for concat
    std::function<aclnnStatus(aclTensorList *, uint64_t, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncConcat;
    // for gather
    std::function<aclnnStatus(aclTensor *, uint64_t, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncGather;
    // for cumsum
    std::function<aclnnStatus(aclTensor *, uint64_t, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncCumsum;
    // for scatter
    std::function<aclnnStatus(aclTensor *, uint64_t, aclTensor *, aclTensor *, uint64_t, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncScatter;
    // for index
    std::function<aclnnStatus(aclTensor *, aclTensorList *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncIndex;
    // for stridesliceassignv2
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncStridedSliceAssignV2;
    // for slicev2
    std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSliceV2;
    // for indexputimpl
    std::function<aclnnStatus(aclTensor *, aclTensorList *, aclTensor *, bool, bool, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncIndexPutImpl;
    // for range
    std::function<aclnnStatus(aclScalar *, aclScalar *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncRange;
    // for leaky_relu
    std::function<aclnnStatus(aclTensor *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncLeakyRelu;
    // for leaky_relu backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclScalar *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncLeakyReluBackward;
    // for dropout
    std::function<aclnnStatus(aclTensor *, double, bool, int64_t, int64_t, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncDropout;
    // for dropout backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, double, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncDropoutBackward;
    // for split with size
    std::function<aclnnStatus(aclTensor *, aclIntArray *, int64_t, aclTensorList *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSplitWithSize;

    // for silu
    // std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSilu;

    // for silu backward
    // std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSiluBackward;

    // for sigmoid
    // std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSigmoid;

    // for sigmoid backward
    // std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSigmoidBackward;

    // for embedding
    // std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncEmbedding;

    // for embedding backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t, uint64_t, bool, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncEmbeddingBackward;

    // for InplaceMaskedScatter MaskedSelect
    // std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncInplaceMaskedScatter;
    std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, aclrtStream)> executeFunc;

    // for flashattention
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *,
                              aclIntArray *, aclIntArray *, aclIntArray *, double, double, int64_t, int64_t, int64_t, char *, int64_t, int64_t, int64_t,
                              aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
        getWorkspaceSizeFuncFalshAttention;

    // for flashattention backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *,
                              aclIntArray *, aclIntArray *, aclIntArray *, double, double, int64_t, int64_t, int64_t, char *, int64_t, int64_t, int64_t,
                              aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
        getWorkspaceSizeFuncFalshAttentionBackward;

    // for batchnorm
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, bool, double, double, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncBatchNorm;

    // for batchnorm backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, bool, double, aclBoolArray *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncBatchNormBackward;

    // for layernorm
    std::function<aclnnStatus(aclTensor *, aclIntArray *, aclTensor *, aclTensor *, double, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncLayerNorm;

    // for ROPE
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, int64_t, uint64_t *, aclOpExecutor **)>
        getWorkspaceSizeFuncRotaryPosEmb;

    // default constructor
    AclOpFunctions() = default;

    // for Unary and Nonzero
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, aclrtStream)> execf)
        : getWorkspaceSizeFuncUnaryNonzero(gwsf), executeFunc(execf) {}

    // for Cast
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, aclrtStream)> execf)
        : getWorkspaceSizeFuncCast(gwsf), executeFunc(execf) {}

    // for Binary
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncBinary(gwsf), executeFunc(execf) {}

    // for Add and Sub
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncAdd(gwsf), executeFunc(execf) {}

    // for Expand, flip
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncExpand(gwsf), executeFunc(execf) {}

    // for Matmul
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, int8_t, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncMatmul(gwsf), executeFunc(execf) {}

    // for conv
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclIntArray *, int64_t, aclTensor *, int8_t, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncConv(gwsf), executeFunc(execf) {}

    // for reducesum, mean
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, bool, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncReduceSum(gwsf), executeFunc(execf) {}

    // for amax amin
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncAmax(gwsf), executeFunc(execf) {}

    // for conv backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclIntArray *, int, aclBoolArray *, int8_t, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncConvBackward(gwsf), executeFunc(execf) {}

    // for proddim
    AclOpFunctions(std::function<aclnnStatus(const aclTensor *, float, float, float, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncProdDim(gwsf), executeFunc(execf) {}

    // for select, where
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncSelect(gwsf), executeFunc(execf) {}

    // for random_normal
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, int64_t, int64_t, int64_t, int64_t, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncRandom(gwsf), executeFunc(execf) {}

    // for maxpool
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncMaxPool(gwsf), executeFunc(execf) {}

    // for maxpool backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncMaxPoolBackward(gwsf), executeFunc(execf) {}

    // for avgpool
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, bool, int64_t, int8_t, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncAvgPool(gwsf), executeFunc(execf) {}

    // for avgpool backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, bool, int64_t, int8_t, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncAvgPoolBackward(gwsf), executeFunc(execf) {}

    // for concat
    AclOpFunctions(std::function<aclnnStatus(aclTensorList *, int64_t, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncConcat(gwsf), executeFunc(execf) {}

    // for gather
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, int64_t, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncGather(gwsf), executeFunc(execf) {}

    // for cumsum
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, int64_t, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncCumsum(gwsf), executeFunc(execf) {}

    // for scatter
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, uint64_t, aclTensor *, aclTensor *, uint64_t, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncScatter(gwsf), executeFunc(execf) {}

    // for index
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensorList *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncIndex(gwsf), executeFunc(execf) {}

    // for stridesliceassignv2
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncStridedSliceAssignV2(gwsf), executeFunc(execf) {}

    // for slicev2
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncSliceV2(gwsf), executeFunc(execf) {}

    // for indexputimpl
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensorList *, aclTensor *, bool, bool, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncIndexPutImpl(gwsf), executeFunc(execf) {}

    // for range
    AclOpFunctions(std::function<aclnnStatus(aclScalar *, aclScalar *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncRange(gwsf), executeFunc(execf) {}

    // for leaky_relu
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncLeakyRelu(gwsf), executeFunc(execf) {}

    // for leaky_relu backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclScalar *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncLeakyReluBackward(gwsf), executeFunc(execf) {}

    // for dropout
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, double, bool, int64_t, int64_t, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncDropout(gwsf), executeFunc(execf) {}

    // for dropout backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, double, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncDropoutBackward(gwsf), executeFunc(execf) {}

    // for embedding backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t, uint64_t, bool, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncEmbeddingBackward(gwsf), executeFunc(execf) {}

    // for split with size
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, int64_t, aclTensorList *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncSplitWithSize(gwsf), executeFunc(execf) {}

    // for flash attention
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *,
                                             aclIntArray *, aclIntArray *, aclIntArray *, double, double, int64_t, int64_t, int64_t, char *, int64_t, int64_t, int64_t,
                                             aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
                       gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncFalshAttention(gwsf), executeFunc(execf) {}

    // for flash attention backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *,
                                             aclIntArray *, aclIntArray *, aclIntArray *, double, double, int64_t, int64_t, int64_t, char *, int64_t, int64_t, int64_t,
                                             aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
                       gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncFalshAttentionBackward(gwsf), executeFunc(execf) {}

    // for batchnorm
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, bool, double, double, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
                       gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncBatchNorm(gwsf), executeFunc(execf) {}

    // for batchnorm backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, bool, double, aclBoolArray *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
                       gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncBatchNormBackward(gwsf), executeFunc(execf) {}

    // for layernorm
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, aclTensor *, aclTensor *, double, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
                       gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncLayerNorm(gwsf), executeFunc(execf) {}

    // for ROPE
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, const aclTensor *, const aclTensor *, int64_t, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncRotaryPosEmb(gwsf), executeFunc(execf) {}
};
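The many AclOpFunctions constructors above differ only in the std::function type of their first argument, so brace-initializing an entry from a raw aclnn function pointer selects the matching slot via overload resolution. A minimal standalone sketch of that pattern (toy types, not ACL's):

#include <functional>
#include <iostream>

struct Fns {
    std::function<int(int)> unary;
    std::function<int(int, int)> binary;
    // std::function's converting constructor is SFINAE-constrained, so only
    // the constructor whose signature matches the pointer is viable.
    Fns(std::function<int(int)> f) : unary(f) {}
    Fns(std::function<int(int, int)> f) : binary(f) {}
};

int neg(int a) { return -a; }
int add(int a, int b) { return a + b; }

int main() {
    Fns u(neg);  // picks the unary constructor
    Fns b(add);  // picks the binary constructor
    std::cout << u.unary(3) << " " << b.binary(1, 2) << "\n";  // prints: -3 3
    return 0;
}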
static std::unordered_map<std::string, AclOpFunctions> aclOpFuncMap = {
    {"Abs", AclOpFunctions(aclnnAbsGetWorkspaceSize, aclnnAbs)},
    {"Exp", AclOpFunctions(aclnnExpGetWorkspaceSize, aclnnExp)},
    {"Log", AclOpFunctions(aclnnLogGetWorkspaceSize, aclnnLog)},
    {"Sqrt", AclOpFunctions(aclnnSqrtGetWorkspaceSize, aclnnSqrt)},
    {"Ceil", AclOpFunctions(aclnnCeilGetWorkspaceSize, aclnnCeil)},
    {"Floor", AclOpFunctions(aclnnFloorGetWorkspaceSize, aclnnFloor)},
    {"Round", AclOpFunctions(aclnnRoundGetWorkspaceSize, aclnnRound)},
    {"Sin", AclOpFunctions(aclnnSinGetWorkspaceSize, aclnnSin)},
    {"Cos", AclOpFunctions(aclnnCosGetWorkspaceSize, aclnnCos)},
    {"Tan", AclOpFunctions(aclnnTanGetWorkspaceSize, aclnnTan)},
    {"Asin", AclOpFunctions(aclnnAsinGetWorkspaceSize, aclnnAsin)},
    {"Acos", AclOpFunctions(aclnnAcosGetWorkspaceSize, aclnnAcos)},
    {"Atan", AclOpFunctions(aclnnAtanGetWorkspaceSize, aclnnAtan)},
    {"Sinh", AclOpFunctions(aclnnSinhGetWorkspaceSize, aclnnSinh)},
    {"Cosh", AclOpFunctions(aclnnCoshGetWorkspaceSize, aclnnCosh)},
    {"Tanh", AclOpFunctions(aclnnTanhGetWorkspaceSize, aclnnTanh)},
    {"Asinh", AclOpFunctions(aclnnAsinhGetWorkspaceSize, aclnnAsinh)},
    {"Acosh", AclOpFunctions(aclnnAcoshGetWorkspaceSize, aclnnAcosh)},
    {"Atanh", AclOpFunctions(aclnnAtanhGetWorkspaceSize, aclnnAtanh)},
    {"Sigmoid", AclOpFunctions(aclnnSigmoidGetWorkspaceSize, aclnnSigmoid)},
    {"Erf", AclOpFunctions(aclnnErfGetWorkspaceSize, aclnnErf)},
    {"Erfinv", AclOpFunctions(aclnnErfinvGetWorkspaceSize, aclnnErfinv)},
    {"LogicalNot", AclOpFunctions(aclnnLogicalNotGetWorkspaceSize, aclnnLogicalNot)},
    {"BitwiseNot", AclOpFunctions(aclnnBitwiseNotGetWorkspaceSize, aclnnBitwiseNot)},
    {"Neg", AclOpFunctions(aclnnNegGetWorkspaceSize, aclnnNeg)},
    {"Cast", AclOpFunctions(aclnnCastGetWorkspaceSize, aclnnCast)},
    {"Maximum", AclOpFunctions(aclnnMaximumGetWorkspaceSize, aclnnMaximum)},
    {"Minimum", AclOpFunctions(aclnnMinimumGetWorkspaceSize, aclnnMinimum)},
    {"Add", AclOpFunctions(aclnnAddGetWorkspaceSize, aclnnAdd)},
    {"Sub", AclOpFunctions(aclnnSubGetWorkspaceSize, aclnnSub)},
    {"Mul", AclOpFunctions(aclnnMulGetWorkspaceSize, aclnnMul)},
    {"RealDiv", AclOpFunctions(aclnnDivGetWorkspaceSize, aclnnDiv)},
    {"FloorDiv", AclOpFunctions(aclnnFloorDivideGetWorkspaceSize, aclnnFloorDivide)},
    {"LessEqual", AclOpFunctions(aclnnLeTensorGetWorkspaceSize, aclnnLeTensor)},
    {"Less", AclOpFunctions(aclnnLtTensorGetWorkspaceSize, aclnnLtTensor)},
    {"GreaterEqual", AclOpFunctions(aclnnGeTensorGetWorkspaceSize, aclnnGeTensor)},
    {"Greater", AclOpFunctions(aclnnGtTensorGetWorkspaceSize, aclnnGtTensor)},
    {"Equal", AclOpFunctions(aclnnEqTensorGetWorkspaceSize, aclnnEqTensor)},
    {"NotEqual", AclOpFunctions(aclnnNeTensorGetWorkspaceSize, aclnnNeTensor)},
    {"LogicalAnd", AclOpFunctions(aclnnLogicalAndGetWorkspaceSize, aclnnLogicalAnd)},
    {"LogicalOr", AclOpFunctions(aclnnLogicalOrGetWorkspaceSize, aclnnLogicalOr)},
    {"LogicalXor", AclOpFunctions(aclnnLogicalXorGetWorkspaceSize, aclnnLogicalXor)},
    {"BitwiseAnd", AclOpFunctions(aclnnBitwiseAndTensorGetWorkspaceSize, aclnnBitwiseAndTensor)},
    {"BitwiseOr", AclOpFunctions(aclnnBitwiseOrTensorGetWorkspaceSize, aclnnBitwiseOrTensor)},
    {"BitwiseXor", AclOpFunctions(aclnnBitwiseXorTensorGetWorkspaceSize, aclnnBitwiseXorTensor)},
    {"Pow", AclOpFunctions(aclnnPowTensorTensorGetWorkspaceSize, aclnnPowTensorTensor)},
    {"Expand", AclOpFunctions(aclnnExpandGetWorkspaceSize, aclnnExpand)},
    {"MatMul", AclOpFunctions(aclnnMatmulGetWorkspaceSize, aclnnMatmul)},
    {"BatchMatMul", AclOpFunctions(aclnnBatchMatMulGetWorkspaceSize, aclnnBatchMatMul)},
    {"ReduceMax", AclOpFunctions(aclnnAmaxGetWorkspaceSize, aclnnAmax)},
    {"ReduceMin", AclOpFunctions(aclnnAminGetWorkspaceSize, aclnnAmin)},
    {"ReduceSum", AclOpFunctions(aclnnReduceSumGetWorkspaceSize, aclnnReduceSum)},
    {"Triu", AclOpFunctions(aclnnTriuGetWorkspaceSize, aclnnTriu)},
    {"Conv2d", AclOpFunctions(aclnnConvolutionGetWorkspaceSize, aclnnConvolution)},
    {"Conv2dBackward", AclOpFunctions(aclnnConvolutionBackwardGetWorkspaceSize, aclnnConvolutionBackward)},
    {"ReduceMean", AclOpFunctions(aclnnMeanGetWorkspaceSize, aclnnMean)},
    // {"ReduceProd", AclOpFunctions(aclnnProdDimGetWorkspaceSize, aclnnProdDim)},
    {"Select", AclOpFunctions(aclnnSWhereGetWorkspaceSize, aclnnSWhere)},
    {"RandomUniform", AclOpFunctions(aclnnInplaceUniformGetWorkspaceSize, aclnnInplaceUniform)},
    {"RandomNormal", AclOpFunctions(aclnnInplaceNormalGetWorkspaceSize, aclnnInplaceNormal)},
    {"Transpose", AclOpFunctions(aclnnPermuteGetWorkspaceSize, aclnnPermute)},
    {"Maxpool", AclOpFunctions(aclnnMaxPool2dWithIndicesGetWorkspaceSize, aclnnMaxPool2dWithIndices)},
    {"MaxpoolBackward", AclOpFunctions(aclnnMaxPool2dWithIndicesBackwardGetWorkspaceSize, aclnnMaxPool2dWithIndicesBackward)},
    {"Avgpool", AclOpFunctions(aclnnAvgPool2dGetWorkspaceSize, aclnnAvgPool2d)},
    {"AvgpoolBackward", AclOpFunctions(aclnnAvgPool2dBackwardGetWorkspaceSize, aclnnAvgPool2dBackward)},
    {"Flip", AclOpFunctions(aclnnFlipGetWorkspaceSize, aclnnFlip)},
    {"Concat", AclOpFunctions(aclnnCatGetWorkspaceSize, aclnnCat)},
    {"Gather", AclOpFunctions(aclnnGatherGetWorkspaceSize, aclnnGather)},
    {"Cumsum", AclOpFunctions(aclnnCumsumGetWorkspaceSize, aclnnCumsum)},
    {"Index", AclOpFunctions(aclnnIndexGetWorkspaceSize, aclnnIndex)},
    {"Scatter", AclOpFunctions(aclnnScatterGetWorkspaceSize, aclnnScatter)},
    {"Nonzero", AclOpFunctions(aclnnNonzeroGetWorkspaceSize, aclnnNonzero)},
    {"Where", AclOpFunctions(aclnnSWhereGetWorkspaceSize, aclnnSWhere)},
    {"StridedSliceAssignV2", AclOpFunctions(aclnnStridedSliceAssignV2GetWorkspaceSize, aclnnStridedSliceAssignV2)},
    {"SliceV2", AclOpFunctions(aclnnSliceV2GetWorkspaceSize, aclnnSliceV2)},
    {"IndexPutImpl", AclOpFunctions(aclnnIndexPutImplGetWorkspaceSize, aclnnIndexPutImpl)},
    {"IndexPutImplAccumulate", AclOpFunctions(aclnnIndexPutImplGetWorkspaceSize, aclnnIndexPutImpl)},
    {"Range", AclOpFunctions(aclnnRangeGetWorkspaceSize, aclnnRange)},
    {"ReLU", AclOpFunctions(aclnnReluGetWorkspaceSize, aclnnRelu)},
    {"LeakyReLU", AclOpFunctions(aclnnLeakyReluGetWorkspaceSize, aclnnLeakyRelu)},
    {"LeakyReLUBackward", AclOpFunctions(aclnnLeakyReluBackwardGetWorkspaceSize, aclnnLeakyReluBackward)},
    {"Dropout", AclOpFunctions(aclnnDropoutGetWorkspaceSize, aclnnDropout)},
    {"DropoutBackward", AclOpFunctions(aclnnDropoutBackwardGetWorkspaceSize, aclnnDropoutBackward)},
    {"SiLU", AclOpFunctions(aclnnSiluGetWorkspaceSize, aclnnSilu)},
    {"SiLUBackward", AclOpFunctions(aclnnSiluBackwardGetWorkspaceSize, aclnnSiluBackward)},
    {"SigmoidBackward", AclOpFunctions(aclnnSigmoidBackwardGetWorkspaceSize, aclnnSigmoidBackward)},
    {"Embedding", AclOpFunctions(aclnnEmbeddingGetWorkspaceSize, aclnnEmbedding)},
    {"EmbeddingBackward", AclOpFunctions(aclnnEmbeddingDenseBackwardGetWorkspaceSize, aclnnEmbeddingDenseBackward)},
    {"InplaceMaskedScatter", AclOpFunctions(aclnnInplaceMaskedScatterGetWorkspaceSize, aclnnInplaceMaskedScatter)},
    {"MaskedSelect", AclOpFunctions(aclnnMaskedSelectGetWorkspaceSize, aclnnMaskedSelect)},
    {"SplitWithSize", AclOpFunctions(aclnnSplitWithSizeGetWorkspaceSize, aclnnSplitWithSize)},
    {"Softmax", AclOpFunctions(aclnnSoftmaxGetWorkspaceSize, aclnnSoftmax)},
    {"SoftmaxBackward", AclOpFunctions(aclnnSoftmaxBackwardGetWorkspaceSize, aclnnSoftmaxBackward)},
    {"FlashAttention", AclOpFunctions(aclnnFlashAttentionScoreV2GetWorkspaceSize, aclnnFlashAttentionScoreV2)},
    {"FlashAttentionBackward", AclOpFunctions(aclnnFlashAttentionScoreGradV2GetWorkspaceSize, aclnnFlashAttentionScoreGradV2)},
    {"BatchNorm", AclOpFunctions(aclnnBatchNormGetWorkspaceSize, aclnnBatchNorm)},
    {"BatchNormBackward", AclOpFunctions(aclnnBatchNormBackwardGetWorkspaceSize, aclnnBatchNormBackward)},
    {"LayerNorm", AclOpFunctions(aclnnLayerNormGetWorkspaceSize, aclnnLayerNorm)},
    {"RotaryPosEmb", AclOpFunctions(aclnnApplyRotaryPosEmbGetWorkspaceSize, aclnnApplyRotaryPosEmb)},
|
||||
{"Stack", AclOpFunctions(aclnnStackGetWorkspaceSize, aclnnStack)},
|
||||
{"NanToNum", AclOpFunctions(aclnnNanToNumGetWorkspaceSize, aclnnNanToNum)},
|
||||
};
|
||||
|
||||
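// Each AclOpFunctions entry pairs an op's aclnnXxxGetWorkspaceSize query with
// its matching aclnnXxx execute call; BaseOpRunner::run() (see base_op.cc
// below) resolves a runner's `name` against this map and hands the iterator
// to executeOp().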
struct AclOpAttr
{
    virtual ~AclOpAttr() {}
};

struct ConvAttr : AclOpAttr
{
    vector<int64_t> convStrides;
    vector<int64_t> convPads;
    vector<int64_t> convOutPads;
    vector<int64_t> convDilations;
    bool convWithBias;
    bool is_transposed;
    int64_t group;

    // Destructor
    ~ConvAttr()
    {
        convStrides.clear();
        convPads.clear();
        convOutPads.clear();
        convDilations.clear();
    }
};

struct ReduceAttr : AclOpAttr
{
    vector<int64_t> axes;
    // for ProdDim
    int64_t prod_dim;
    bool keepdims;

    ~ReduceAttr()
    {
        axes.clear();
    }
};

struct RandomAttr : AclOpAttr
{
    int64_t seed, offset;

    ~RandomAttr() {}
};

struct TriuAttr : AclOpAttr
{
    int64_t diagonal;

    ~TriuAttr() {}
};

struct PoolAttr : AclOpAttr
{
    vector<int64_t> kernel_size;
    vector<int64_t> poolStrides;
    vector<int64_t> poolPads;
    vector<int64_t> poolDilations;
    bool poolCeil;
    bool countIncludePad;

    // divisorOverride (const int64_t, compute input): the divisor used when
    // averaging. Supported data type: INT64. The default value 0 disables the
    // feature.
    // https://www.hiascend.com/document/detail/zh/canncommercial/80RC2/apiref/appdevgapi/context/aclnnAvgPool2d.md
    int64_t divisorOverride = 0;

    // cubeMathType (int8_t, compute input): host-side integer selecting the
    // compute mode of the Cube unit (INT8). Unless noted otherwise,
    // computation keeps the original input dtype. Supported values:
    // 0: KEEP_DTYPE, compute in the input dtype. For FLOAT input this is not
    //    yet supported on Atlas training series and Atlas inference series
    //    (Ascend 310P) products and raises an error.
    // 1: ALLOW_FP32_DOWN_PRECISION, allow reduced-precision computation. For
    //    FLOAT input, Atlas training/inference series (Ascend 310P) products
    //    may compute in FLOAT16.
    // 2: USE_FP16, allow casting to FLOAT16; FLOAT input is computed as
    //    FLOAT16.
    // 3: USE_HF32, allow casting to HFLOAT32. For FLOAT input this is not yet
    //    supported on Atlas training series, Atlas inference series
    //    (Ascend 310P), or Atlas A2 training / Atlas 800I A2 inference
    //    products, and raises an error.
    // https://www.hiascend.com/document/detail/zh/canncommercial/80RC2/apiref/appdevgapi/context/aclnnAvgPool2d.md
    int8_t cubeMathType = 0;

    // Destructor
    ~PoolAttr()
    {
        kernel_size.clear();
        poolStrides.clear();
        poolPads.clear();
        poolDilations.clear();
    }
};

struct ConcatAttr : AclOpAttr
{
    int64_t tensorNum;
    int64_t dim;

    ~ConcatAttr() {}
};

struct GatherAttr : AclOpAttr
{
    int64_t dim;

    ~GatherAttr() {}
};

struct ScatterAttr : AclOpAttr
{
    int64_t axis;
    int64_t reduction;

    ~ScatterAttr() {}
};

struct StrideAttr : AclOpAttr
{
    vector<int64_t> begins;
    vector<int64_t> ends;
    vector<int64_t> steps;
    vector<int64_t> axes;

    ~StrideAttr()
    {
        begins.clear();
        ends.clear();
        steps.clear();
        axes.clear();
    }
};

struct RangeAttr : AclOpAttr
{
    int64_t start;
    int64_t end;
    int64_t step;

    ~RangeAttr() {}
};

struct LeakyReluAttr : AclOpAttr
{
    float negativeSlope;
    bool selfIsResult;

    ~LeakyReluAttr() {}
};

struct DropoutAttr : AclOpAttr
{
    float p;
    bool train;
    int64_t seed;
    int64_t offset;
    float scale;

    ~DropoutAttr() {}
};

struct EmbeddingAttr : AclOpAttr
{
    int64_t numEmbeddings;
    // int64_t embeddingDim;
    int64_t paddingIdx;
    bool scaleGradByFreq;
    // bool sparse;
    // bool isSparse;
    // bool isDense;

    ~EmbeddingAttr() {}
};

struct SplitWithSizeAttr : AclOpAttr
{
    vector<int64_t> splitSize;
    int64_t dim;

    ~SplitWithSizeAttr()
    {
        splitSize.clear();
    }
};

struct SoftmaxAttr : AclOpAttr
{
    int64_t dim;

    ~SoftmaxAttr() {}
};

struct BatchNormAttr : AclOpAttr
{
    bool is_train;
    float momentum;
    float eps;

    ~BatchNormAttr() {}
};

struct LayerNormAttr : AclOpAttr
{
    float eps;
    vector<int64_t> normalizedShape;
    int64_t size;

    ~LayerNormAttr()
    {
        normalizedShape.clear();
    }
};

struct FlashAttentionAttr : AclOpAttr
{
    vector<int64_t> prefix;
    vector<int64_t> qStartIdx;
    vector<int64_t> kvStartIdx;
    float scale;
    float keepProb;
    int64_t preToken;
    int64_t nextToken;
    int64_t headNum;
    string inputLayout;
    int64_t innerPrecise;
    int64_t sparseMode;
    int64_t psetype;
    bool hasRealshift;
    bool hasDropmask;
    bool hasPaddingmask;
    bool hasAttentmask;

    ~FlashAttentionAttr()
    {
        prefix.clear();
        qStartIdx.clear();
        kvStartIdx.clear();
    }
};

struct NanToNumAttr : AclOpAttr
{
    float nan;
    float posinf;
    float neginf;

    ~NanToNumAttr() {}
};
}
File diff suppressed because it is too large
@ -0,0 +1,58 @@
#include <iostream>
#include <vector>
#include "aclnn.h"

int64_t GetShapeSize(const std::vector<int64_t>& shape) {
    int64_t shapeSize = 1;
    for (auto i : shape) {
        shapeSize *= i;
    }
    return shapeSize;
}

// Copies a device buffer back to the host and prints it; assumes the buffer
// holds `int` elements.
void PrintOutResult(std::vector<int64_t> &shape, void** deviceAddr) {
    auto size = GetShapeSize(shape);
    std::vector<int> resultData(size, 0);
    auto ret = aclrtMemcpy(resultData.data(), resultData.size() * sizeof(resultData[0]),
                           *deviceAddr, size * sizeof(resultData[0]), ACL_MEMCPY_DEVICE_TO_HOST);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("copy result from device to host failed. ERROR: %d\n", ret); return);
    for (int64_t i = 0; i < size; i++) {
        LOG_PRINT("mean result[%ld] is: %d\n", i, resultData[i]);
    }
}

/*int Init(int32_t deviceId) {
    // Boilerplate AscendCL initialization
    auto ret = aclInit(nullptr);
    CHECK_RET(ret == ACL_SUCCESS or ret == 100002, LOG_PRINT("aclInit failed. ERROR: %d\n", ret); return ret);
    ret = aclrtSetDevice(deviceId);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret); return ret);
    //ret = aclrtCreateStream(stream);
    //CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtCreateStream failed. ERROR: %d\n", ret); return ret);
    return 0;
}*/

/*
template <typename T>
int CreateAclTensor(const std::vector<T>& hostData, const std::vector<int64_t>& shape, void** deviceAddr,
                    aclDataType dataType, aclTensor** tensor) {
    auto size = GetShapeSize(shape) * sizeof(T);
    // Allocate device-side memory with aclrtMalloc
    auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret); return ret);
    // Copy host-side data to device memory with aclrtMemcpy
    ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size, ACL_MEMCPY_HOST_TO_DEVICE);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMemcpy failed. ERROR: %d\n", ret); return ret);

    // Compute strides for a contiguous tensor
    std::vector<int64_t> strides(shape.size(), 1);
    for (int64_t i = shape.size() - 2; i >= 0; i--) {
        strides[i] = shape[i + 1] * strides[i + 1];
    }

    // Create the aclTensor via aclCreateTensor
    *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
                              shape.data(), shape.size(), *deviceAddr);
    return 0;
}*/
@ -0,0 +1,134 @@
#include <iostream>
#include <vector>
#include "acl.h"
// unary
#include "aclnnop/aclnn_abs.h"
#include "aclnnop/aclnn_neg.h"
#include "aclnnop/aclnn_exp.h"
#include "aclnnop/aclnn_log.h"
#include "aclnnop/aclnn_sqrt.h"
#include "aclnnop/aclnn_ceil.h"
#include "aclnnop/aclnn_floor.h"
#include "aclnnop/aclnn_round.h"
#include "aclnnop/aclnn_sin.h"
#include "aclnnop/aclnn_cos.h"
#include "aclnnop/aclnn_tan.h"
#include "aclnnop/aclnn_asin.h"
#include "aclnnop/aclnn_acos.h"
#include "aclnnop/aclnn_atan.h"
#include "aclnnop/aclnn_sinh.h"
#include "aclnnop/aclnn_cosh.h"
#include "aclnnop/aclnn_tanh.h"
#include "aclnnop/aclnn_asinh.h"
#include "aclnnop/aclnn_acosh.h"
#include "aclnnop/aclnn_atanh.h"
#include "aclnnop/aclnn_sigmoid.h"
#include "aclnnop/aclnn_erf.h"
#include "aclnnop/aclnn_erfinv.h"
#include "aclnnop/aclnn_logical_not.h"
#include "aclnnop/aclnn_bitwise_not.h"
#include "aclnnop/aclnn_cast.h"
#include "aclnnop/aclnn_nonzero.h"
// binary
#include "aclnnop/aclnn_maximum.h"
#include "aclnnop/aclnn_minimum.h"
#include "aclnnop/aclnn_add.h"
#include "aclnnop/aclnn_sub.h"
#include "aclnnop/aclnn_mul.h"
#include "aclnnop/aclnn_div.h"
#include "aclnnop/aclnn_floor_divide.h"
#include "aclnnop/aclnn_le_tensor.h"
#include "aclnnop/aclnn_lt_tensor.h"
#include "aclnnop/aclnn_ge_tensor.h"
#include "aclnnop/aclnn_gt_tensor.h"
#include "aclnnop/aclnn_eq_tensor.h"
#include "aclnnop/aclnn_ne_tensor.h"
#include "aclnnop/aclnn_logical_and.h"
#include "aclnnop/aclnn_logical_or.h"
#include "aclnnop/aclnn_logical_xor.h"
#include "aclnnop/aclnn_bitwise_and_tensor.h"
#include "aclnnop/aclnn_bitwise_or_tensor.h"
#include "aclnnop/aclnn_bitwise_xor_tensor.h"
#include "aclnnop/aclnn_pow_tensor_tensor.h"
#include "aclnnop/aclnn_expand.h"
#include "aclnnop/aclnn_matmul.h"
#include "aclnnop/aclnn_batch_matmul.h"
#include "aclnnop/aclnn_convolution.h"
#include "aclnnop/aclnn_convolution_backward.h"
#include "aclnnop/aclnn_reduce_sum.h"
#include "aclnnop/aclnn_amax.h"
#include "aclnnop/aclnn_amin.h"
#include "aclnnop/aclnn_mean.h"
#include "aclnnop/aclnn_prod.h"
#include "aclnnop/aclnn_triu.h"
#include "aclnnop/aclnn_s_where.h"
#include "aclnnop/aclnn_random.h"
#include "aclnnop/aclnn_normal.h"
#include "aclnnop/aclnn_permute.h"
#include "aclnnop/aclnn_max_pool2d_with_indices.h"
#include "aclnnop/aclnn_max_pool2d_with_indices_backward.h"
#include "aclnnop/aclnn_avgpool2d.h"
#include "aclnnop/aclnn_avgpool2d_backward.h"
#include "aclnnop/aclnn_flip.h"
#include "aclnnop/aclnn_cat.h"
#include "aclnnop/aclnn_gather.h"
#include "aclnnop/aclnn_cumsum.h"
#include "aclnnop/aclnn_index.h"
#include "aclnnop/aclnn_scatter.h"
#include "aclnnop/aclnn_strided_slice_assign_v2.h"
#include "aclnnop/aclnn_slice_v2.h"
#include "aclnnop/aclnn_index_put_impl.h"
#include "aclnnop/aclnn_range.h"
#include "aclnnop/aclnn_relu.h"
#include "aclnnop/aclnn_dropout.h"
#include "aclnnop/aclnn_dropout_backward.h"
#include "aclnnop/aclnn_leaky_relu.h"
#include "aclnnop/aclnn_leaky_relu_backward.h"
#include "aclnnop/aclnn_uniform.h"
#include "aclnnop/aclnn_silu.h"
#include "aclnnop/aclnn_silu_backward.h"
#include "aclnnop/aclnn_sigmoid_backward.h"
#include "aclnnop/aclnn_embedding.h"
#include "aclnnop/aclnn_embedding_dense_backward.h"
#include "aclnnop/aclnn_masked_scatter.h"
#include "aclnnop/aclnn_masked_select.h"
#include "aclnnop/aclnn_split_with_size.h"
#include "aclnnop/aclnn_flash_attention_score.h"
#include "aclnnop/aclnn_flash_attention_score_grad.h"
#include "aclnnop/aclnn_softmax.h"
#include "aclnnop/aclnn_softmax_backward.h"
#include "aclnnop/aclnn_batch_norm.h"
#include "aclnnop/aclnn_batch_norm_backward.h"
#include "aclnnop/aclnn_layer_norm.h"
#include "aclnnop/aclnn_apply_rotary_pos_emb.h"
#include "aclnnop/aclnn_stack.h"
#include "aclnnop/aclnn_nan_to_num.h"

#define CHECK_RET(cond, return_expr) \
    do                               \
    {                                \
        if (!(cond))                 \
        {                            \
            return_expr;             \
        }                            \
    } while (0)

#define LOG_PRINT(message, ...)         \
    do                                  \
    {                                   \
        printf(message, ##__VA_ARGS__); \
    } while (0)
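// Illustrative usage (not part of the source): the second argument of
// CHECK_RET may be any statement sequence, typically a log plus an early
// return, e.g.
//   ret = aclrtMalloc(&addr, size, ACL_MEM_MALLOC_HUGE_FIRST);
//   CHECK_RET(ret == ACL_SUCCESS,
//             LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret); return ret);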
int64_t GetShapeSize(const std::vector<int64_t> &shape);

void PrintOutResult(std::vector<int64_t> &shape, void **deviceAddr);

//int Init(int32_t deviceId);

/*
template <typename T>
int CreateAclTensor(const std::vector<T>& hostData, const std::vector<int64_t>& shape, void** deviceAddr,
                    aclDataType dataType, aclTensor** tensor);
*/
@ -0,0 +1,33 @@
#pragma once
#include <acl/aclops/binary_op_acl.h>
#include <acl/aclops/unary_op_acl.h>
#include <acl/aclops/conv_op_acl.h>
#include <acl/aclops/ternary_op_acl.h>
#include <acl/aclops/reduce_op_acl.h>
#include <acl/aclops/expand_op_acl.h>
#include <acl/aclops/getitem_op_acl.h>
#include <acl/aclops/setitem_op_acl.h>
#include <acl/aclops/matmul_op_acl.h>
#include <acl/aclops/random_op_acl.h>
#include <acl/aclops/bmm_op_acl.h>
#include <acl/aclops/pool_op_acl.h>
#include <acl/aclops/flip_op_acl.h>
#include <acl/aclops/concat_op_acl.h>
#include <acl/aclops/gather_scatter_op_acl.h>
#include <acl/aclops/cumsum_op_acl.h>
#include <acl/aclops/index_op_acl.h>
#include <acl/aclops/where_op_acl.h>
#include <acl/aclops/floor_op_acl.h>
#include <acl/aclops/transpose_op_acl.h>
#include <acl/aclops/flashattention_op_acl.h>
#include <acl/aclops/relu_op_acl.h>
#include <acl/aclops/dropout_op_acl.h>
#include <acl/aclops/silu_op_acl.h>
#include <acl/aclops/sigmoid_op_acl.h>
#include <acl/aclops/softmax_op_acl.h>
#include <acl/aclops/stack_op_acl.h>
#include <acl/aclops/nantonum_op_acl.h>
#include <acl/aclops/rope_op_acl.h>
#include <acl/aclops/triu_op_acl.h>
#include <acl/aclops/embedding_op_acl.h>
#include <acl/aclops/norms_op_acl.h>
@ -0,0 +1,56 @@
#pragma once
#include "utils.h"
#include "acl_jittor.h"

namespace jittor
{
    extern int sync_run;
    class BaseOpRunner
    {
    protected:
        vector<Var *> in_;
        vector<Var *> out_;

        int ret = -1;
        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;
        bool is_group_op = false;

        std::vector<std::vector<int64_t>> inputShapes;
        std::vector<std::vector<int64_t>> outputShapes;

        std::vector<aclTensor *> inputTensors;
        std::vector<aclTensor *> outputTensors;

    public:
        string name;
        string jt_name;
        std::unique_ptr<AclOpAttr> op_attr;
        bool use_nchw = false;

        BaseOpRunner(const string &name = "") : name(name) {}
        virtual ~BaseOpRunner() = default;

        // Common functionality for adding input/output variables
        void add(Var *v, bool is_input);

        virtual void setupInputDesc();

        void cleanupDesc();

        virtual void setupOutputDesc();

        virtual void syncRun();

        void checkRet(aclnnStatus ret);

        // Base run method with common operator lookup logic
        void run();

    protected:
        // Virtual method for specific operator execution
        virtual void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) = 0;
        void cleanupAttr();
    };

}
@ -0,0 +1,152 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "binary_op_acl.h"
#include "base_op.h"

namespace jittor
{
    extern int sync_run;
    // Common functionality for adding input/output variables
    void BaseOpRunner::add(Var *v, bool is_input)
    {
        if (is_input)
        {
            in_.push_back(v);
        }
        else
        {
            out_.push_back(v);
        }
        return;
    }

    void BaseOpRunner::setupInputDesc()
    {
        auto input_num = in_.size();
        for (int input_idx = 0; input_idx < input_num; input_idx++)
        {
            std::vector<int64_t> shape;
            for (int j = 0; j < in_[input_idx]->shape.size(); j++)
            {
                shape.push_back(in_[input_idx]->shape[j]);
            }
            inputShapes.push_back(shape);
        }

        for (int idx = 0; idx < input_num; idx++)
        {
            inputTensors.push_back(nullptr);
            auto ret = CreateAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
            CHECK_RET(ret == ACL_SUCCESS, return);
        }
    }

    void BaseOpRunner::cleanupDesc()
    {
        auto input_num = in_.size();
        auto output_num = out_.size();
        for (int idx = 0; idx < input_num; idx++)
        {
            aclDestroyTensor(inputTensors[idx]);
        }
        for (int idx = 0; idx < output_num; idx++)
        {
            aclDestroyTensor(outputTensors[idx]);
        }
    }

    void BaseOpRunner::setupOutputDesc()
    {
        auto output_num = out_.size();

        for (int output_idx = 0; output_idx < output_num; output_idx++)
        {
            std::vector<int64_t> shape;
            for (int j = 0; j < out_[output_idx]->shape.size(); j++)
            {
                shape.push_back(out_[output_idx]->shape[j]);
            }
            outputShapes.push_back(shape);
        }

        for (int idx = 0; idx < output_num; idx++)
        {
            outputTensors.push_back(nullptr);
            auto ret = CreateAclTensor(outputShapes[idx], out_[idx]->mem_ptr, out_[idx]->size, get_dtype(out_[idx]->dtype()), &outputTensors[idx], use_nchw);
            CHECK_RET(ret == ACL_SUCCESS, return);
        }
    }

    void BaseOpRunner::syncRun()
    {
        if (sync_run)
        {
            // ret = aclrtSynchronizeStream(aclstream);
            // CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclrtSynchronizeStream failed. ERROR: %d\n", name.c_str(), ret); return);
        }
    }

    void BaseOpRunner::checkRet(aclnnStatus ret)
    {
        if (ret != ACL_SUCCESS)
        {
            auto tmp_err_msg = aclGetRecentErrMsg();
            LOGir << name << ", " << tmp_err_msg;
        }

        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnxxxGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
    }
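    // Every aclnn op follows the same two-phase protocol: aclnnXxxGetWorkspaceSize()
    // validates the arguments and reports the scratch memory needed, then aclnnXxx()
    // launches the kernel on the stream with that workspace. run() wraps a
    // subclass's executeOp(), which performs both phases, with the shared tensor
    // setup and teardown.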
    // Base run method with common operator lookup logic
    void BaseOpRunner::run()
    {
        auto it = aclOpFuncMap.find(name);
        if (is_group_op && it == aclOpFuncMap.end())
        {
            LOGir << "aclOpFuncMap Not supported op: " << name;
            throw std::runtime_error("Unsupported operation type.");
        }
        // Non-group runners (e.g. BatchMatMulOpRunner) call the aclnn API
        // directly in executeOp and may ignore `it`, so the lookup is not
        // checked for them.
        setupInputDesc();
        setupOutputDesc();
        executeOp(it);
        cleanupDesc();
    }

}
@ -0,0 +1,124 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "binary_op_acl.h"

namespace jittor
{
    BinaryOpRunner::BinaryOpRunner() : BaseOpRunner("binary")
    {
        use_nchw = false;
        is_group_op = true;
    }

    void BinaryOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        aclScalar *alpha = nullptr;

        // aclnnAdd computes self + alpha * other and aclnnSub computes
        // self - alpha * other; alpha is fixed to 1 and must be created in
        // the input dtype.
        if (name == string("Add") || name == string("Sub"))
        {
            if (get_dtype(in_[0]->dtype()) == ACL_FLOAT)
            {
                float alphaValue = 1.0;
                alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
            }
            else if (get_dtype(in_[0]->dtype()) == ACL_FLOAT16)
            {
                __fp16 alphaValue = 1.0;
                alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
            }
            else if (get_dtype(in_[0]->dtype()) == ACL_INT64)
            {
                int64_t alphaValue = 1;
                alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
            }
            else if (get_dtype(in_[0]->dtype()) == ACL_INT32)
            {
                int alphaValue = 1;
                alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
            }
            else if (get_dtype(in_[0]->dtype()) == ACL_INT8)
            {
                int8_t alphaValue = 1;
                alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
            }
            else if (get_dtype(in_[0]->dtype()) == ACL_INT16)
            {
                int16_t alphaValue = 1;
                alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
            }
            else if (get_dtype(in_[0]->dtype()) == ACL_UINT8)
            {
                uint8_t alphaValue = 1;
                alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
            }
            else if (get_dtype(in_[0]->dtype()) == ACL_UINT16)
            {
                uint16_t alphaValue = 1;
                alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
            }
            else if (get_dtype(in_[0]->dtype()) == ACL_UINT32)
            {
                uint32_t alphaValue = 1;
                alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
            }
            else if (get_dtype(in_[0]->dtype()) == ACL_BOOL)
            {
                bool alphaValue = true;
                alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
            }
            else
            {
                LOGf << "Not supported dtype: " << in_[0]->dtype();
            }

            CHECK_RET(alpha != nullptr, return);
            ret = it->second.getWorkspaceSizeFuncAdd(inputTensors[0], inputTensors[1], alpha, outputTensors[0], &workspaceSize, &executor);
        }
        else
        {
            ret = it->second.getWorkspaceSizeFuncBinary(inputTensors[0], inputTensors[1], outputTensors[0], &workspaceSize, &executor);
        }

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = it->second.executeFunc(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnxxx failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyScalar(alpha);
        return;
    }
}
@ -0,0 +1,14 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    struct BinaryOpRunner : public BaseOpRunner
    {
        BinaryOpRunner();

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
    };
}
@ -0,0 +1,128 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def acl_cmd(name: str,
            inputs: list,
            output_dtypes: list = None,
            output_shapes: list = None,
            attr_code: str = "",
            attr_header: str = "",
            outputs: list = None,
            extra_data: dict = {}):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    # NOTE: the runner below is hard-coded to BatchMatMulOpRunner, so despite
    # its generic signature this helper only serves the bmm ops in this file.
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    BatchMatMulOpRunner op;
    {input_code}
    op.add(out0, false);
    {attr_code}
    op.run();""",
                   data=extra_data)
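# Illustrative sketch (not part of the source): a call such as
#   acl_cmd("BatchMatMul", [x1, x2], output_dtypes=[x1.dtype],
#           output_shapes=[out_shape], attr_code='op.jt_name="bmm";')
# emits a jt.code op whose cuda_src instantiates BatchMatMulOpRunner,
# registers in0/in1 as inputs and out0 as the output, applies attr_code,
# then calls op.run().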
class BmmACL(jt.Function):

    def __init__(self, trans_x2=False):
        super(BmmACL, self).__init__()
        self.trans_x2 = trans_x2

    def execute(self, x1, x2):
        self.input = [x1, x2]
        result = acl_cmd(
            "BatchMatMul", [x1, x2],
            output_dtypes=[x1.dtype],
            output_shapes=[
                x1.shape[:-1] + x2.shape[-2:-1]
                if self.trans_x2 else x1.shape[:-1] + x2.shape[-1:]
            ],
            attr_code="op.jt_name=\"bmm_trans_1\";"
            if self.trans_x2 else "op.jt_name=\"bmm\";")[0]
        return result

    def grad(self, grad_output):
        x1, x2 = self.input
        # Flatten the batch dimensions of grad_x2 when the operands differ in
        # rank (e.g. a batched input against an unbatched weight).
        reshape_grad_x2 = len(x1.shape) != len(x2.shape)
        grad_x1 = acl_cmd(
            "BatchMatMul", [grad_output, x2],
            output_dtypes=[x1.dtype],
            output_shapes=[
                grad_output.shape[:-1] + x2.shape[-2:-1]
                if not self.trans_x2 else grad_output.shape[:-1] + x1.shape[-1:]
            ],
            attr_code="op.jt_name=\"bmm_trans_1\";"
            if not self.trans_x2 else "op.jt_name=\"bmm\";")[0]
        if self.trans_x2:
            if reshape_grad_x2:
                output_shape = grad_output.shape[1:-2] + grad_output.shape[
                    -1:] + x1.shape[-1:]
                grad_x2 = acl_cmd("BatchMatMul", [
                    grad_output.reshape(-1, grad_output.shape[-1]),
                    x1.reshape(-1, x1.shape[-1])
                ],
                                  output_dtypes=[x2.dtype],
                                  output_shapes=[output_shape],
                                  attr_code="op.jt_name=\"bmm_trans_0\";")[0]
            else:
                output_shape = grad_output.shape[:-2] + grad_output.shape[
                    -1:] + x1.shape[-1:]
                grad_x2 = acl_cmd("BatchMatMul", [grad_output, x1],
                                  output_dtypes=[x2.dtype],
                                  output_shapes=[output_shape],
                                  attr_code="op.jt_name=\"bmm_trans_0\";")[0]
        else:
            if reshape_grad_x2:
                output_shape = x1.shape[1:-2] + x1.shape[
                    -1:] + grad_output.shape[-1:]
                grad_x2 = acl_cmd("BatchMatMul", [
                    x1.reshape(-1, x1.shape[-1]),
                    grad_output.reshape(-1, grad_output.shape[-1])
                ],
                                  output_dtypes=[x2.dtype],
                                  output_shapes=[output_shape],
                                  attr_code="op.jt_name=\"bmm_trans_0\";")[0]
            else:
                output_shape = x1.shape[:-2] + x1.shape[
                    -1:] + grad_output.shape[-1:]
                grad_x2 = acl_cmd("BatchMatMul", [x1, grad_output],
                                  output_dtypes=[x2.dtype],
                                  output_shapes=[output_shape],
                                  attr_code="op.jt_name=\"bmm_trans_0\";")[0]
        if len(grad_x1.shape) > len(x1.shape):
            grad_x1 = grad_x1.sum(0)
        if len(grad_x2.shape) > len(x2.shape):
            grad_x2 = grad_x2.sum(0)
        return grad_x1, grad_x2
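# Hedged usage sketch (illustrative shapes only):
#   import jittor as jt
#   x1 = jt.rand(8, 4, 5)
#   x2 = jt.rand(8, 5, 6)
#   y = BmmACL()(x1, x2)                               # y.shape == [8, 4, 6]
#   y_t = BmmACL(trans_x2=True)(x1, jt.rand(8, 6, 5))  # multiplies by x2^T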
@ -0,0 +1,77 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "bmm_op_acl.h"

namespace jittor
{
    BatchMatMulOpRunner::BatchMatMulOpRunner() : BaseOpRunner("BatchMatMul")
    {
    }
    void BatchMatMulOpRunner::setupInputDesc()
    {
        auto input_num = in_.size();
        for (int input_idx = 0; input_idx < input_num; input_idx++)
        {
            std::vector<int64_t> shape;
            for (int j = 0; j < in_[input_idx]->shape.size(); j++)
            {
                shape.push_back(in_[input_idx]->shape[j]);
            }
            inputShapes.push_back(shape);
        }
        for (int idx = 0; idx < input_num; idx++)
        {
            inputTensors.push_back(nullptr);
            // For the "transposed" variants, build a tensor whose strides present
            // a transposed view of the buffer instead of materializing a copy.
            if ((jt_name == "bmm_trans_1" && idx == 1) || (jt_name == "bmm_trans_0" && idx == 0))
            {
                auto ret = CreateFakeTransAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
                CHECK_RET(ret == ACL_SUCCESS, return);
            }
            else
            {
                auto ret = CreateAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
                CHECK_RET(ret == ACL_SUCCESS, return);
            }
        }
    }
    void BatchMatMulOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        // cubeMathType = 1 (ALLOW_FP32_DOWN_PRECISION)
        ret = aclnnBatchMatMulGetWorkspaceSize(inputTensors[0], inputTensors[1], outputTensors[0], 1, &workspaceSize, &executor);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnBatchMatMulGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }
        ret = aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnBatchMatMul failed. ERROR: %d\n", name.c_str(), ret); return);
        syncRun();
    }
}
@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class BatchMatMulOpRunner : public BaseOpRunner
    {

    protected:
        void setupInputDesc() override;
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        BatchMatMulOpRunner();
    };
}
@ -0,0 +1,186 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def concat_cmd(name: str,
               inputs: list,
               output_dtypes: list = None,
               output_shapes: list = None,
               attr_code: str = "",
               attr_header: str = "",
               outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")
class ConcatACL(jt.Function):

    def __init__(self):
        super(ConcatACL, self).__init__()

    def __call__(self, *args):
        assert isinstance(args[0], (list, tuple))
        assert isinstance(args[1], int)
        if jt.flags.no_grad:
            return self.execute(*args)
        backup = args
        args = list(args)
        taped_inputs = []
        taped_outputs = []
        input_mask = [-1] * (len(args[0]) + 1)
        newargs = [list(), args[1]]
        for i, v in enumerate(args[0]):
            if isinstance(v, jt.Var):
                if v.is_stop_grad():
                    # -2 in input_mask represents it is stop_grad
                    input_mask[i] = -2
                    newargs[0].append(v)
                    continue
                v = v.tape()
                newargs[0].append(v)
                input_mask[i] = len(taped_inputs)
                taped_inputs.append(v)

        ori_res = self.execute(*newargs)
        if not isinstance(ori_res, Sequence):
            res = [ori_res]
        else:
            res = list(ori_res)
        output_mask = [-1] * len(res)
        for i, v in enumerate(res):
            if isinstance(v, jt.Var):
                v = v.tape()
                output_mask[i] = len(taped_outputs)
                res[i] = v
                taped_outputs.append(v)
        self.input_mask = input_mask
        self.output_mask = output_mask
        # tape output and input together so
        # backward treat them as one operator
        jt.tape_together(taped_inputs, taped_outputs, self._grad)
        if isinstance(ori_res, Sequence):
            return res
        else:
            return res[0]

    def execute(self, input_tensors, dim=0):
        for t in input_tensors:
            if not (-t.ndim <= dim < t.ndim):
                print(t.shape, dim)
                raise ValueError("dim out of range")

        if dim < 0:
            dim += input_tensors[0].ndim

        self.input = input_tensors
        self.dim = dim
        for i in range(len(input_tensors)):
            if input_tensors[i].dtype != input_tensors[0].dtype:
                raise ValueError("All input tensors must have the same dtype")
            if input_tensors[i].shape[:dim] != input_tensors[0].shape[:dim] \
                    or input_tensors[i].shape[dim + 1:] != input_tensors[0].shape[dim + 1:]:
                raise ValueError("All input tensors must have the same shape")
        attr_code = f"""
        op.jt_name = "concat";
        ConcatAttr *attr = new ConcatAttr();
        attr->tensorNum = {len(input_tensors)};
        attr->dim = {dim};
        op.op_attr.reset(attr);
        """
        result = concat_cmd(
            "Concat",
            input_tensors,
            output_dtypes=[input_tensors[0].dtype],
            output_shapes=[
                jt.empty(self.calculate_output_shape(input_tensors, dim)).shape
            ],
            attr_code=attr_code)[0]
        return result

    def _grad(self, *args):
        new_args = ((args[i] if i >= 0 else None) for i in self.output_mask)
        ret = self.grad(*new_args)
        new_ret = []
        for i, r in enumerate(ret):
            j = self.input_mask[i]
            if j < 0:
                # -2 in input_mask represents it is stop_grad
                assert r is None or j==-2, f"{type(self)}'s {i}-th returned grad should be None, "\
                    "because the input value is not jittor variable."
            else:
                new_ret.append(r)
        return new_ret

    def grad(self, grad_output):
        grad_inputs = self.split_grad(grad_output, self.input, self.dim)
        return grad_inputs

    def calculate_output_shape(self, input_tensors, axis):
        shape = list(input_tensors[0].shape)
        for tensor in input_tensors[1:]:
            shape[axis] += tensor.shape[axis]
        return tuple(shape)

    def split_grad(self, grad_output, input_tensors, axis):
        offset = []
        shapeVec = []
        dtypeVec = []
        for tensor in input_tensors:
            offset.append(tensor.shape[axis])
            dtypeVec.append(tensor.dtype)
            shapeVec.append(tensor.shape)

        attr_code = f"""
        op.jt_name = "splitwithsize";
        auto *attr = new SplitWithSizeAttr();
        attr->splitSize = {{ {", ".join(map(str, offset))} }};
        attr->dim = {axis};
        op.op_attr.reset(attr);
        """

        result = concat_cmd("SplitWithSize", [grad_output],
                            output_dtypes=dtypeVec,
                            output_shapes=shapeVec,
                            attr_code=attr_code)
        return result
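# Hedged usage sketch (illustrative shapes only):
#   import jittor as jt
#   a = jt.rand(2, 3)
#   b = jt.rand(4, 3)
#   out = ConcatACL()([a, b], 0)   # out.shape == [6, 3]
# The backward pass splits grad_output back into per-input slices with
# SplitWithSize, mirroring the sizes concatenated in the forward pass.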
@ -0,0 +1,89 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "concat_op_acl.h"

namespace jittor
{
    ConcatOpRunner::ConcatOpRunner() : BaseOpRunner("Concat")
    {
    }

    void ConcatOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto input_num = in_.size();
        std::vector<aclTensor *> concatTensorList = {};
        for (int i = 0; i < input_num; i++)
        {
            concatTensorList.push_back(inputTensors[i]);
        }
        auto concatTensorListInput = aclCreateTensorList(&concatTensorList[0], input_num);
        auto attr = dynamic_cast<ConcatAttr *>(op_attr.get());
        ret = aclnnCatGetWorkspaceSize(concatTensorListInput, attr->dim, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnCat(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnCat failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

    SplitWithSizeOpRunner::SplitWithSizeOpRunner() : BaseOpRunner("SplitWithSize")
    {
    }

    void SplitWithSizeOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto output_num = out_.size();
        auto attr = dynamic_cast<SplitWithSizeAttr *>(op_attr.get());
        auto splitSize = aclCreateIntArray(attr->splitSize.data(), attr->splitSize.size());
        auto tensorList = aclCreateTensorList(&outputTensors[0], output_num);
        ret = aclnnSplitWithSizeGetWorkspaceSize(inputTensors[0], splitSize, attr->dim, tensorList, &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnSplitWithSize(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSplitWithSize failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

}
@ -0,0 +1,26 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class ConcatOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        ConcatOpRunner();
    };

    class SplitWithSizeOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        SplitWithSizeOpRunner();
    };
}
@ -0,0 +1,160 @@
import os
import jittor_utils
from jittor_utils import env_or_try_find
import ctypes
import glob
import jittor as jt
import jittor.compiler as compiler
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def _ntuple(n):

    def parse(x):
        if isinstance(x, Iterable):
            return x
        return tuple([x] * n)

    return parse


_pair = _ntuple(2)


def conv_cmd(name: str,
             inputs: list,
             output_dtypes: list = None,
             output_shapes: list = None,
             attr_code: str = "",
             attr_header: str = "",
             outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")
class ConvACL(jt.Function):

    def execute(self,
                x,
                weight,
                bias=None,
                stride=1,
                padding=0,
                dilation=1,
                groups=1):
        self.input = x
        self.weight = weight
        self.bias = bias
        padding = _pair(padding)
        stride = _pair(stride)
        dilation = _pair(dilation)
        out_channels = weight.shape[0]
        if groups <= 0:
            raise ValueError("groups must be a positive integer")
        self.padding = padding
        self.stride = stride
        self.dilation = dilation
        self.groups = groups
        attr_code = f"""
        op.jt_name = "conv2d";
        ConvAttr *attr = new ConvAttr();
        attr->convStrides = {{ {stride[0]}, {stride[1]} }};
        attr->convPads = {{ {padding[0]}, {padding[1]} }};
        attr->convDilations = {{ {dilation[0]}, {dilation[1]} }};
        attr->group = {groups};
        attr->convOutPads = {{1, 1}};
        op.op_attr.reset(attr);
        """
        input_height, input_width = x.shape[-2:]
        kernel_height, kernel_width = weight.shape[-2:]

        output_height = (input_height + 2 * padding[0] - dilation[0] *
                         (kernel_height - 1) - 1) // stride[0] + 1
        output_width = (input_width + 2 * padding[1] - dilation[1] *
                        (kernel_width - 1) - 1) // stride[1] + 1

        output_shape = (x.shape[0], out_channels, output_height, output_width)

        inputs = [x, weight]
        if bias is not None:
            inputs.append(bias)
        result = conv_cmd(
            "Conv2d",
            inputs,
            output_dtypes=[x.dtype],
            output_shapes=[output_shape],
            attr_code=attr_code,
        )[0]
        return result

    def grad(self, grad_output):
        x = self.input
        weight = self.weight
        bias = self.bias
        inputs = [grad_output, x, weight]
        if bias is not None:
            inputs.append(bias)
        output_shapes = [x.shape, weight.shape]
        output_dtypes = [x.dtype, weight.dtype]
        if bias is not None:
            output_shapes.append(bias.shape)
            output_dtypes.append(bias.dtype)
        else:
            # A bias-grad slot is still allocated; the C++ side masks it off
            # via outputMask when no bias input is present.
            output_shapes.append([weight.shape[0]])
            output_dtypes.append(x.dtype)
        padding = self.padding
        stride = self.stride
        dilation = self.dilation
        groups = self.groups
        attr_code = f"""
        op.jt_name = "conv2dbackward";
        ConvAttr *attr = new ConvAttr();
        attr->convStrides = {{ {stride[0]}, {stride[1]} }};
        attr->convPads = {{ {padding[0]}, {padding[1]} }};
        attr->convDilations = {{ {dilation[0]}, {dilation[1]} }};
        attr->group = {groups};
        attr->convOutPads = {{1, 1}};
        op.op_attr.reset(attr);
        """
        results = conv_cmd("Conv2dBackward",
                           inputs,
                           output_dtypes=output_dtypes,
                           output_shapes=output_shapes,
                           attr_code=attr_code)
        if self.bias is None:
            return results[0], results[1]

        return results
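# Hedged usage sketch with the output-shape arithmetic worked out. For
# H = W = 32, kernel 3x3, stride 1, padding 1, dilation 1:
#   out = (32 + 2*1 - 1*(3 - 1) - 1) // 1 + 1 = 32, so spatial size is kept.
#   import jittor as jt
#   x = jt.rand(2, 3, 32, 32)
#   w = jt.rand(16, 3, 3, 3)
#   y = ConvACL()(x, w, None, 1, 1)   # y.shape == [2, 16, 32, 32]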
@ -0,0 +1,152 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "conv_op_acl.h"

namespace jittor
{
    Conv2dOpRunner::Conv2dOpRunner() : BaseOpRunner("Conv2d")
    {
        use_nchw = true;
    }

    void Conv2dOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        // for conv
        aclIntArray *strides = nullptr;
        aclIntArray *pads = nullptr;
        aclIntArray *outPads = nullptr;
        aclIntArray *dilations = nullptr;
        auto attr = dynamic_cast<ConvAttr *>(op_attr.get());
        strides = aclCreateIntArray(attr->convStrides.data(), 2);
        pads = aclCreateIntArray(attr->convPads.data(), 2);
        outPads = aclCreateIntArray(attr->convOutPads.data(), 2);
        dilations = aclCreateIntArray(attr->convDilations.data(), 2);

        aclTensor *bias = nullptr;

        auto input_num = in_.size();
        if (input_num == 3)
            bias = inputTensors[2];

        // transposed = false, cubeMathType = 0 (KEEP_DTYPE)
        ret = aclnnConvolutionGetWorkspaceSize(inputTensors[0], inputTensors[1], bias, strides, pads, dilations, false, outPads, attr->group, outputTensors[0], 0, &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnConvolution(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnConvolution failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyIntArray(strides);
        aclDestroyIntArray(pads);
        aclDestroyIntArray(outPads);
        aclDestroyIntArray(dilations);
        return;
    }

    Conv2dBackwardOpRunner::Conv2dBackwardOpRunner() : BaseOpRunner("Conv2dBackward")
    {
        use_nchw = true;
    }

    void Conv2dBackwardOpRunner::setupOutputDesc()
    {
        auto output_num = out_.size();

        for (int output_idx = 0; output_idx < output_num; output_idx++)
        {
            std::vector<int64_t> shape;
            for (int j = 0; j < out_[output_idx]->shape.size(); j++)
            {
                shape.push_back(out_[output_idx]->shape[j]);
            }
            outputShapes.push_back(shape);
        }

        for (int idx = 0; idx < 2; idx++)
        {
            outputTensors.push_back(nullptr);
            auto ret = CreateAclTensor(outputShapes[idx], out_[idx]->mem_ptr, out_[idx]->size, get_dtype(out_[idx]->dtype()), &outputTensors[idx], use_nchw);
            CHECK_RET(ret == ACL_SUCCESS, return);
        }
        // bias grad uses ND format
        {
            outputTensors.push_back(nullptr);
            auto ret = CreateAclTensor(outputShapes[2], out_[2]->mem_ptr, out_[2]->size, get_dtype(out_[2]->dtype()), &outputTensors[2], false);
            CHECK_RET(ret == ACL_SUCCESS, return);
        }
    }

    void Conv2dBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        // for conv
        aclIntArray *strides = nullptr;
        aclIntArray *pads = nullptr;
        aclIntArray *outPads = nullptr;
        aclIntArray *dilations = nullptr;
        auto attr = dynamic_cast<ConvAttr *>(op_attr.get());
        strides = aclCreateIntArray(attr->convStrides.data(), 2);
        pads = aclCreateIntArray(attr->convPads.data(), 2);
        outPads = aclCreateIntArray(attr->convOutPads.data(), 2);
        dilations = aclCreateIntArray(attr->convDilations.data(), 2);
        bool outputMask[3] = {true, true, true};
        auto input_num = in_.size();
        // Inputs are [grad_output, x, weight] plus an optional bias; with only
        // 3 inputs there is no bias, so its gradient is masked off.
        if (input_num == 3)
        {
            outputMask[2] = false;
        }
        aclBoolArray *outMask = aclCreateBoolArray(outputMask, 3);
        auto biasSizes = aclCreateIntArray(&outputShapes[2][0], outputShapes[2].size());
        // transposed = false, cubeMathType = 0 (KEEP_DTYPE)
        ret = aclnnConvolutionBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], biasSizes, strides, pads, dilations, false, outPads, attr->group, outMask, 0, outputTensors[0], outputTensors[1], outputTensors[2], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnConvolutionBackward(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnConvolutionBackward failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyIntArray(strides);
        aclDestroyIntArray(pads);
        aclDestroyIntArray(outPads);
        aclDestroyIntArray(dilations);
        return;
    }
}
@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class Conv2dOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        Conv2dOpRunner();
    };

    class Conv2dBackwardOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
        void setupOutputDesc() override;

    public:
        Conv2dBackwardOpRunner();
    };
}
@ -0,0 +1,101 @@
|
|||
import os
|
||||
from jittor_utils import env_or_try_find
|
||||
import jittor_utils
|
||||
import ctypes
|
||||
import glob
|
||||
import jittor.compiler as compiler
|
||||
import jittor as jt
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
from typing import Union
|
||||
from collections.abc import Sequence, Iterable
|
||||
|
||||
|
||||
def cumsum_cmd(name: str,
|
||||
inputs: list,
|
||||
output_dtypes: list = None,
|
||||
output_shapes: list = None,
|
||||
attr_code: str = "",
|
||||
attr_header: str = "",
|
||||
outputs: list = None):
|
||||
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
|
||||
|
||||
cuda_header = '''
|
||||
#include "acl/aclops/aclops.h"
|
||||
'''
|
||||
outputs_ = []
|
||||
if outputs is not None:
|
||||
outputs_ = outputs
|
||||
else:
|
||||
assert output_dtypes is not None
|
||||
assert output_shapes is not None
|
||||
assert len(output_dtypes) == len(output_shapes)
|
||||
for i in range(len(output_shapes)):
|
||||
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
|
||||
input_code = ''
|
||||
for i in range(len(inputs)):
|
||||
input_code += f"op.add(in{i}, true);\n"
|
||||
|
||||
output_code = ''
|
||||
for i in range(len(outputs_)):
|
||||
output_code += f"op.add(out{i}, false);\n"
|
||||
return jt.code(outputs=outputs_,
|
||||
inputs=inputs,
|
||||
cuda_header=attr_header + cuda_header,
|
||||
cuda_src=f"""
|
||||
|
||||
// aclop
|
||||
{name}OpRunner op;
|
||||
{input_code}
|
||||
{output_code}
|
||||
{attr_code}
|
||||
op.run();""")
|
||||
|
||||
|
||||
class CumsumACL(jt.Function):
|
||||
|
||||
def __init__(self):
|
||||
super(CumsumACL, self).__init__()
|
||||
|
||||
def execute(self, input, dim=-1):
|
||||
self.dim = dim
|
||||
attr_code = f"""
|
||||
op.jt_name = "cumsum";
|
||||
GatherAttr *attr = new GatherAttr();
|
||||
attr->dim = {dim};
|
||||
op.op_attr.reset(attr);
|
||||
"""
|
||||
result = cumsum_cmd("Cumsum", [input],
|
||||
output_dtypes=[input.dtype],
|
||||
output_shapes=[input.shape],
|
||||
attr_code=attr_code)[0]
|
||||
return result
|
||||
|
||||
def grad(self, grad_output):
|
||||
cumsum_attr_code = f"""
|
||||
op.jt_name = "cumsum";
|
||||
GatherAttr *attr = new GatherAttr();
|
||||
attr->dim = {self.dim};
|
||||
op.op_attr.reset(attr);
|
||||
"""
|
||||
flip_attr_code = f"""
|
||||
op.jt_name = "flip";
|
||||
ReduceAttr *attr = new ReduceAttr();
|
||||
attr->axes = {{{self.dim}}};
|
||||
attr->prod_dim = {{{1}}};
|
||||
op.op_attr.reset(attr);
|
||||
"""
|
||||
flipped_grad_output = cumsum_cmd("Flip", [grad_output],
|
||||
output_dtypes=[grad_output.dtype],
|
||||
output_shapes=[grad_output.shape],
|
||||
attr_code=flip_attr_code)[0]
|
||||
cumulative_grad = cumsum_cmd("Cumsum", [flipped_grad_output],
|
||||
output_dtypes=[grad_output.dtype],
|
||||
output_shapes=[grad_output.shape],
|
||||
attr_code=cumsum_attr_code)[0]
|
||||
grad_input = cumsum_cmd("Flip", [cumulative_grad],
|
||||
output_dtypes=[grad_output.dtype],
|
||||
output_shapes=[grad_output.shape],
|
||||
attr_code=flip_attr_code)[0]
|
||||
return grad_input
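# The flip -> cumsum -> flip sequence in grad() realizes the reverse cumulative
# sum: each grad_x[i] collects grad_y[j] for all j >= i along self.dim.
# Hypothetical usage sketch (assumes the ACL backend is active):
#   y = CumsumACL()(x, 0)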

@@ -0,0 +1,57 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "cumsum_op_acl.h"

namespace jittor
{
    CumsumOpRunner::CumsumOpRunner() : BaseOpRunner("Cumsum")
    {
    }

    void CumsumOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<GatherAttr *>(op_attr.get());
        ret = aclnnCumsumGetWorkspaceSize(inputTensors[0], attr->dim, get_dtype(out_[0]->dtype()), outputTensors[0], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnCumsum(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnCumsum failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

}

@@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class CumsumOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        CumsumOpRunner();
    };

}

@@ -0,0 +1,94 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def dropout_cmd(name: str,
                inputs: list,
                output_dtypes: list = None,
                output_shapes: list = None,
                attr_code: str = "",
                attr_header: str = "",
                outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class DropoutACL(jt.Function):

    def __init__(self):
        super(DropoutACL, self).__init__()

    def execute(self, x, p=0.5, is_train=False):
        self.input = x
        num_elements = x.numel()
        aligned_elements = (num_elements + 127) // 128 * 128
        mask_shape = (aligned_elements // 8, )
        attr_code = f"""
        op.jt_name = "dropout";
        DropoutAttr *attr = new DropoutAttr();
        attr->p = {p};
        attr->train = {"true" if is_train else "false"};
        attr->seed = 0;
        attr->offset = 0;
        op.op_attr.reset(attr);
        """
        result = dropout_cmd("Dropout", [x],
                             output_dtypes=[x.dtype, "uint8"],
                             output_shapes=[x.shape, mask_shape],
                             attr_code=attr_code)
        self.maskout = result[1]
        return result[0]

    def grad(self, grad_output):
        attr_code = f"""
        op.jt_name = "dropoutbackward";
        DropoutAttr *attr = new DropoutAttr();
        attr->scale = 1.0;
        op.op_attr.reset(attr);
        """
        grad_input = dropout_cmd("DropoutBackward",
                                 [grad_output, self.maskout],
                                 output_dtypes=[grad_output.dtype],
                                 output_shapes=[grad_output.shape],
                                 attr_code=attr_code)[0]
        return grad_input
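# The uint8 mask returned by the Dropout runner is bit-packed: one bit per
# element, with the element count rounded up to a multiple of 128, which is
# exactly the (aligned_elements // 8,) byte shape computed in execute().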

@@ -0,0 +1,82 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "dropout_op_acl.h"

namespace jittor
{
    DropoutOpRunner::DropoutOpRunner() : BaseOpRunner("Dropout")
    {
    }

    void DropoutOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<DropoutAttr *>(op_attr.get());
        ret = aclnnDropoutGetWorkspaceSize(inputTensors[0], attr->p, attr->train, attr->seed, attr->offset, outputTensors[0], outputTensors[1], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnDropout(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnDropout failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

    DropoutBackwardOpRunner::DropoutBackwardOpRunner() : BaseOpRunner("DropoutBackward")
    {
    }

    void DropoutBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<DropoutAttr *>(op_attr.get());
        ret = aclnnDropoutBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], attr->scale, outputTensors[0], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnDropoutBackward(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnDropoutBackward failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

}

@@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class DropoutOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        DropoutOpRunner();
    };

    class DropoutBackwardOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        DropoutBackwardOpRunner();
    };

}

@@ -0,0 +1,91 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def embedding_cmd(name: str,
                  inputs: list,
                  output_dtypes: list = None,
                  output_shapes: list = None,
                  attr_code: str = "",
                  attr_header: str = "",
                  outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class EmbeddingACL(jt.Function):

    def __init__(self):
        super(EmbeddingACL, self).__init__()

    def execute(
        self,
        indices,
        weight,
    ):
        inputs = [weight, indices]
        self.indices = indices
        self.weight_shape = weight.shape
        output_shape = list(indices.shape) + list(weight.shape[1:])
        outputs = [jt.empty(output_shape, weight.dtype)]
        attr_code = f"""
        op.jt_name = "embedding";
        """
        result = embedding_cmd("Embedding",
                               inputs=inputs,
                               outputs=outputs,
                               attr_code=attr_code)[0]
        return result

    def grad(self, grad_output):
        inputs = [grad_output, self.indices]
        outputs = [jt.empty(self.weight_shape, grad_output.dtype)]
        attr_code = f"""
        op.jt_name = "embeddingbackward";
        EmbeddingAttr *attr = new EmbeddingAttr();
        attr->numEmbeddings = {self.weight_shape[0]};
        op.op_attr.reset(attr);
        """
        grad_weight = embedding_cmd("EmbeddingBackward",
                                    inputs=inputs,
                                    outputs=outputs,
                                    attr_code=attr_code)[0]
        return None, grad_weight
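# grad() returns None for the indices input (an integer lookup has no
# gradient) and a dense weight gradient, accumulated on the C++ side via
# aclnnEmbeddingDenseBackward with numEmbeddings rows.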

@@ -0,0 +1,82 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "embedding_op_acl.h"

namespace jittor
{
    EmbeddingOpRunner::EmbeddingOpRunner() : BaseOpRunner("Embedding")
    {
    }

    void EmbeddingOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        ret = aclnnEmbeddingGetWorkspaceSize(inputTensors[0], inputTensors[1], outputTensors[0], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnEmbedding(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnEmbedding failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

    EmbeddingBackwardOpRunner::EmbeddingBackwardOpRunner() : BaseOpRunner("EmbeddingBackward")
    {
    }

    void EmbeddingBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<EmbeddingAttr *>(op_attr.get());
        auto numEmbeddings = attr->numEmbeddings;
        ret = aclnnEmbeddingDenseBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], numEmbeddings, 0, false, outputTensors[0], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnEmbeddingDenseBackward(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnEmbeddingDenseBackward failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

}

@@ -0,0 +1,25 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class EmbeddingOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
    public:
        EmbeddingOpRunner();
    };

    class EmbeddingBackwardOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
    public:
        EmbeddingBackwardOpRunner();
    };

}

@@ -0,0 +1,58 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "expand_op_acl.h"

namespace jittor
{
    ExpandOpRunner::ExpandOpRunner() : BaseOpRunner("ternary")
    {
        use_nchw = false;
    }

    void ExpandOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        aclIntArray *size = nullptr;
        size = aclCreateIntArray(&outputShapes[0][0], outputShapes[0].size());
        ret = aclnnExpandGetWorkspaceSize(inputTensors[0], size, outputTensors[0], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnExpand(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnExpand failed. ERROR: %d\n", name.c_str(), ret); return);

        aclDestroyIntArray(size);

        return;
    }
}
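// The size array handed to aclnnExpand is taken from outputShapes[0], i.e. the
// broadcast target shape recorded earlier when the output descriptor was built.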

@@ -0,0 +1,14 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    struct ExpandOpRunner : public BaseOpRunner
    {
        ExpandOpRunner();

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
    };
}

@@ -0,0 +1,209 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def flashattention_cmd(name: str,
                       inputs: list,
                       output_dtypes: list = None,
                       output_shapes: list = None,
                       attr_code: str = "",
                       attr_header: str = "",
                       outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class FlashAttentionACL(jt.Function):

    def __init__(self,
                 headnum,
                 layout="BNSD",
                 prefix=None,
                 qstart=None,
                 kvstart=None,
                 scale=1.0,
                 prob=1.0,
                 pretokens=2147483647,
                 nexttokens=2147483647,
                 innerprecise=0,
                 sparsemode=0,
                 psetype=1):
        self.headnum = headnum
        self.layout = layout
        self.scale = scale
        self.prob = prob
        self.pretokens = pretokens
        self.nexttokens = nexttokens
        self.innerprecise = innerprecise
        self.sparsemode = sparsemode
        self.psetype = psetype
        self.prefix = prefix
        self.qstart = qstart
        self.kvstart = kvstart

    def execute(
        self,
        q,
        k,
        v,
        realshift=None,
        dropMask=None,
        paddingMask=None,
        attenMask=None,
    ):
        if self.layout == 'BSH':
            B, SQ, H = q.shape
            SKV = k.shape[1]
            N = self.headnum
            D = H // N
        elif self.layout == 'SBH':
            SQ, B, H = q.shape
            SKV = k.shape[0]
            N = self.headnum
            D = H // N
        elif self.layout == 'BSND':
            B, SQ, N, D = q.shape
            SKV = k.shape[1]
        elif self.layout == 'BNSD':
            B, N, SQ, D = q.shape
            SKV = k.shape[2]
        else:
            raise ValueError(f"got invalid input layout {self.layout}")

        output_shape = (B, N, SQ, 8)

        self.q = q
        self.k = k
        self.v = v

        self.prefix = self.prefix if self.prefix else [0 for _ in range(B)]
        self.qstart = self.qstart if self.qstart else [0 for _ in range(B)]
        self.kvstart = self.kvstart if self.kvstart else [0 for _ in range(B)]

        self.hasRealshift = realshift is not None
        self.hasDropmask = dropMask is not None
        self.hasPaddingmask = paddingMask is not None
        self.hasAttenmask = attenMask is not None

        # TBD: the unused optional masks are currently passed as nullptr on the C++ side
        self.realshift = realshift if realshift else jt.zeros(B, N, SQ, SKV)
        self.dropMask = dropMask if dropMask else jt.ones(B, N, SQ, SKV)
        self.paddingMask = paddingMask if paddingMask else jt.zeros(
            B, N, SQ, SKV)
        self.attenMask = attenMask if attenMask else jt.zeros(SQ, SKV)

        attr_code = f"""
        op.jt_name = "flashattention";
        FlashAttentionAttr *attr = new FlashAttentionAttr();
        attr->scale = {self.scale};
        attr->keepProb = {self.prob};
        attr->preToken = {self.pretokens};
        attr->nextToken = {self.nexttokens};
        attr->headNum = {self.headnum};
        attr->inputLayout = "{self.layout}";
        attr->innerPrecise = {self.innerprecise};
        attr->sparseMode = {self.sparsemode};
        attr->psetype = {self.psetype};
        attr->prefix = {{ {", ".join(map(str, self.prefix))} }};
        attr->qStartIdx = {{ {", ".join(map(str, self.qstart))} }};
        attr->kvStartIdx = {{ {", ".join(map(str, self.kvstart))} }};
        attr->hasRealshift = {"true" if self.hasRealshift else "false"};
        attr->hasDropmask = {"true" if self.hasDropmask else "false"};
        attr->hasPaddingmask = {"true" if self.hasPaddingmask else "false"};
        attr->hasAttentmask = {"true" if self.hasAttenmask else "false"};
        op.op_attr.reset(attr);
        """

        inputs = [
            q, k, v, self.realshift, self.dropMask, self.paddingMask,
            self.attenMask
        ]

        result = flashattention_cmd(
            "FlashAttention",
            inputs,
            output_dtypes=["float", "float", q.dtype],
            output_shapes=[output_shape, output_shape, q.shape],
            attr_code=attr_code)

        self.maxout = result[0]
        self.sumout = result[1]
        self.attenout = result[2]

        return self.attenout

    def grad(self, dy):
        attr_code = f"""
        op.jt_name = "flashattentionbackward";
        FlashAttentionAttr *attr = new FlashAttentionAttr();
        attr->scale = {self.scale};
        attr->keepProb = {self.prob};
        attr->preToken = {self.pretokens};
        attr->nextToken = {self.nexttokens};
        attr->headNum = {self.headnum};
        attr->inputLayout = "{self.layout}";
        attr->innerPrecise = {self.innerprecise};
        attr->sparseMode = {self.sparsemode};
        attr->psetype = {self.psetype};
        attr->prefix = {{ {", ".join(map(str, self.prefix))} }};
        attr->qStartIdx = {{ {", ".join(map(str, self.qstart))} }};
        attr->kvStartIdx = {{ {", ".join(map(str, self.kvstart))} }};
        attr->hasRealshift = {"true" if self.hasRealshift else "false"};
        attr->hasDropmask = {"true" if self.hasDropmask else "false"};
        attr->hasPaddingmask = {"true" if self.hasPaddingmask else "false"};
        attr->hasAttentmask = {"true" if self.hasAttenmask else "false"};
        op.op_attr.reset(attr);
        """
        inputs = [
            self.q, self.k, self.v, dy, self.realshift, self.dropMask,
            self.paddingMask, self.attenMask, self.maxout, self.sumout,
            self.attenout
        ]

        result = flashattention_cmd(
            "FlashAttentionBackward",
            inputs,
            output_dtypes=[self.q.dtype, self.k.dtype, self.v.dtype],
            output_shapes=[self.q.shape, self.k.shape, self.v.shape],
            attr_code=attr_code)
        return result
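# The forward runner produces three outputs: softmax max and softmax sum
# statistics (each (B, N, SQ, 8)) plus the attention result. They are cached
# on self so grad() can feed them back into the FlashAttentionBackward runner,
# which returns the (dq, dk, dv) triple.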

@@ -0,0 +1,88 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "flashattention_op_acl.h"

namespace jittor
{
    FlashAttentionOpRunner::FlashAttentionOpRunner() : BaseOpRunner("FlashAttention")
    {
    }

    void FlashAttentionOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<FlashAttentionAttr *>(op_attr.get());
        auto prefix = aclCreateIntArray(attr->prefix.data(), attr->prefix.size());
        auto qstart = aclCreateIntArray(attr->qStartIdx.data(), attr->qStartIdx.size());
        auto kvstart = aclCreateIntArray(attr->kvStartIdx.data(), attr->kvStartIdx.size());
        char *layout = const_cast<char *>(attr->inputLayout.data());
        ret = aclnnFlashAttentionScoreV2GetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], attr->hasRealshift ? inputTensors[3] : nullptr, attr->hasDropmask ? inputTensors[4] : nullptr, nullptr, attr->hasAttentmask ? inputTensors[6] : nullptr, prefix, qstart, kvstart, attr->scale, attr->keepProb, attr->preToken, attr->nextToken, attr->headNum, layout, attr->innerPrecise, attr->sparseMode, attr->psetype, outputTensors[0], outputTensors[1], nullptr, outputTensors[2], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnFlashAttentionScoreV2(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnFlashAttentionScoreV2 failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

    FlashAttentionBackwardOpRunner::FlashAttentionBackwardOpRunner() : BaseOpRunner("FlashAttentionBackward")
    {
    }

    void FlashAttentionBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<FlashAttentionAttr *>(op_attr.get());
        auto prefix = aclCreateIntArray(attr->prefix.data(), attr->prefix.size());
        auto qstart = aclCreateIntArray(attr->qStartIdx.data(), attr->qStartIdx.size());
        auto kvstart = aclCreateIntArray(attr->kvStartIdx.data(), attr->kvStartIdx.size());
        char *layout = const_cast<char *>(attr->inputLayout.data());
        ret = aclnnFlashAttentionScoreGradV2GetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], inputTensors[3], attr->hasRealshift ? inputTensors[4] : nullptr, attr->hasDropmask ? inputTensors[5] : nullptr, nullptr, attr->hasAttentmask ? inputTensors[7] : nullptr, inputTensors[8], inputTensors[9], nullptr, inputTensors[10], prefix, qstart, kvstart, attr->scale, attr->keepProb, attr->preToken, attr->nextToken, attr->headNum, layout, attr->innerPrecise, attr->sparseMode, attr->psetype, outputTensors[0], outputTensors[1], outputTensors[2], nullptr, &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnFlashAttentionScoreGradV2(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnFlashAttentionScoreGradV2 failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

}

@@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class FlashAttentionOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        FlashAttentionOpRunner();
    };

    class FlashAttentionBackwardOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        FlashAttentionBackwardOpRunner();
    };

}

@@ -0,0 +1,85 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def flip_cmd(name: str,
             inputs: list,
             output_dtypes: list = None,
             output_shapes: list = None,
             attr_code: str = "",
             attr_header: str = "",
             outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class FlipACL(jt.Function):

    def __init__(self):
        super(FlipACL, self).__init__()

    def execute(self, input, dim):
        if type(dim) is tuple:
            dim = list(dim)
        if type(dim) is not list:
            dim = [dim]
        attr_code = f"""
        op.jt_name = "flip";
        ReduceAttr *attr = new ReduceAttr();
        attr->axes = {{{', '.join(map(str, (list(dim))))}}};
        attr->prod_dim = {len(dim)};
        op.op_attr.reset(attr);
        """
        self.attr_code = attr_code
        result = flip_cmd("Flip", [input],
                          output_dtypes=[input.dtype],
                          output_shapes=[input.shape],
                          attr_code=self.attr_code)[0]
        return result

    def grad(self, grad_output):
        grad_input = flip_cmd("Flip", [grad_output],
                              output_dtypes=[grad_output.dtype],
                              output_shapes=[grad_output.shape],
                              attr_code=self.attr_code)[0]
        return grad_input
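# Flip is an involution, so the backward pass simply re-applies the flip with
# the same axes (the attr_code saved during execute()) to the gradient.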

@@ -0,0 +1,58 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "flip_op_acl.h"

namespace jittor
{
    FlipOpRunner::FlipOpRunner() : BaseOpRunner("Flip")
    {
    }

    void FlipOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<ReduceAttr *>(op_attr.get());
        auto dim = aclCreateIntArray(attr->axes.data(), attr->axes.size());
        ret = aclnnFlipGetWorkspaceSize(inputTensors[0], dim, outputTensors[0], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnFlip(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnFlip failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

}

@@ -0,0 +1,16 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class FlipOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        FlipOpRunner();
    };
}

@@ -0,0 +1,70 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def floor_cmd(name: str,
              inputs: list,
              output_dtypes: list = None,
              output_shapes: list = None,
              attr_code: str = "",
              attr_header: str = "",
              outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class FloorIntACL(jt.Function):

    def __init__(self):
        super(FloorIntACL, self).__init__()

    def execute(self, input):
        self.shape = input.shape
        result = floor_cmd("Floor", [input],
                           output_dtypes=[input.dtype],
                           output_shapes=[input.shape],
                           attr_code="op.jt_name=\"floor\";")[0]
        return result

    def grad(self, grad_output):
        return jt.zeros(self.shape, dtype=grad_output.dtype)
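# floor() is piecewise constant, so its derivative is zero almost everywhere;
# the backward pass accordingly returns an all-zero tensor of the input shape.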

@@ -0,0 +1,56 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "floor_op_acl.h"

namespace jittor
{
    FloorOpRunner::FloorOpRunner() : BaseOpRunner("Floor")
    {
    }

    void FloorOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        ret = aclnnFloorGetWorkspaceSize(inputTensors[0], outputTensors[0], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnFloor(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnFloor failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

}

@@ -0,0 +1,16 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class FloorOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        FloorOpRunner();
    };
}

@@ -0,0 +1,126 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def gather_scatter_cmd(name: str,
                       inputs: list,
                       output_dtypes: list = None,
                       output_shapes: list = None,
                       attr_code: str = "",
                       attr_header: str = "",
                       outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class GatherACL(jt.Function):

    def __init__(self):
        super(GatherACL, self).__init__()

    def execute(self, input, dim, index):
        self.dim = dim
        self.index = index
        self.input_shape = input.shape
        attr_code = f"""
        op.jt_name = "gather";
        GatherAttr *attr = new GatherAttr();
        attr->dim = {dim};
        op.op_attr.reset(attr);
        """
        result = gather_scatter_cmd("Gather", [input, index],
                                    output_dtypes=[input.dtype],
                                    output_shapes=[index.shape],
                                    attr_code=attr_code)[0]
        return result

    def grad(self, grad_output):
        tmp = jt.zeros(self.input_shape, dtype=grad_output.dtype)
        attr_code = f"""
        op.jt_name = "scatter";
        ScatterAttr *attr = new ScatterAttr();
        attr->axis = {self.dim};
        attr->reduction = {1};
        op.op_attr.reset(attr);
        """
        grad_input = gather_scatter_cmd("Scatter",
                                        [tmp, self.index, grad_output],
                                        output_dtypes=[grad_output.dtype],
                                        output_shapes=[tmp.shape],
                                        attr_code=attr_code)[0]
        return grad_input


class ScatterACL(jt.Function):

    def __init__(self):
        super(ScatterACL, self).__init__()

    def execute(self, input, dim, index, src, reduce='void'):
        self.dim = dim
        self.index = index
        self.reduce = reduce
        attr_code = f"""
        op.jt_name = "scatter";
        ScatterAttr *attr = new ScatterAttr();
        attr->axis = {dim};
        attr->reduction = {1 if reduce == 'add' else 2 if reduce == 'mul' else 0};
        op.op_attr.reset(attr);
        """
        result = gather_scatter_cmd("Scatter", [input, self.index, src],
                                    output_dtypes=[input.dtype],
                                    output_shapes=[input.shape],
                                    attr_code=attr_code)[0]
        return result

    def grad(self, grad_output):
        attr_code = f"""
        op.jt_name = "gather";
        GatherAttr *attr = new GatherAttr();
        attr->dim = {self.dim};
        op.op_attr.reset(attr);
        """
        grad_input = gather_scatter_cmd("Gather", [grad_output, self.index],
                                        output_dtypes=[grad_output.dtype],
                                        output_shapes=[self.index.shape],
                                        attr_code=attr_code)[0]
        return grad_output, None, None, grad_input
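# Gather and scatter are adjoint ops: GatherACL.grad scatter-adds the output
# gradient back along the same dim (reduction code 1), while ScatterACL maps
# its reduce argument as 'add' -> 1, 'mul' -> 2, anything else -> 0.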

@@ -0,0 +1,80 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "gather_scatter_op_acl.h"

namespace jittor
{
    GatherOpRunner::GatherOpRunner() : BaseOpRunner("Gather")
    {
    }

    void GatherOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<GatherAttr *>(op_attr.get());
        ret = aclnnGatherGetWorkspaceSize(inputTensors[0], attr->dim, inputTensors[1], outputTensors[0], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnGather(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnGather failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

    ScatterOpRunner::ScatterOpRunner() : BaseOpRunner("Scatter")
    {
    }

    void ScatterOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<ScatterAttr *>(op_attr.get());
        ret = aclnnScatterGetWorkspaceSize(inputTensors[0], attr->axis, inputTensors[1], inputTensors[2], attr->reduction, outputTensors[0], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnScatter(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnScatter failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

}

@@ -0,0 +1,26 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class GatherOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        GatherOpRunner();
    };

    class ScatterOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        ScatterOpRunner();
    };
}

@@ -0,0 +1,419 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def getitem_cmd(name: str,
                inputs: list,
                output_dtypes: list = None,
                output_shapes: list = None,
                attr_code: str = "",
                attr_header: str = "",
                outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


def getitem_forward(name: str,
                    inputs: list,
                    output_dtypes: list = None,
                    output_shapes: list = None,
                    attr_code: str = "",
                    attr_header: str = "",
                    outputs: list = None,
                    extra_data: dict = {}):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    op.add(out0, false);
    {attr_code}
    op.run();""",
                   data=extra_data)


def caculate_shape(tensors):
    if isinstance(tensors, jt.Var):
        return tensors.shape
    elif isinstance(tensors, (int, float)):
        return []
    elif isinstance(tensors, (list, tuple)):
        sub_shape = caculate_shape(tensors[0])
        return [len(tensors)] + sub_shape
    else:
        assert False, f"not implemented for {type(tensors)}"


def can_broadcast_and_shape(shape1, shape2):
    """
    Check whether two tensors can be broadcast together, and return the
    broadcast shape.

    Args:
    - shape1: shape of the first tensor (tuple or list)
    - shape2: shape of the second tensor (tuple or list)

    Returns:
    - can_broadcast: bool, whether the shapes are broadcastable
    - broadcast_shape: the broadcast shape if they are; otherwise None
    """
    # Convert the shapes to tuples in case the inputs are lists
    shape1 = tuple(shape1)
    shape2 = tuple(shape2)

    # Bring both shapes to the same length by padding with leading 1s
    len1, len2 = len(shape1), len(shape2)
    if len1 < len2:
        shape1 = (1, ) * (len2 - len1) + shape1
    elif len2 < len1:
        shape2 = (1, ) * (len1 - len2) + shape2

    broadcast_shape = []

    # Check compatibility dimension by dimension
    for dim1, dim2 in zip(shape1, shape2):
        if dim1 == dim2:
            broadcast_shape.append(dim1)
        elif dim1 == 1:
            broadcast_shape.append(dim2)
        elif dim2 == 1:
            broadcast_shape.append(dim1)
        else:
            # Incompatible along this dimension: cannot broadcast
            return False, None

    return True, tuple(broadcast_shape)


class GetItemACL(jt.Function):

    def __init__(self):
        self.type_ = 'notype'

    def stride(self, x, dim):
        stride = 1
        for i in range(dim + 1, len(x.shape)):
            stride *= x.shape[i]
        return stride

    def execute(self, x, slices, return_x=None):
        if isinstance(slices, jt.Var) and slices.dtype == 'bool':
            # TODO: optimize
            assert x.shape == slices.shape, "shape not match"
            output_len = slices.sum().item()
            x_len = x.numel()
            output = jt.empty((x_len), dtype=x.dtype)
            outputs = [output]
            inputs = [x, slices]
            self.mask = slices
            self.type_ = 'mask'
            attr_code = f"""
            op.jt_name = "maskedselect";
            """
            result = getitem_cmd("MaskedSelect",
                                 inputs=inputs,
                                 outputs=outputs,
                                 attr_code=attr_code)[0]
            result = result[:output_len]
            result.sync()
            return result
        self.x_shape = x.shape
        if not isinstance(slices, tuple):
            slices = (slices, )
        slices = list(slices)
        for i, s in enumerate(slices):
            if isinstance(s, int) and s < 0:
                slices[i] = s + x.shape[i]
        slices = tuple(slices)
        slices_list = list(slices)
        # check whether slices contains a slice (or Ellipsis)
        contains_slice = False
        for s in slices:
            if not isinstance(s, jt.Var) and (isinstance(s, slice)
                                              or s == Ellipsis):
                contains_slice = True
                break
        if not contains_slice:
            indices = []
            output_shape = []
            slices_len = len(slices)
            boardcast_shape = caculate_shape(slices_list[0])
            for ii in range(1, len(slices)):
                dd, boardcast_shape = can_broadcast_and_shape(
                    boardcast_shape, caculate_shape(slices_list[ii]))
                assert dd is True, "can not broadcast"
            output_shape = boardcast_shape
            output_shape += x.shape[slices_len:]
            if output_shape == []:
                output_shape = [1]
            for ii in slices:
                indices.append(jt.Var(ii).int32())
            if isinstance(slices[0],
                          jt.Var) or isinstance(slices[0], int) or isinstance(
                              slices[0], list) or isinstance(slices[0], tuple):
                self.indices = indices
                inputs = [x] + indices
                attr_code = f"""
                op.jt_name = "index";
                """
                self.type_ = 'index'
                result = getitem_cmd("Index",
                                     inputs=inputs,
                                     output_dtypes=[x.dtype],
                                     output_shapes=[output_shape],
                                     attr_code=attr_code)[0]
                result.sync()
                return result
        assert contains_slice, "slice type error"
        x_dim = len(x.shape)
        slices = list(slices)
        for s in slices:
            if not isinstance(s, jt.Var) and s == Ellipsis:
                slices = slices[:slices.index(s)] + [
                    slice(None, None, None)
                ] * (x_dim - len(slices) + 1) + slices[slices.index(s) + 1:]
                break
        slices = tuple(slices)

        if len(slices) < x_dim:
            slices += (slice(None, None, None), ) * (x_dim - len(slices))
        inputs = [x]
        sizes = []
        begins = []
        ends = []
        steps = []
        dims = []
        squeeze_dims = []

        extra_data = {}
        if len(slices):
            extra_data["a"] = len(slices)
            for dim, s in enumerate(slices):
                if isinstance(s, int):
                    s = slice(s, s + 1, 1)
                    squeeze_dims.append(dim)
                if isinstance(s, jt.Var):
                    assert False, "jt.Var not supported"
                start, stop, step = s.indices(x.size(dim))
                size = (stop - start - 1) // step + 1
                sizes.append(size)
                extra_data[str(dim * 3)] = start
                extra_data[str(dim * 3 + 1)] = stop
                extra_data[str(dim * 3 + 2)] = step

                steps.append(step)
                begins.append(start)
                ends.append(stop)
                dims.append(dim)
        else:
            extra_data["a"] = -1
            sizes = [1]
            steps = [1]
        self.type_ = 'slicev2'
        # for backward
        self.begins = begins
        self.ends = ends
        self.steps = steps
        self.dims = dims

        self.slices = slices
        attr_code = """
        op.jt_name = "slicev2";
        StrideAttr *attr = new StrideAttr();

        int slice_dim = data["a"];

        if(slice_dim == -1) {
            attr->begins = {};
            attr->ends = {};
            attr->steps = {1};
            attr->axes = {};
        } else {
            vector<long int> begins;
            vector<long int> ends;
            vector<long int> steps;
            vector<long int> dims;
            for(int dim = 0; dim < slice_dim; dim++) {
                dims.push_back(dim);
                begins.push_back(data[std::to_string(dim*3)]);
                ends.push_back(data[std::to_string(dim*3+1)]);
                steps.push_back(data[std::to_string(dim*3+2)]);
            }
            attr->begins = begins;
            attr->ends = ends;
            attr->steps = steps;
            attr->axes = dims;
        }
        op.op_attr.reset(attr);
        """
        result = getitem_forward("SliceV2",
                                 inputs,
                                 output_dtypes=[x.dtype],
                                 output_shapes=[jt.empty(sizes).shape],
                                 attr_code=attr_code,
                                 extra_data=extra_data)[0]
        self.squeeze_dims = squeeze_dims
        for dim in squeeze_dims[::-1]:
            result = jt.squeeze(result, dim)
        result.sync()
        return result

    def grad(self, grad_output):
        if self.type_ == 'index':
            indices = self.indices
            inputs = [grad_output] + indices
            attr_code = f"""
            op.jt_name = "indexputimplaccumulate";
            """
            outputs = [jt.zeros(self.x_shape, dtype=grad_output.dtype)]
            result = getitem_cmd("IndexPutImplAccumulate",
                                 inputs=inputs,
                                 outputs=outputs,
                                 attr_code=attr_code)[0]
            result.sync()
            return result, None
        elif self.type_ == 'slicev2':
            begins = self.begins
            ends = self.ends
            steps = self.steps
            dims = self.dims
            slices = self.slices
            # The forward pass may have squeezed dimensions; restore them here
            for dim in self.squeeze_dims:
                grad_output = jt.unsqueeze(grad_output, dim)
            # Work around an Ascend constraint: the step of the last dimension
            # must be 1, so append a full slice (and a trailing axis) if needed
            expand_dim = False
            if isinstance(slices[-1], slice):
                if slices[-1].step is not None and slices[-1].step != 1:
                    slices = slices + (slice(None, None, None), )
                    expand_dim = True
            elif isinstance(slices[-1], int):
                # Note: the last dimension is an integer index
                slices = list(slices)
                slices[-1] = slice(slices[-1], slices[-1] + 1, 1)
                slices = tuple(slices)
                slices = slices + (slice(None, None, None), )
                expand_dim = True
            else:
                assert False, "not supported"
            if expand_dim:
                grad_output = grad_output.unsqueeze(-1)
                self.x_shape = self.x_shape + (1, )
            sizes = []
            begins = []
            ends = []
            steps = []
            dims = []
            for dim, s in enumerate(slices):
                if isinstance(s, int):
                    s = slice(s, s + 1, 1)
                if isinstance(s, jt.Var):
                    assert False, "jt.Var not supported"
                start, stop, step = s.indices(self.x_shape[dim])
                size = (stop - start - 1) // step + 1
                sizes.append(size)
                steps.append(step)
                begins.append(start)
                ends.append(stop)
                dims.append(dim)
            if not sizes:
                sizes = [1]
                steps = [1]
            attr_code = f"""
            op.jt_name = "stridedsliceassignv2";
            StrideAttr *attr = new StrideAttr();
            attr->begins = {{ {", ".join(map(str, begins))} }};
            attr->ends = {{ {", ".join(map(str, ends))} }};
            attr->steps = {{ {", ".join(map(str, steps))} }};
            attr->axes = {{ {", ".join(map(str, dims))} }};
            op.op_attr.reset(attr);
            """
            inputs = [grad_output]
            outputs = [jt.zeros(self.x_shape, dtype=grad_output.dtype)]
            result = getitem_cmd("StridedSliceAssignV2",
                                 inputs=inputs,
                                 outputs=outputs,
                                 attr_code=attr_code)[0]
            result.sync()
            if expand_dim:
                result = result.squeeze(-1)
            return result, None
        elif self.type_ == 'mask':
            return self.mask.float()
        else:
            assert False, f"grad not implemented for {self.type_}"
|
|
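For orientation, a minimal usage sketch of the Function whose execute/grad bodies appear above. Its class name sits outside this hunk; GetItemACL below is a hypothetical stand-in, and the slices shown are illustrative only.

import jittor as jt

x = jt.rand(4, 5)
getitem = GetItemACL()                  # hypothetical name for the class above
y = getitem(x, (slice(0, 3, 1), 2))     # takes the slicev2 path
g = jt.grad(y.sum(), x)                 # backward scatters via StridedSliceAssignV2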
@ -0,0 +1,165 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "getitem_op_acl.h"

namespace jittor
{
    MaskedSelectOpRunner::MaskedSelectOpRunner() : BaseOpRunner("MaskedSelect")
    {
    }

    void MaskedSelectOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        ret = aclnnMaskedSelectGetWorkspaceSize(inputTensors[0], inputTensors[1], outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnMaskedSelect(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMaskedSelect failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

    IndexOpRunner::IndexOpRunner() : BaseOpRunner("Index")
    {
    }

    void IndexOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto input_num = in_.size();
        auto indexTensorList = aclCreateTensorList(&inputTensors[1], input_num - 1);
        ret = aclnnIndexGetWorkspaceSize(inputTensors[0], indexTensorList, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnIndex(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnIndex failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

    SliceV2OpRunner::SliceV2OpRunner() : BaseOpRunner("SliceV2")
    {
    }

    void SliceV2OpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<StrideAttr *>(op_attr.get());
        auto begins = aclCreateIntArray(attr->begins.data(), attr->begins.size());
        auto ends = aclCreateIntArray(attr->ends.data(), attr->ends.size());
        auto steps = aclCreateIntArray(attr->steps.data(), attr->steps.size());
        auto axes = aclCreateIntArray(attr->axes.data(), attr->axes.size());
        ret = aclnnSliceV2GetWorkspaceSize(inputTensors[0], begins, ends, axes, steps, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnSliceV2(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSliceV2 failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

    IndexPutImplAccumulateOpRunner::IndexPutImplAccumulateOpRunner() : BaseOpRunner("IndexPutImplAccumulate")
    {
    }

    void IndexPutImplAccumulateOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto input_num = in_.size();
        std::vector<aclTensor *> indexTensorList = {};
        for (int i = 1; i < input_num; i++)
        {
            indexTensorList.push_back(inputTensors[i]);
        }
        auto indexTensorListInput = aclCreateTensorList(&indexTensorList[0], input_num - 1);
        ret = aclnnIndexPutImplGetWorkspaceSize(outputTensors[0], indexTensorListInput, inputTensors[0], true, true, &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnIndexPutImpl(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnIndexPutImpl failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

    StridedSliceAssignV2OpRunner::StridedSliceAssignV2OpRunner() : BaseOpRunner("StridedSliceAssignV2")
    {
    }

    void StridedSliceAssignV2OpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<StrideAttr *>(op_attr.get());
        auto begins = aclCreateIntArray(attr->begins.data(), attr->begins.size());
        auto ends = aclCreateIntArray(attr->ends.data(), attr->ends.size());
        auto steps = aclCreateIntArray(attr->steps.data(), attr->steps.size());
        auto axes = aclCreateIntArray(attr->axes.data(), attr->axes.size());
        ret = aclnnStridedSliceAssignV2GetWorkspaceSize(outputTensors[0], inputTensors[0], begins, ends, steps, axes, &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnStridedSliceAssignV2(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnStridedSliceAssignV2 failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }
}
@ -0,0 +1,57 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class MaskedSelectOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        MaskedSelectOpRunner();
    };

    class IndexOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        IndexOpRunner();
    };

    class SliceV2OpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        SliceV2OpRunner();
    };

    class IndexPutImplAccumulateOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        IndexPutImplAccumulateOpRunner();
    };

    class StridedSliceAssignV2OpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        StridedSliceAssignV2OpRunner();
    };

}
@ -0,0 +1,107 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def range_forward(name: str,
                  inputs: list,
                  output_dtypes: list = None,
                  output_shapes: list = None,
                  attr_code: str = "",
                  attr_header: str = "",
                  outputs: list = None,
                  extra_data: dict = {}):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""",
                   data=extra_data)


class IndexACL(jt.Function):

    def __init__(self):
        super(IndexACL, self).__init__()

    def execute(self, inshape: list, dim=None, dtype="int32"):
        # Build one coordinate grid of shape `inshape` per requested dimension.
        dim_input = dim
        if dim is None:
            dim = [i for i in range(len(inshape))]
        elif type(dim) == int:
            dim = [dim]
        results = []
        extra_data = {}
        extra_data["dim_count"] = len(dim)

        for i, d in enumerate(dim):
            max_len = inshape[d]

            extra_data[f"dim_{i}_start"] = 0
            extra_data[f"dim_{i}_end"] = max_len
            extra_data[f"dim_{i}_step"] = 1

            tmp = jt.zeros(max_len, dtype=dtype)
            range_attr_code = f"""
            op.jt_name = "range";
            RangeAttr *attr = new RangeAttr();
            attr->start = data["dim_{i}_start"];
            attr->end = data["dim_{i}_end"];
            attr->step = data["dim_{i}_step"];
            op.op_attr.reset(attr);
            """
            result = range_forward("Range", [],
                                   output_dtypes=[tmp.dtype],
                                   output_shapes=[tmp.shape],
                                   attr_code=range_attr_code,
                                   extra_data=extra_data)[0]
            broadcast_dims = list(range(len(inshape)))
            broadcast_dims.remove(d)
            result = jt.broadcast(result, shape=inshape, dims=broadcast_dims)
            results.append(result)

        if len(results) != 1 or dim_input is None:
            return tuple(results)
        else:
            return results[0]

    def grad(self, grad_output):
        return grad_output
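A small usage sketch (assuming IndexACL is exported from this module as-is): it mirrors jt.index, materializing one coordinate grid per dimension via the Range runner plus a broadcast.

import jittor as jt

rows = IndexACL()([2, 3], dim=0)   # [[0, 0, 0], [1, 1, 1]]
grids = IndexACL()([2, 3])         # dim=None returns a tuple, one grid per dim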
@ -0,0 +1,72 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "index_op_acl.h"

namespace jittor
{
    RangeOpRunner::RangeOpRunner() : BaseOpRunner("Range")
    {
    }

    void RangeOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        aclScalar *start = nullptr;
        aclScalar *end = nullptr;
        aclScalar *step = nullptr;

        auto attr = dynamic_cast<RangeAttr *>(op_attr.get());
        int64_t startValue = attr->start;
        int64_t endValue = attr->end;
        int64_t stepValue = attr->step;
        start = aclCreateScalar(&startValue, aclDataType::ACL_INT64);
        end = aclCreateScalar(&endValue, aclDataType::ACL_INT64);
        step = aclCreateScalar(&stepValue, aclDataType::ACL_INT64);

        ret = aclnnRangeGetWorkspaceSize(start, end, step, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnRange(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnRange failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyScalar(start);
        aclDestroyScalar(end);
        aclDestroyScalar(step);
        return;
    }

}
@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class RangeOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        RangeOpRunner();
    };

}
@ -0,0 +1,130 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def matmul_forward(name: str,
                   inputs: list,
                   output_dtypes: list = None,
                   output_shapes: list = None,
                   attr_code: str = "",
                   attr_header: str = "",
                   outputs: list = None,
                   extra_data: dict = {}):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    MatMulOpRunner op;
    {input_code}
    op.add(out0, false);
    {attr_code}
    op.run();""",
                   data=extra_data)


class MatmulACL(jt.Function):

    def __init__(self, trans_x2=False):
        super(MatmulACL, self).__init__()
        self.trans_x2 = trans_x2

    def execute(self, x1, x2):
        self.input = [x1, x2]
        if self.trans_x2:
            output_shape = x1.shape[:-1] + x2.shape[-2:-1]
        else:
            output_shape = x1.shape[:-1] + x2.shape[-1:]
        result = matmul_forward(
            "MatMul", [x1, x2],
            output_dtypes=[x1.dtype],
            output_shapes=[output_shape],
            attr_code="op.jt_name=\"matmul_trans_1\";"
            if self.trans_x2 else "op.jt_name=\"matmul\";")[0]
        return result

    def grad(self, grad_output):
        x1, x2 = self.input
        # When x2 has fewer dims than x1 (no batch dims), its gradient must be
        # accumulated over the batch, so the batch dims are folded into the
        # row dimension first.
        if len(x1.shape) != len(x2.shape):
            reshape_grad_x2 = True
        else:
            reshape_grad_x2 = False
        grad_x1 = matmul_forward(
            "MatMul", [grad_output, x2],
            output_dtypes=[x1.dtype],
            output_shapes=[
                grad_output.shape[:-1] + x2.shape[-2:-1] if not self.trans_x2
                else grad_output.shape[:-1] + x2.shape[-1:]
            ],
            attr_code="op.jt_name=\"matmul_trans_1\";"
            if not self.trans_x2 else "op.jt_name=\"matmul\";")[0]

        if self.trans_x2:
            if reshape_grad_x2:
                output_shape = grad_output.shape[1:-2] + grad_output.shape[
                    -1:] + x1.shape[-1:]
                grad_x2 = matmul_forward(
                    "MatMul", [
                        grad_output.reshape(-1, grad_output.shape[-1]),
                        x1.reshape(-1, x1.shape[-1])
                    ],
                    output_dtypes=[x2.dtype],
                    output_shapes=[output_shape],
                    attr_code="op.jt_name=\"matmul_trans_0\";")[0]
            else:
                output_shape = grad_output.shape[:-2] + grad_output.shape[
                    -1:] + x1.shape[-1:]
                grad_x2 = matmul_forward(
                    "MatMul", [grad_output, x1],
                    output_dtypes=[x2.dtype],
                    output_shapes=[output_shape],
                    attr_code="op.jt_name=\"matmul_trans_0\";")[0]
        else:
            if reshape_grad_x2:
                output_shape = x1.shape[1:-2] + x1.shape[
                    -1:] + grad_output.shape[-1:]
                grad_x2 = matmul_forward(
                    "MatMul", [
                        x1.reshape(-1, x1.shape[-1]),
                        grad_output.reshape(-1, grad_output.shape[-1])
                    ],
                    output_dtypes=[x2.dtype],
                    output_shapes=[output_shape],
                    attr_code="op.jt_name=\"matmul_trans_0\";")[0]
            else:
                output_shape = x1.shape[:-2] + x1.shape[
                    -1:] + grad_output.shape[-1:]
                grad_x2 = matmul_forward(
                    "MatMul", [x1, grad_output],
                    output_dtypes=[x2.dtype],
                    output_shapes=[output_shape],
                    attr_code="op.jt_name=\"matmul_trans_0\";")[0]
        return grad_x1, grad_x2
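A usage sketch for the class above (assuming it is exported as-is). With trans_x2=True the second operand is treated as transposed without materializing the transpose; setupInputDesc in the C++ runner below builds a fake-transposed ACL tensor view instead.

import jittor as jt

a = jt.rand(8, 16)
w = jt.rand(32, 16)
y = MatmulACL(trans_x2=True)(a, w)   # [8, 16] x [32, 16]^T -> [8, 32]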
@ -0,0 +1,77 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "matmul_op_acl.h"

namespace jittor
{
    MatMulOpRunner::MatMulOpRunner() : BaseOpRunner("MatMul")
    {
    }

    void MatMulOpRunner::setupInputDesc()
    {
        auto input_num = in_.size();
        for (int input_idx = 0; input_idx < input_num; input_idx++)
        {
            std::vector<int64_t> shape;
            for (int j = 0; j < in_[input_idx]->shape.size(); j++)
            {
                shape.push_back(in_[input_idx]->shape[j]);
            }
            inputShapes.push_back(shape);
        }
        for (int idx = 0; idx < input_num; idx++)
        {
            inputTensors.push_back(nullptr);
            if ((jt_name == "matmul_trans_1" && idx == 1) || (jt_name == "matmul_trans_0" && idx == 0))
            {
                auto ret = CreateFakeTransAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
                CHECK_RET(ret == ACL_SUCCESS, return);
            }
            else
            {
                auto ret = CreateAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
                CHECK_RET(ret == ACL_SUCCESS, return);
            }
        }
    }

    void MatMulOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        ret = aclnnMatmulGetWorkspaceSize(inputTensors[0], inputTensors[1], outputTensors[0], 1, &workspaceSize, &executor);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMatmulGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }
        ret = aclnnMatmul(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMatmul failed. ERROR: %d\n", name.c_str(), ret); return);
        syncRun();
    }
}
@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class MatMulOpRunner : public BaseOpRunner
    {
    protected:
        void setupInputDesc() override;
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        MatMulOpRunner();
    };
}
@ -0,0 +1,75 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def nantonum_cmd(name: str,
                 inputs: list,
                 output_dtypes: list = None,
                 output_shapes: list = None,
                 attr_code: str = "",
                 attr_header: str = "",
                 outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class NanToNumACL(jt.Function):

    def __init__(self):
        super(NanToNumACL, self).__init__()

    def execute(self, input, nan_or_inf):
        # NaN is replaced by nan_or_inf, and +/-inf by its negation.
        attr_code = f"""
        op.jt_name = "NanToNum";
        NanToNumAttr *attr = new NanToNumAttr();
        attr->nan = {nan_or_inf};
        attr->posinf = {-nan_or_inf};
        attr->neginf = {-nan_or_inf};
        op.op_attr.reset(attr);
        """
        self.attr_code = attr_code
        result = nantonum_cmd("NanToNum", [input],
                              output_dtypes=[input.dtype],
                              output_shapes=[input.shape],
                              attr_code=self.attr_code)[0]
        return result
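A usage sketch (assuming NanToNumACL is exported as-is): per the attr_code above, NaN is mapped to nan_or_inf and both infinities to its negation.

import jittor as jt

x = jt.array([1.0, float("nan"), float("inf"), -float("inf")])
y = NanToNumACL()(x, 0.0)   # -> [1.0, 0.0, -0.0, -0.0]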
@ -0,0 +1,58 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "nantonum_op_acl.h"

namespace jittor
{
    NanToNumOpRunner::NanToNumOpRunner() : BaseOpRunner("NanToNum")
    {
    }

    void NanToNumOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<NanToNumAttr *>(op_attr.get());
        ret = aclnnNanToNumGetWorkspaceSize(inputTensors[0], attr->nan, attr->posinf, attr->neginf, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnNanToNum(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnNanToNum failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

}
@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class NanToNumOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        NanToNumOpRunner();
    };

}
@ -0,0 +1,184 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def norms_cmd(name: str,
              inputs: list,
              output_dtypes: list = None,
              output_shapes: list = None,
              attr_code: str = "",
              attr_header: str = "",
              outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class BatchNormACL(jt.Function):

    def __init__(self,
                 num_features,
                 eps=1e-05,
                 momentum=0.1,
                 affine=True,
                 is_train=True,
                 sync=True):
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum
        self.affine = affine
        self.is_train = is_train
        self.sync = sync
        self.weight = jt.init.constant(
            (num_features, ), "float32", 1.0) if affine else 1.0
        self.bias = jt.init.constant(
            (num_features, ), "float32", 0.0) if affine else 0.0
        self.running_mean = jt.init.constant((num_features, ), "float32",
                                             0.0).stop_grad()
        self.running_var = jt.init.constant((num_features, ), "float32",
                                            1.0).stop_grad()

    def execute(self, x):
        self.input = x.float32()
        inputs = [
            self.input, self.weight, self.bias, self.running_mean,
            self.running_var
        ]
        # aclnnBatchNorm also emits the saved mean/invstd needed for backward.
        outputs = [
            jt.empty(x.shape),
            jt.empty(self.num_features),
            jt.empty(self.num_features)
        ]
        attr_code = f"""
        op.jt_name = "batchnorm";
        BatchNormAttr *attr = new BatchNormAttr();
        attr->is_train = {"true" if self.is_train else "false"};
        attr->momentum = {self.momentum};
        attr->eps = {self.eps};
        op.op_attr.reset(attr);
        """
        result = norms_cmd("BatchNorm",
                           inputs=inputs,
                           outputs=outputs,
                           attr_code=attr_code)
        self.output = result[0]
        self.saveMean = result[1]
        self.saveInvstd = result[2]
        return self.output

    def grad(self, grad_output):
        attr_code = f"""
        op.jt_name = "batchnorm";
        BatchNormAttr *attr = new BatchNormAttr();
        attr->is_train = {"true" if self.is_train else "false"};
        attr->momentum = {self.momentum};
        attr->eps = {self.eps};
        op.op_attr.reset(attr);
        """
        inputs = [
            grad_output, self.input, self.weight, self.running_mean,
            self.running_var, self.saveMean, self.saveInvstd
        ]
        outputs = [
            jt.empty(self.input.shape),
            jt.empty(self.num_features),
            jt.empty(self.num_features)
        ]
        grad_input = norms_cmd("BatchNormBackward",
                               inputs=inputs,
                               outputs=outputs,
                               attr_code=attr_code)[0]
        return grad_input


class LayerNormACL(jt.Function):

    def __init__(self,
                 normalized_shape,
                 eps: float = 1e-5,
                 elementwise_affine: bool = True):
        if isinstance(normalized_shape, int):
            normalized_shape = (normalized_shape, )
        self.normalized_shape = tuple(normalized_shape)
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        self.weight = jt.init.constant(normalized_shape, "float32",
                                       1.0) if elementwise_affine else 1.0
        self.bias = jt.init.constant(normalized_shape, "float32",
                                     0.0) if elementwise_affine else 0.0

    def execute(self, x):
        self.input = x.float32()
        inputs = [self.input, self.weight, self.bias]
        outputs = [jt.empty(x.shape), jt.empty(x.shape), jt.empty(x.shape)]
        attr_code = f"""
        op.jt_name = "layernorm";
        LayerNormAttr *attr = new LayerNormAttr();
        attr->eps = {self.eps};
        attr->normalizedShape = {{{', '.join(map(str, (list(self.normalized_shape))))}}};
        attr->size = {x.shape[-1]};
        op.op_attr.reset(attr);
        """
        result = norms_cmd("LayerNorm",
                           inputs=inputs,
                           outputs=outputs,
                           attr_code=attr_code)
        self.output = result[0]
        self.meanout = result[1]
        self.rstdout = result[2]
        return self.output

    def grad(self, grad_output):
        # NOTE: this backward is copied from the batchnorm path: it builds a
        # BatchNormAttr, reads attributes (is_train, momentum, running stats,
        # num_features) that LayerNormACL never defines, and dispatches to
        # "SoftmaxBackward"; a dedicated layernorm backward runner is still
        # missing.
        attr_code = f"""
        op.jt_name = "batchnorm";
        BatchNormAttr *attr = new BatchNormAttr();
        attr->is_train = {"true" if self.is_train else "false"};
        attr->momentum = {self.momentum};
        attr->eps = {self.eps};
        op.op_attr.reset(attr);
        """
        inputs = [
            grad_output, self.input, self.weight, self.running_mean,
            self.running_var, self.saveMean, self.saveInvstd
        ]
        outputs = [
            jt.empty(self.input.shape),
            jt.empty(self.num_features),
            jt.empty(self.num_features)
        ]
        grad_input = norms_cmd("SoftmaxBackward",
                               inputs=inputs,
                               outputs=outputs,
                               attr_code=attr_code)[0]
        return grad_input
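A usage sketch for the two Functions above (assuming they are exported as-is); note that only BatchNormACL currently has a matching backward runner.

import jittor as jt

bn = BatchNormACL(num_features=16)
y = bn(jt.rand(8, 16, 4, 4))           # also stores saveMean/saveInvstd for grad

ln = LayerNormACL(normalized_shape=16)
z = ln(jt.rand(8, 10, 16))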
@ -0,0 +1,111 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "norms_op_acl.h"

namespace jittor
{
    BatchNormOpRunner::BatchNormOpRunner() : BaseOpRunner("BatchNorm")
    {
    }

    void BatchNormOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<BatchNormAttr *>(op_attr.get());
        ret = aclnnBatchNormGetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], inputTensors[3], inputTensors[4], attr->is_train, attr->momentum, attr->eps, outputTensors[0], outputTensors[1], outputTensors[2], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnBatchNorm(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnBatchNorm failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

    BatchNormBackwardOpRunner::BatchNormBackwardOpRunner() : BaseOpRunner("BatchNormBackward")
    {
    }

    void BatchNormBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<BatchNormAttr *>(op_attr.get());
        bool outputMask[3] = {true, true, true};
        aclBoolArray *outMask = aclCreateBoolArray(outputMask, 3);
        ret = aclnnBatchNormBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], inputTensors[3], inputTensors[4], inputTensors[5], inputTensors[6], attr->is_train, attr->eps, outMask, outputTensors[0], outputTensors[1], outputTensors[2], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnBatchNormBackward(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnBatchNormBackward failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

    LayerNormOpRunner::LayerNormOpRunner() : BaseOpRunner("LayerNorm")
    {
    }

    void LayerNormOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<LayerNormAttr *>(op_attr.get());
        aclIntArray *normalizedShape = nullptr;
        normalizedShape = aclCreateIntArray(attr->normalizedShape.data(), attr->size);
        ret = aclnnLayerNormGetWorkspaceSize(inputTensors[0], normalizedShape, inputTensors[1], inputTensors[2], attr->eps, outputTensors[0], outputTensors[1], outputTensors[2], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnLayerNorm(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnLayerNorm failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        aclDestroyIntArray(normalizedShape);

        return;
    }

}
@ -0,0 +1,34 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class BatchNormOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        BatchNormOpRunner();
    };

    class BatchNormBackwardOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        BatchNormBackwardOpRunner();
    };

    class LayerNormOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        LayerNormOpRunner();
    };

}
@ -0,0 +1,176 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def pool_cmd(name: str,
             inputs: list,
             output_dtypes: list = None,
             output_shapes: list = None,
             attr_code: str = "",
             attr_header: str = "",
             outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class PoolACL(jt.Function):

    def __init__(self,
                 kernel_size,
                 stride=None,
                 padding=0,
                 dilation=None,
                 return_indices=None,
                 ceil_mode=False,
                 count_include_pad=True,
                 op='maximum'):
        self.kernel_size = kernel_size if isinstance(
            kernel_size, tuple) else (kernel_size, kernel_size)
        stride = stride if stride else kernel_size
        self.stride = stride if isinstance(stride, tuple) else (stride, stride)
        self.padding = padding if isinstance(padding, tuple) else (padding,
                                                                   padding)
        dilation = dilation if dilation else 1
        assert dilation == 1
        self.dilation = dilation if isinstance(dilation, tuple) else (dilation,
                                                                      dilation)
        for item in self.kernel_size:
            if item <= 0:
                raise RuntimeError(
                    f"kernel_size must be greater than zero, but got {item}")
        for item in self.stride:
            if item <= 0:
                raise RuntimeError(
                    f"stride must be greater than zero, but got {item}")
        for item in self.padding:
            if item < 0:
                raise RuntimeError(
                    f"padding must be non-negative, but got {item}")
        self.op = op
        self.return_indices = return_indices
        self.ceil_mode = ceil_mode
        self.count_include_pad = count_include_pad

    def execute(self, input):
        self.input = input
        attr_code = f"""
        op.jt_name = "{"avgpool" if self.op == 'mean' else "maxpool"}";
        PoolAttr *attr = new PoolAttr();
        attr->kernel_size = {{ {self.kernel_size[0]}, {self.kernel_size[1]} }};
        attr->poolStrides = {{ {self.stride[0]}, {self.stride[1]} }};
        attr->poolPads = {{ {self.padding[0]}, {self.padding[1]} }};
        attr->poolDilations = {{ {self.dilation[0]}, {self.dilation[1]} }};
        attr->poolCeil = {"true" if self.ceil_mode else "false"};
        attr->countIncludePad = {"true" if self.count_include_pad else "false"};
        op.op_attr.reset(attr);
        """
        input_height, input_width = input.shape[-2:]
        kernel_height, kernel_width = self.kernel_size[-2:]

        output_height = (input_height + 2 * self.padding[0] -
                         (kernel_height - 1) - 1) // self.stride[0] + 1
        output_width = (input_width + 2 * self.padding[1] -
                        (kernel_width - 1) - 1) // self.stride[1] + 1

        output_shape = (input.shape[0], input.shape[1], output_height,
                        output_width)

        inputs = [input]

        if self.op == 'maximum':
            # Maxpool also produces the argmax indices, kept for backward.
            result = pool_cmd(
                "Maxpool",
                inputs,
                output_dtypes=[input.dtype, 'int32'],
                output_shapes=[output_shape, output_shape],
                attr_code=attr_code,
            )
        elif self.op == 'mean':
            result = pool_cmd(
                "Avgpool",
                inputs,
                output_dtypes=[input.dtype],
                output_shapes=[output_shape],
                attr_code=attr_code,
            )
        else:
            raise ValueError(f"unsupported pool op: {self.op}")

        if self.op == 'maximum':
            self.index = result[1]

        if self.return_indices:
            return result[0], result[1]
        else:
            return result[0]

    def grad(self, grad_output):
        input = self.input
        attr_code = f"""
        op.jt_name = "{"avgpoolbackward" if self.op == 'mean' else "maxpoolbackward"}";
        PoolAttr *attr = new PoolAttr();
        attr->kernel_size = {{ {self.kernel_size[0]}, {self.kernel_size[1]} }};
        attr->poolStrides = {{ {self.stride[0]}, {self.stride[1]} }};
        attr->poolPads = {{ {self.padding[0]}, {self.padding[1]} }};
        attr->poolDilations = {{ {self.dilation[0]}, {self.dilation[1]} }};
        attr->poolCeil = {"true" if self.ceil_mode else "false"};
        attr->countIncludePad = {"true" if self.count_include_pad else "false"};
        op.op_attr.reset(attr);
        """
        output_shapes = [input.shape]
        output_dtypes = [input.dtype]
        if self.op == 'maximum':
            result = pool_cmd("MaxpoolBackward",
                              inputs=[grad_output, input, self.index],
                              output_dtypes=output_dtypes,
                              output_shapes=output_shapes,
                              attr_code=attr_code)[0]
        elif self.op == 'mean':
            result = pool_cmd("AvgpoolBackward",
                              inputs=[grad_output, input],
                              output_dtypes=output_dtypes,
                              output_shapes=output_shapes,
                              attr_code=attr_code)[0]
        else:
            raise ValueError(f"unsupported pool op: {self.op}")
        return result
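A usage sketch (assuming PoolACL is exported as-is); the max-pool path keeps the argmax indices so the backward can route gradients.

import jittor as jt

x = jt.rand(2, 8, 32, 32)
y, idx = PoolACL(3, stride=2, padding=1, op='maximum', return_indices=True)(x)
avg = PoolACL(2, op='mean')(x)   # stride defaults to kernel_size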
@ -0,0 +1,187 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "pool_op_acl.h"

namespace jittor
{
    MaxpoolOpRunner::MaxpoolOpRunner() : BaseOpRunner("Maxpool")
    {
        use_nchw = true;
    }

    void MaxpoolOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        aclIntArray *strides = nullptr;
        aclIntArray *pads = nullptr;
        aclIntArray *dilations = nullptr;
        aclIntArray *kernel_size = nullptr;

        auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
        kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
        strides = aclCreateIntArray(attr->poolStrides.data(), 2);
        pads = aclCreateIntArray(attr->poolPads.data(), 2);
        dilations = aclCreateIntArray(attr->poolDilations.data(), 2);
        ret = aclnnMaxPool2dWithIndicesGetWorkspaceSize(inputTensors[0], kernel_size, strides, pads, dilations, attr->poolCeil, outputTensors[0], outputTensors[1], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnMaxPool2dWithIndices(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMaxPool2dWithIndices failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyIntArray(strides);
        aclDestroyIntArray(pads);
        aclDestroyIntArray(dilations);
        aclDestroyIntArray(kernel_size);

        return;
    }

    AvgpoolOpRunner::AvgpoolOpRunner() : BaseOpRunner("Avgpool")
    {
        use_nchw = true;
    }

    void AvgpoolOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        aclIntArray *strides = nullptr;
        aclIntArray *pads = nullptr;
        aclIntArray *kernel_size = nullptr;

        auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
        kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
        strides = aclCreateIntArray(attr->poolStrides.data(), 2);
        pads = aclCreateIntArray(attr->poolPads.data(), 2);
        ret = aclnnAvgPool2dGetWorkspaceSize(inputTensors[0], kernel_size, strides, pads, attr->poolCeil, attr->countIncludePad, attr->divisorOverride, attr->divisorOverride, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnAvgPool2d(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnAvgPool2d failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyIntArray(strides);
        aclDestroyIntArray(pads);
        aclDestroyIntArray(kernel_size);

        return;
    }

    MaxpoolBackwardOpRunner::MaxpoolBackwardOpRunner() : BaseOpRunner("MaxpoolBackward")
    {
        use_nchw = true;
    }

    void MaxpoolBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        aclIntArray *strides = nullptr;
        aclIntArray *pads = nullptr;
        aclIntArray *dilations = nullptr;
        aclIntArray *kernel_size = nullptr;

        auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
        kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
        strides = aclCreateIntArray(attr->poolStrides.data(), 2);
        pads = aclCreateIntArray(attr->poolPads.data(), 2);
        dilations = aclCreateIntArray(attr->poolDilations.data(), 2);
        ret = aclnnMaxPool2dWithIndicesBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], kernel_size, strides, pads, dilations, attr->poolCeil, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnMaxPool2dWithIndicesBackward(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMaxPool2dWithIndicesBackward failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyIntArray(strides);
        aclDestroyIntArray(pads);
        aclDestroyIntArray(dilations);
        aclDestroyIntArray(kernel_size);

        return;
    }

    AvgpoolBackwardOpRunner::AvgpoolBackwardOpRunner() : BaseOpRunner("AvgpoolBackward")
    {
        use_nchw = true;
    }

    void AvgpoolBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        aclIntArray *strides = nullptr;
        aclIntArray *pads = nullptr;
        aclIntArray *kernel_size = nullptr;

        auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
        kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
        strides = aclCreateIntArray(attr->poolStrides.data(), 2);
        pads = aclCreateIntArray(attr->poolPads.data(), 2);
        ret = aclnnAvgPool2dBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], kernel_size, strides, pads, attr->countIncludePad, attr->divisorOverride, attr->divisorOverride, attr->poolCeil, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnAvgPool2dBackward(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnAvgPool2dBackward failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyIntArray(strides);
        aclDestroyIntArray(pads);
        aclDestroyIntArray(kernel_size);

        return;
    }

}
@ -0,0 +1,46 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class MaxpoolOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        MaxpoolOpRunner();
    };

    class AvgpoolOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        AvgpoolOpRunner();
    };

    class MaxpoolBackwardOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        MaxpoolBackwardOpRunner();
    };

    class AvgpoolBackwardOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        AvgpoolBackwardOpRunner();
    };
}
@ -0,0 +1,82 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "random_op_acl.h"

namespace jittor
{
    RandomOpRunner::RandomOpRunner() : BaseOpRunner("RandomUniform")
    {
        name = "RandomUniform";
    }

    RandomOpRunner::RandomOpRunner(const string &_name) : BaseOpRunner(_name)
    {
        name = _name;
    }

    void RandomOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<RandomAttr *>(op_attr.get());
        if (name == "RandomUniform")
        {
            ret = aclnnInplaceUniformGetWorkspaceSize(outputTensors[0], 0.0, 1.0, attr->seed, attr->offset, &workspaceSize, &executor);
            checkRet(ret);

            if (workspaceSize > 0)
            {
                mallocWorkSpace(workspaceSize);
            }

            ret = aclnnInplaceUniform(workspaceAddr, workspaceSize, executor, aclstream);
            CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnInplaceUniform failed. ERROR: %d\n", name.c_str(), ret); return);
        }
        else if (name == "RandomNormal")
        {
            ret = aclnnInplaceNormalGetWorkspaceSize(outputTensors[0], 0.0, 1.0, attr->seed, attr->offset, &workspaceSize, &executor);
            checkRet(ret);

            if (workspaceSize > 0)
            {
                mallocWorkSpace(workspaceSize);
            }

            ret = aclnnInplaceNormal(workspaceAddr, workspaceSize, executor, aclstream);
            CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnInplaceNormal failed. ERROR: %d\n", name.c_str(), ret); return);
        }
        else
        {
            LOGf << "Not supported random type : " << name;
        }
        syncRun();
        return;
    }
}
@ -0,0 +1,18 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class RandomOpRunner : public BaseOpRunner
    {
    protected:
        string name; // special to random op
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        RandomOpRunner();
        RandomOpRunner(const string &name);
    };
}
@ -0,0 +1,127 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "reduce_op_acl.h"

namespace jittor
{
    ReduceOpRunner::ReduceOpRunner() : BaseOpRunner("reduce")
    {
        use_nchw = false;
    }

    void ReduceOpRunner::setupOutputDesc()
    {
        auto output_num = out_.size();

        for (int output_idx = 0; output_idx < output_num; output_idx++)
        {
            std::vector<int64_t> shape;
            for (int j = 0; j < out_[output_idx]->shape.size(); j++)
            {
                shape.push_back(out_[output_idx]->shape[j]);
            }
            outputShapes.push_back(shape);
        }

        attr = dynamic_cast<ReduceAttr *>(op_attr.get());
        dim = aclCreateIntArray(attr->axes.data(), attr->axes.size());
        keepdims = attr->keepdims;

        if (op_idx < 13)
        {
            if (attr->axes.size() == in_[0]->shape.size())
                outputShapes[0] = {};
        }

        for (int idx = 0; idx < output_num; idx++)
        {
            outputTensors.push_back(nullptr);
            auto ret = CreateAclTensor(outputShapes[idx], out_[idx]->mem_ptr, out_[idx]->size, get_dtype(out_[idx]->dtype()), &outputTensors[idx], use_nchw);
            CHECK_RET(ret == ACL_SUCCESS, return);
        }
    }

    void ReduceOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        switch (op_idx)
        {
        case 9:
        {
            ret = aclnnReduceSumGetWorkspaceSize(inputTensors[0], dim, keepdims, get_dtype(out_[0]->dtype()), outputTensors[0], &workspaceSize, &executor);
            CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnReduceSumGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
            if (workspaceSize > 0)
            {
                mallocWorkSpace(workspaceSize);
            }
            ret = aclnnReduceSum(workspaceAddr, workspaceSize, executor, aclstream);
            break;
        }
        case 10:
        {
            ret = aclnnMeanGetWorkspaceSize(inputTensors[0], dim, keepdims, get_dtype(out_[0]->dtype()), outputTensors[0], &workspaceSize, &executor);
            CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMeanGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
            if (workspaceSize > 0)
            {
                mallocWorkSpace(workspaceSize);
            }
            ret = aclnnMean(workspaceAddr, workspaceSize, executor, aclstream);
            break;
        }
        case 11:
        {
            ret = aclnnAmaxGetWorkspaceSize(inputTensors[0], dim, keepdims, outputTensors[0], &workspaceSize, &executor);
            CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnAmaxGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
            if (workspaceSize > 0)
            {
                mallocWorkSpace(workspaceSize);
            }
            ret = aclnnAmax(workspaceAddr, workspaceSize, executor, aclstream);
            break;
        }
        case 12:
        {
            ret = aclnnAminGetWorkspaceSize(inputTensors[0], dim, keepdims, outputTensors[0], &workspaceSize, &executor);
            CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnAminGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
            if (workspaceSize > 0)
            {
                mallocWorkSpace(workspaceSize);
            }
            ret = aclnnAmin(workspaceAddr, workspaceSize, executor, aclstream);
            break;
        }
        default:
        {
            LOGir << "no such reduce!!";
            exit(-1);
        }
        }
        syncRun();
        return;
    }
}
@ -0,0 +1,21 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    struct ReduceOpRunner : public BaseOpRunner
    {
        int op_idx; // selects the reduction kind (see executeOp: 9 sum, 10 mean, 11 amax, 12 amin)

        ReduceOpRunner();

    protected:
        ReduceAttr *attr;
        aclIntArray *dim;
        bool keepdims;

        void setupOutputDesc() override;
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
    };
}
@ -0,0 +1,115 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def relu_cmd(name: str,
             inputs: list,
             output_dtypes: list = None,
             output_shapes: list = None,
             attr_code: str = "",
             attr_header: str = "",
             outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class ReLUACL(jt.Function):

    def __init__(self):
        super(ReLUACL, self).__init__()

    def execute(self, x):
        x = x.float32()
        self.input = x
        result = relu_cmd("Unary", [x],
                          output_dtypes=[x.dtype],
                          output_shapes=[x.shape],
                          attr_code="op.name=\"ReLU\";")[0]
        return result

    def grad(self, grad_output):
        # relu'(x) = 1 for x > 0 and 0 otherwise: build the mask with a
        # Greater op, then gate the incoming gradient with an elementwise Mul.
        mask = relu_cmd("Binary",
                        [self.input, jt.zeros(self.input.shape)],
                        output_dtypes=[self.input.dtype],
                        output_shapes=[self.input.shape],
                        attr_code="op.name=\"Greater\";")[0]
        grad_input = relu_cmd("Binary", [grad_output, mask],
                              output_dtypes=[grad_output.dtype],
                              output_shapes=[grad_output.shape],
                              attr_code="op.name=\"Mul\";")[0]
        return grad_input


class LeakyReLUACL(jt.Function):

    def __init__(self):
        super(LeakyReLUACL, self).__init__()

    def execute(self, x, negative_slope=0.01):
        x = x.float32()
        self.input = x
        attr_code = f"""
        op.jt_name = "leakyrelu";
        LeakyReluAttr *attr = new LeakyReluAttr();
        attr->negativeSlope = {negative_slope};
        op.op_attr.reset(attr);
        """
        result = relu_cmd("LeakyReLU", [x],
                          output_dtypes=[x.dtype],
                          output_shapes=[x.shape],
                          attr_code=attr_code)[0]
        self.negative_slope = negative_slope
        return result

    def grad(self, grad_output):
        attr_code = f"""
        op.jt_name = "leakyrelubackward";
        LeakyReluAttr *attr = new LeakyReluAttr();
        attr->negativeSlope = {self.negative_slope};
        attr->selfIsResult = false;
        op.op_attr.reset(attr);
        """
        grad_input = relu_cmd("LeakyReLUBackward", [grad_output, self.input],
                              output_dtypes=[grad_output.dtype],
                              output_shapes=[grad_output.shape],
                              attr_code=attr_code)[0]
        return grad_input
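
# A minimal usage sketch (illustration only, not part of the original file):
# both wrappers behave like ordinary jt.Function activations, so forward and
# backward run through the ACL runners defined above. Assumes an Ascend
# device with the ACL extension enabled.
def _relu_acl_demo():
    x = jt.randn(4, 8)
    y = ReLUACL()(x)                 # forward via the "Unary" ReLU runner
    g = jt.grad(y.sum(), x)          # backward via the Greater + Mul runners
    z = LeakyReLUACL()(x, 0.2)       # forward via aclnnLeakyRelu
    return y, g, z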
@ -0,0 +1,90 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "relu_op_acl.h"

namespace jittor
{
    LeakyReLUOpRunner::LeakyReLUOpRunner() : BaseOpRunner("LeakyReLU")
    {
    }

    void LeakyReLUOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        aclScalar *negativeSlope = nullptr;

        auto attr = dynamic_cast<LeakyReluAttr *>(op_attr.get());
        negativeSlope = aclCreateScalar(&attr->negativeSlope, aclDataType::ACL_FLOAT);
        ret = aclnnLeakyReluGetWorkspaceSize(inputTensors[0], negativeSlope, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnLeakyRelu(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnLeakyRelu failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyScalar(negativeSlope);
        return;
    }

    LeakyReLUBackwardOpRunner::LeakyReLUBackwardOpRunner() : BaseOpRunner("LeakyReLUBackward")
    {
    }

    void LeakyReLUBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        aclScalar *negativeSlope = nullptr;

        auto attr = dynamic_cast<LeakyReluAttr *>(op_attr.get());
        negativeSlope = aclCreateScalar(&attr->negativeSlope, aclDataType::ACL_FLOAT);
        ret = aclnnLeakyReluBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], negativeSlope, attr->selfIsResult, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnLeakyReluBackward(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnLeakyReluBackward failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyScalar(negativeSlope);
        return;
    }

}
@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class LeakyReLUOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        LeakyReLUOpRunner();
    };

    class LeakyReLUBackwardOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        LeakyReLUBackwardOpRunner();
    };

}
@ -0,0 +1,84 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def rope_cmd(name: str,
             inputs: list,
             output_dtypes: list = None,
             output_shapes: list = None,
             attr_code: str = "",
             attr_header: str = "",
             outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class RopeACL(jt.Function):

    def __init__(self):
        super(RopeACL, self).__init__()

    def execute(self, xq, xk, freqs_cis, freq_cos, freq_sin):
        attr_code = f"""
        op.jt_name = "RotaryPosEmb";
        """
        # The cos/sin tables may arrive packed in freqs_cis or as two tensors.
        if freqs_cis is not None:
            freq_cos = freqs_cis[..., 0]
            freq_sin = freqs_cis[..., 1]
        else:
            assert freq_cos is not None and freq_sin is not None
        inputs = [xq, xk, freq_cos, freq_sin]
        results = rope_cmd("RotaryPosEmb",
                           inputs,
                           output_dtypes=[
                               xq.dtype,
                           ],
                           output_shapes=[
                               xq.shape,
                           ],
                           attr_code=attr_code)
        results[0].sync()
        # The underlying op rotates query and key in place, so the (mutated)
        # inputs are the real outputs here.
        return inputs[0], inputs[1]

    def grad(self, grad_output):
        # Backward treats the rotation as a pass-through: the incoming
        # gradient is returned unchanged.
        return grad_output
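
# A minimal usage sketch (illustration only; the shapes below are assumptions,
# not taken from the original file, and must match what aclnnApplyRotaryPosEmb
# expects on the target device):
def _rope_acl_demo():
    b, s, h, d = 1, 16, 4, 32
    xq = jt.randn(b, s, h, d)
    xk = jt.randn(b, s, h, d)
    freq_cos = jt.randn(s, d)
    freq_sin = jt.randn(s, d)
    xq2, xk2 = RopeACL()(xq, xk, None, freq_cos, freq_sin)
    return xq2, xk2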
@ -0,0 +1,57 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "rope_op_acl.h"

namespace jittor
{
    RotaryPosEmbOpRunner::RotaryPosEmbOpRunner() : BaseOpRunner("RotaryPosEmb")
    {
    }

    // Rotates query (in0) and key (in1) in place using the cos/sin tables (in2/in3).
    void RotaryPosEmbOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        ret = aclnnApplyRotaryPosEmbGetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], inputTensors[3], (int64_t)1, &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnApplyRotaryPosEmb(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnApplyRotaryPosEmb failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

}
@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class RotaryPosEmbOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        RotaryPosEmbOpRunner();
    };

}
@ -0,0 +1,356 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def setitem_cmd(name: str,
                inputs: list,
                output_dtypes: list = None,
                output_shapes: list = None,
                attr_code: str = "",
                attr_header: str = "",
                outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


def setitem_forward(name: str,
                    inputs: list,
                    output_dtypes: list = None,
                    output_shapes: list = None,
                    attr_code: str = "",
                    attr_header: str = "",
                    outputs: list = None,
                    extra_data: dict = {}):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    op.add(out0, false);
    {attr_code}
    op.run();""",
                   data=extra_data)


def caculate_shape(tensors):
    # Recursively infer the shape of a Var / scalar / nested list structure.
    if isinstance(tensors, jt.Var):
        return tensors.shape
    elif isinstance(tensors, (int, float)):
        return []
    elif isinstance(tensors, (list, tuple)):
        sub_shape = caculate_shape(tensors[0])
        # list(...) so jt shapes and plain lists concatenate uniformly
        return [len(tensors)] + list(sub_shape)
    else:
        assert False, f"not implemented for {type(tensors)}"


def can_broadcast_and_shape(shape1, shape2):
    """
    Check whether two shapes are broadcast-compatible and compute the
    broadcast shape.

    Args:
    - shape1: shape of the first tensor (tuple or list)
    - shape2: shape of the second tensor (tuple or list)

    Returns:
    - can_broadcast: bool, whether the two shapes can be broadcast together
    - broadcast_shape: the broadcast shape if compatible, otherwise None
    """
    # Normalize to tuples in case lists were passed in.
    shape1 = tuple(shape1)
    shape2 = tuple(shape2)

    # Left-pad the shorter shape with 1s so both have the same rank.
    len1, len2 = len(shape1), len(shape2)
    if len1 < len2:
        shape1 = (1, ) * (len2 - len1) + shape1
    elif len2 < len1:
        shape2 = (1, ) * (len1 - len2) + shape2

    broadcast_shape = []

    # Compare the aligned dimensions pairwise.
    for dim1, dim2 in zip(shape1, shape2):
        if dim1 == dim2:
            broadcast_shape.append(dim1)
        elif dim1 == 1:
            broadcast_shape.append(dim2)
        elif dim2 == 1:
            broadcast_shape.append(dim1)
        else:
            # Incompatible on this dimension: the shapes cannot be broadcast.
            return False, None

    return True, tuple(broadcast_shape)
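
# A short worked example (illustration only): NumPy-style broadcasting pads
# the shorter shape with leading 1s, then each axis must match or be 1.
assert can_broadcast_and_shape((4, 1, 3), (5, 3)) == (True, (4, 5, 3))
assert can_broadcast_and_shape((4, 2), (3, )) == (False, None)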


class SetItemACL(jt.Function):

    def __init__(self):
        self.type_ = 'notype'
        self.value_var = True

    def stride(self, x, dim):
        # Row-major stride of `x` along `dim`.
        stride = 1
        for i in range(dim + 1, len(x.shape)):
            stride *= x.shape[i]
        return stride

    def execute(self, x, slices, value):
        self.x_shape = x.shape
        self.input_slice = slices
        if not isinstance(value, jt.Var):
            self.value_var = False
        if isinstance(slices, jt.Var):
            if slices.dtype == "bool":
                # Boolean-mask assignment: scatter `value` into the masked slots.
                slices_len = slices.sum().item()
                if slices_len == 0:
                    return x
                if isinstance(value, int) or isinstance(value, float):
                    value = jt.full((slices_len, ), value, dtype=x.dtype)
                assert slices.shape == x.shape, "setitem shape not match"
                assert len(value.shape) == 1, "value shape must be 1D"
                assert value.shape[
                    0] == slices_len, "value shape length must be equal to slices sum"
                self.type_ = 'mask'
                self.value_shape = value.shape
                inputs = [slices, value]
                outputs = [x.clone()]
                attr_code = f"""
                op.jt_name = "inplacemaskedscatter";
                """
                result = setitem_cmd("InplaceMaskedScatter",
                                     inputs=inputs,
                                     outputs=outputs,
                                     attr_code=attr_code)[0]
                return result

        if not isinstance(slices, tuple):
            slices = (slices, )
        slices = list(slices)
        for i, s in enumerate(slices):
            if isinstance(s, int) and s < 0:
                slices[i] = x.shape[i] + s
        slices = tuple(slices)
        slices_list = list(slices)
        # Check whether the subscript contains a slice / Ellipsis component.
        contains_slice = False
        for s in slices:
            if not isinstance(s, jt.Var) and (isinstance(s, slice)
                                              or s == Ellipsis):
                contains_slice = True
                break
        if not contains_slice:
            # Pure index assignment (ints / Vars / index lists): lower to
            # IndexPutImpl with broadcast index tensors.
            indices = []
            value_shape = []
            slices_len = len(slices)
            broadcast_shape = caculate_shape(slices_list[0])
            for ii in range(1, len(slices)):
                ok, broadcast_shape = can_broadcast_and_shape(
                    broadcast_shape, caculate_shape(slices_list[ii]))
                assert ok is True, "can not broadcast"
            value_shape = list(broadcast_shape) + list(x.shape[slices_len:])
            if value_shape == []:
                value_shape = [1]
            if isinstance(value, int) or isinstance(value, float):
                value = jt.full(value_shape, value)
            self.value_shape = value_shape
            for ii in slices:
                indices.append(jt.Var(ii).int32())
            if isinstance(slices[0],
                          jt.Var) or isinstance(slices[0], int) or isinstance(
                              slices[0], list) or isinstance(slices[0], tuple):
                self.indices = indices
                self.type_ = 'index'
                attr_code = f"""
                op.jt_name = "indexputimpl";
                """
                inputs = [value] + indices
                outputs = [x.clone()]
                result = setitem_cmd("IndexPutImpl",
                                     inputs=inputs,
                                     outputs=outputs,
                                     attr_code=attr_code)[0]
                return result
            assert False, "not supported"
        assert contains_slice, "slice type error"
        x_dim = len(x.shape)
        slices = list(slices)
        # Expand a single Ellipsis into the equivalent run of full slices.
        for s in slices:
            if not isinstance(s, jt.Var) and s == Ellipsis:
                slices = slices[:slices.index(s)] + [
                    slice(None, None, None)
                ] * (x_dim - len(slices) + 1) + slices[slices.index(s) + 1:]
                break
        slices = tuple(slices)
        self.input_slice = slices
        if len(slices) < x_dim:
            slices += (slice(None, None, None), ) * (x_dim - len(slices))
        sizes = []
        # Work around a CANN restriction: the step of the last sliced
        # dimension must be 1, so append a dummy full-slice axis when needed.
        expand_dim = False
        if isinstance(slices[-1], slice):
            if slices[-1].step is not None and slices[-1].step != 1:
                slices = slices + (slice(None, None, None), )
                expand_dim = True

        elif isinstance(slices[-1], int):
            # The last component is an integer index; append a dummy axis too.
            slices = slices + (slice(None, None, None), )
            expand_dim = True
        else:
            assert False, "not supported"
        x_shape = list(x.shape)
        if expand_dim:
            x_shape.append(1)
            x = x.unsqueeze(-1)
            value = value.unsqueeze(-1)

        squeeze_dims = []
        if isinstance(value, jt.Var):
            for dim, s in enumerate(slices):
                if isinstance(s, int):
                    s = slice(s, s + 1, 1)
                    squeeze_dims.append(dim)

            for dim in squeeze_dims:
                value = value.unsqueeze(dim)

        # Pack each dimension's (start, stop, step) into extra_data so the
        # C++ side can rebuild the strided-slice attributes from `data`.
        extra_data = {}
        if len(slices):
            extra_data["a"] = len(slices)
            for dim, s in enumerate(slices):
                if isinstance(s, int):
                    s = slice(s, s + 1, 1)
                if isinstance(s, jt.Var):
                    assert False, "jt.Var not supported"
                start, stop, step = s.indices(x_shape[dim])
                size = (stop - start - 1) // step + 1
                sizes.append(size)
                extra_data[str(dim * 3)] = start
                extra_data[str(dim * 3 + 1)] = stop
                extra_data[str(dim * 3 + 2)] = step
        else:
            extra_data["a"] = -1
            sizes = [1]
            steps = [1]
        if isinstance(value, int) or isinstance(value, float):
            value = jt.full(sizes, value)
        self.type_ = 'slicev2'
        attr_code = """
        op.jt_name = "stridedsliceassignv2";
        StrideAttr *attr = new StrideAttr();
        int slice_dim = data["a"];

        if(slice_dim == -1) {
            attr->begins = {};
            attr->ends = {};
            attr->steps = {1};
            attr->axes = {};
        } else {
            vector<long int> begins;
            vector<long int> ends;
            vector<long int> steps;
            vector<long int> dims;
            for(int dim = 0; dim < slice_dim; dim++) {
                dims.push_back(dim);
                begins.push_back(data[std::to_string(dim*3)]);
                ends.push_back(data[std::to_string(dim*3+1)]);
                steps.push_back(data[std::to_string(dim*3+2)]);
            }
            attr->begins = begins;
            attr->ends = ends;
            attr->steps = steps;
            attr->axes = dims;
        }
        op.op_attr.reset(attr);
        """
        self.value_shape = value.shape
        inputs = [value]
        outputs = [x.clone()]
        result = setitem_forward("StridedSliceAssignV2",
                                 inputs=inputs,
                                 outputs=outputs,
                                 attr_code=attr_code,
                                 extra_data=extra_data)[0]
        if expand_dim:
            result = result.squeeze(-1)
        return result

    def grad(self, grad_output):
        # d(out)/d(value) is the gradient restricted to the assigned region;
        # d(out)/d(x) is the gradient with that region zeroed out.
        value_grad = None
        if self.value_var:
            value_grad = grad_output[self.input_slice]
            grad_output[self.input_slice] = jt.zeros(self.value_shape)
        return grad_output, None, value_grad
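
# A minimal usage sketch (illustration only): the three supported subscript
# forms map to the three runners above (strided slice assign, masked scatter,
# index put). Assumes an Ascend device with the ACL extension enabled.
def _setitem_acl_demo():
    x = jt.zeros((4, 5))
    x = SetItemACL()(x, (slice(0, 4, 2), slice(1, 3, 1)), 1.0)  # slicev2 path
    mask = x > 0.5
    n = int(mask.sum().item())
    x = SetItemACL()(x, mask, jt.ones((n, )))                   # mask path
    x = SetItemACL()(x, ([0, 1], ), jt.ones((2, 5)))            # index path
    return x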
@ -0,0 +1,84 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "setitem_op_acl.h"

namespace jittor
{
    InplaceMaskedScatterOpRunner::InplaceMaskedScatterOpRunner() : BaseOpRunner("InplaceMaskedScatter")
    {
    }

    void InplaceMaskedScatterOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        // out0 is x.clone(); in0 is the bool mask and in1 the packed values.
        ret = aclnnInplaceMaskedScatterGetWorkspaceSize(outputTensors[0], inputTensors[0], inputTensors[1], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnInplaceMaskedScatter(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnInplaceMaskedScatter failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

    IndexPutImplOpRunner::IndexPutImplOpRunner() : BaseOpRunner("IndexPutImpl")
    {
    }

    void IndexPutImplOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        // in0 is the value tensor; in1..inN are the index tensors.
        auto input_num = in_.size();
        std::vector<aclTensor *> indexTensorList = {};
        for (int i = 1; i < input_num; i++)
        {
            indexTensorList.push_back(inputTensors[i]);
        }
        auto indexTensorListInput = aclCreateTensorList(&indexTensorList[0], input_num - 1);
        ret = aclnnIndexPutImplGetWorkspaceSize(outputTensors[0], indexTensorListInput, inputTensors[0], false, true, &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnIndexPutImpl(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnIndexPutImpl failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

}
@ -0,0 +1,26 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class InplaceMaskedScatterOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        InplaceMaskedScatterOpRunner();
    };

    class IndexPutImplOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        IndexPutImplOpRunner();
    };
}
@ -0,0 +1,85 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def sigmoid_cmd(name: str,
                inputs: list,
                output_dtypes: list = None,
                output_shapes: list = None,
                attr_code: str = "",
                attr_header: str = "",
                outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class SigmoidACL(jt.Function):

    def __init__(self):
        super(SigmoidACL, self).__init__()

    def execute(self, x):
        x = x.float32()
        inputs = [x]
        outputs = [jt.empty(x.shape, x.dtype)]
        attr_code = f"""
        op.jt_name = "sigmoid";
        """
        result = sigmoid_cmd("Sigmoid",
                             inputs=inputs,
                             outputs=outputs,
                             attr_code=attr_code)[0]
        # Save the forward output: sigmoid'(x) = y * (1 - y) only needs y.
        self.output = result
        return result

    def grad(self, grad_output):
        attr_code = f"""
        op.jt_name = "sigmoidbackward";
        """
        inputs = [grad_output, self.output]
        outputs = [jt.empty(grad_output.shape, grad_output.dtype)]
        grad_input = sigmoid_cmd("SigmoidBackward",
                                 inputs=inputs,
                                 outputs=outputs,
                                 attr_code=attr_code)[0]
        return grad_input
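
# A minimal usage sketch (illustration only), assuming an Ascend device with
# the ACL extension enabled:
def _sigmoid_acl_demo():
    x = jt.randn(2, 3)
    y = SigmoidACL()(x)            # forward via aclnnSigmoid
    g = jt.grad(y.sum(), x)        # backward via aclnnSigmoidBackward
    return y, g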
@ -0,0 +1,80 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "sigmoid_op_acl.h"

namespace jittor
{
    SigmoidOpRunner::SigmoidOpRunner() : BaseOpRunner("Sigmoid")
    {
    }

    void SigmoidOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        ret = aclnnSigmoidGetWorkspaceSize(inputTensors[0], outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnSigmoid(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSigmoid failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

    SigmoidBackwardOpRunner::SigmoidBackwardOpRunner() : BaseOpRunner("SigmoidBackward")
    {
    }

    void SigmoidBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        ret = aclnnSigmoidBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnSigmoidBackward(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSigmoidBackward failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

}
@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class SigmoidOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        SigmoidOpRunner();
    };

    class SigmoidBackwardOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        SigmoidBackwardOpRunner();
    };

}
@ -0,0 +1,85 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def silu_cmd(name: str,
             inputs: list,
             output_dtypes: list = None,
             output_shapes: list = None,
             attr_code: str = "",
             attr_header: str = "",
             outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class SiLUACL(jt.Function):

    def __init__(self):
        super(SiLUACL, self).__init__()

    def execute(self, x):
        x = x.float32()
        inputs = [x]
        # Save the input: the SiLU backward op recomputes from x, not from y.
        self.input = x
        outputs = [jt.empty(x.shape, x.dtype)]
        attr_code = f"""
        op.jt_name = "silu";
        """
        result = silu_cmd("SiLU",
                          inputs=inputs,
                          outputs=outputs,
                          attr_code=attr_code)[0]
        return result

    def grad(self, grad_output):
        attr_code = f"""
        op.jt_name = "silubackward";
        """
        inputs = [grad_output, self.input]
        outputs = [jt.empty(grad_output.shape, grad_output.dtype)]
        grad_input = silu_cmd("SiLUBackward",
                              inputs=inputs,
                              outputs=outputs,
                              attr_code=attr_code)[0]
        return grad_input
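
# A minimal usage sketch (illustration only), assuming an Ascend device with
# the ACL extension enabled:
def _silu_acl_demo():
    x = jt.randn(2, 3)
    y = SiLUACL()(x)               # forward via aclnnSilu
    g = jt.grad(y.sum(), x)        # backward via aclnnSiluBackward
    return y, g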
@ -0,0 +1,80 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "silu_op_acl.h"

namespace jittor
{
    SiLUOpRunner::SiLUOpRunner() : BaseOpRunner("SiLU")
    {
    }

    void SiLUOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        ret = aclnnSiluGetWorkspaceSize(inputTensors[0], outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnSilu(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSilu failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

    SiLUBackwardOpRunner::SiLUBackwardOpRunner() : BaseOpRunner("SiLUBackward")
    {
    }

    void SiLUBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        ret = aclnnSiluBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnSiluBackward(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSiluBackward failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

}
@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class SiLUOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        SiLUOpRunner();
    };

    class SiLUBackwardOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        SiLUBackwardOpRunner();
    };

}
@ -0,0 +1,92 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def softmax_cmd(name: str,
                inputs: list,
                output_dtypes: list = None,
                output_shapes: list = None,
                attr_code: str = "",
                attr_header: str = "",
                outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class SoftmaxACL(jt.Function):

    def __init__(self):
        super(SoftmaxACL, self).__init__()

    def execute(self, x, dim):
        x = x.float32()
        inputs = [x]
        outputs = [jt.empty(x.shape)]
        self.dim = dim
        attr_code = f"""
        op.jt_name = "softmax";
        SoftmaxAttr *attr = new SoftmaxAttr();
        attr->dim = {dim};
        op.op_attr.reset(attr);
        """
        result = softmax_cmd("Softmax",
                             inputs=inputs,
                             outputs=outputs,
                             attr_code=attr_code)[0]
        # Save the forward output: the softmax backward op needs y, not x.
        self.output = result
        return result

    def grad(self, grad_output):
        attr_code = f"""
        op.jt_name = "softmax";
        SoftmaxAttr *attr = new SoftmaxAttr();
        attr->dim = {self.dim};
        op.op_attr.reset(attr);
        """
        inputs = [grad_output, self.output]
        outputs = [jt.empty(grad_output.shape)]
        grad_input = softmax_cmd("SoftmaxBackward",
                                 inputs=inputs,
                                 outputs=outputs,
                                 attr_code=attr_code)[0]
        return grad_input
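
# A minimal usage sketch (illustration only), assuming an Ascend device:
def _softmax_acl_demo():
    x = jt.randn(2, 5)
    y = SoftmaxACL()(x, 1)         # each row of y sums to 1 along axis 1
    g = jt.grad(y[:, 0].sum(), x)  # backward via aclnnSoftmaxBackward
    return y, g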
@ -0,0 +1,82 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "softmax_op_acl.h"

namespace jittor
{
    SoftmaxOpRunner::SoftmaxOpRunner() : BaseOpRunner("Softmax")
    {
    }

    void SoftmaxOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<SoftmaxAttr *>(op_attr.get());
        ret = aclnnSoftmaxGetWorkspaceSize(inputTensors[0], attr->dim, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnSoftmax(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSoftmax failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

    SoftmaxBackwardOpRunner::SoftmaxBackwardOpRunner() : BaseOpRunner("SoftmaxBackward")
    {
    }

    void SoftmaxBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<SoftmaxAttr *>(op_attr.get());
        ret = aclnnSoftmaxBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], attr->dim, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnSoftmaxBackward(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSoftmaxBackward failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

}
@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class SoftmaxOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        SoftmaxOpRunner();
    };

    class SoftmaxBackwardOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        SoftmaxBackwardOpRunner();
    };

}
@ -0,0 +1,115 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def stack_cmd(name: str,
              inputs: list,
              output_dtypes: list = None,
              output_shapes: list = None,
              attr_code: str = "",
              attr_header: str = "",
              outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class StackACL(jt.Function):

    def __init__(self):
        super(StackACL, self).__init__()

    def execute(self, input_tensors, dim):
        if type(input_tensors) is tuple:
            input_tensors = list(input_tensors)
        assert type(input_tensors) is list
        rank = len(input_tensors[0].shape)
        # The stacked result has rank + 1 axes, so dim lies in [-(rank+1), rank].
        assert -(rank + 1) <= dim <= rank
        if dim < 0:
            dim += rank + 1
        for i in range(len(input_tensors)):
            if input_tensors[i].dtype != input_tensors[0].dtype:
                raise ValueError("All input tensors must have the same dtype")
            if input_tensors[i].shape != input_tensors[0].shape:
                raise ValueError("All input tensors must have the same shape")
        self.input = input_tensors
        self.dim = dim  # saved for grad(): the gradient splits along this axis
        input_shape = list(input_tensors[0].shape)
        output_shape = input_shape[:dim] + [len(input_tensors)
                                            ] + input_shape[dim:]
        attr_code = f"""
        op.jt_name = "stack";
        ConcatAttr *attr = new ConcatAttr();
        attr->tensorNum = {len(input_tensors)};
        attr->dim = {dim};
        op.op_attr.reset(attr);
        """
        self.attr_code = attr_code
        result = stack_cmd("Stack",
                           input_tensors,
                           output_dtypes=[input_tensors[0].dtype],
                           output_shapes=[output_shape],
                           attr_code=self.attr_code)[0]
        return result

    def grad(self, grad_output):
        grad_inputs = self.split_grad(grad_output, self.input, self.dim)
        return grad_inputs

    def split_grad(self, grad_output, input_tensors, axis):
        # Stack inserted a new axis of length len(input_tensors) at `axis`,
        # so the gradient is split into chunks of size 1 there and squeezed
        # back to the original input shapes.
        offset = []
        shapeVec = []
        dtypeVec = []
        for tensor in input_tensors:
            offset.append(1)
            dtypeVec.append(tensor.dtype)
            shp = list(tensor.shape)
            shapeVec.append(shp[:axis] + [1] + shp[axis:])

        attr_code = f"""
        op.jt_name = "splitwithsize";
        auto *attr = new SplitWithSizeAttr();
        attr->splitSize = {{ {", ".join(map(str, offset))} }};
        attr->dim = {axis};
        op.op_attr.reset(attr);
        """

        result = stack_cmd("SplitWithSize", [grad_output],
                           output_dtypes=dtypeVec,
                           output_shapes=shapeVec,
                           attr_code=attr_code)
        return [r.squeeze(axis) for r in result]
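
# A minimal usage sketch (illustration only): stacking three equal-shape
# tensors along a new leading axis, assuming an Ascend device.
def _stack_acl_demo():
    ts = [jt.randn(2, 3) for _ in range(3)]
    y = StackACL()(ts, 0)          # y.shape == [3, 2, 3]
    return y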
@ -0,0 +1,65 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "stack_op_acl.h"

namespace jittor
{
    StackOpRunner::StackOpRunner() : BaseOpRunner("Stack")
    {
    }

    void StackOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto input_num = in_.size();
        std::vector<aclTensor *> stackTensorList = {};
        for (int i = 0; i < input_num; i++)
        {
            stackTensorList.push_back(inputTensors[i]);
        }
        auto stackTensorListInput = aclCreateTensorList(&stackTensorList[0], input_num);
        auto attr = dynamic_cast<ConcatAttr *>(op_attr.get());
        ret = aclnnStackGetWorkspaceSize(stackTensorListInput, attr->dim, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnStack(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnStack failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

}
@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class StackOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        StackOpRunner();
    };

}
@ -0,0 +1,54 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "ternary_op_acl.h"

namespace jittor
{
    TernaryOpRunner::TernaryOpRunner() : BaseOpRunner("ternary")
    {
        use_nchw = false;
    }

    void TernaryOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        // where(cond, a, b): in0 is the condition, in1/in2 the two branches.
        ret = aclnnSWhereGetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnSWhere(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSWhere failed. ERROR: %d\n", name.c_str(), ret); return);
        // syncRun();
        return;
    }
}
@ -0,0 +1,14 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    struct TernaryOpRunner : public BaseOpRunner
    {
        TernaryOpRunner();

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
    };
}
@ -0,0 +1,101 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def transpose_cmd(name: str,
                  inputs: list,
                  output_dtypes: list = None,
                  output_shapes: list = None,
                  attr_code: str = "",
                  attr_header: str = "",
                  outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
#include "acl/aclops/aclops.h"
'''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class TransPoseACL(jt.Function):

    def __init__(self):
        super(TransPoseACL, self).__init__()

    def execute(self, x, *dim):
        if len(dim) == 1 and isinstance(dim[0], Sequence):
            # a full permutation passed as a single sequence
            dim = dim[0]
        elif len(dim) == 2:
            # two axes passed separately: swap them
            axes = list(range(x.ndim))
            a, b = dim
            axes[a], axes[b] = axes[b], axes[a]
            dim = axes

        attr_code = f"""
        op.jt_name = "transpose";
        ReduceAttr *attr = new ReduceAttr();
        attr->axes = {{ {", ".join(map(str, dim))} }};
        op.op_attr.reset(attr);
        """
        # calculate output shape
        output_shape = [x.shape[i] for i in dim]
        output = transpose_cmd("Transpose", [x],
                               output_dtypes=[x.dtype],
                               output_shapes=[output_shape],
                               attr_code=attr_code)[0]
        self.dim = dim
        return output

    def grad(self, grad_output):
        # the backward of a permutation is its inverse permutation
        dim = list(range(grad_output.ndim))
        for i, p in enumerate(self.dim):
            dim[p] = i
        output_shape = [grad_output.shape[i] for i in dim]
        attr_code = f"""
        op.jt_name = "transpose";
        ReduceAttr *attr = new ReduceAttr();
        attr->axes = {{ {", ".join(map(str, dim))} }};
        op.op_attr.reset(attr);
        """
        output = transpose_cmd("Transpose", [grad_output],
                               output_dtypes=[grad_output.dtype],
                               output_shapes=[output_shape],
                               attr_code=attr_code)[0]
        return output
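A short usage sketch for TransPoseACL (assumes the ACL backend is active; shapes are illustrative):

import jittor as jt

x = jt.rand(2, 3, 4)
y = TransPoseACL()(x, 0, 2)        # swap axes 0 and 2  -> shape (4, 3, 2)
z = TransPoseACL()(x, (2, 0, 1))   # full permutation   -> shape (4, 2, 3)

Because grad() writes dim[p] = i for each i, p in enumerate(self.dim), the backward pass permutes with the inverse of the forward permutation, mapping gradients back to the input layout.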
@ -0,0 +1,66 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "transpose_op_acl.h"

namespace jittor
{
    TransposeOpRunner::TransposeOpRunner() : BaseOpRunner("Transpose")
    {
    }

    void TransposeOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        // the permutation axes travel in the ReduceAttr set up by the Python wrapper
        auto attr = dynamic_cast<ReduceAttr *>(op_attr.get());

        aclIntArray *dim = aclCreateIntArray(attr->axes.data(), attr->axes.size());

        ret = aclnnPermuteGetWorkspaceSize(inputTensors[0], dim, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnPermute(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnPermute failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyIntArray(dim);

        return;
    }

}
@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class TransposeOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        TransposeOpRunner();
    };

}
@ -0,0 +1,74 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable

def triu_cmd(name: str,
             inputs: list,
             output_dtypes: list = None,
             output_shapes: list = None,
             attr_code: str = "",
             attr_header: str = "",
             outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
#include "acl/aclops/aclops.h"
'''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")

class TriuACL(jt.Function):

    def __init__(self):
        super(TriuACL, self).__init__()

    def execute(self, input, diagonal):
        self.diagonal = diagonal
        attr_code = f"""
        op.jt_name = "triu";
        TriuAttr *attr = new TriuAttr();
        attr->diagonal = {diagonal};
        op.op_attr.reset(attr);
        """

        result = triu_cmd("Triu", [input],
                          output_dtypes=[input.dtype],
                          output_shapes=[input.shape],
                          attr_code=attr_code)[0]
        return result

    def grad(self, grad_output):
        # only the kept (upper-triangular) elements receive gradient,
        # so mask grad_output with the same triu
        attr_code = f"""
        op.jt_name = "triu";
        TriuAttr *attr = new TriuAttr();
        attr->diagonal = {self.diagonal};
        op.op_attr.reset(attr);
        """
        return triu_cmd("Triu", [grad_output],
                        output_dtypes=[grad_output.dtype],
                        output_shapes=[grad_output.shape],
                        attr_code=attr_code)[0]
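A short usage sketch for TriuACL (assumes the ACL backend is active):

import jittor as jt

x = jt.rand(4, 4)
y = TriuACL()(x, 1)   # keep entries strictly above the main diagonal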
@ -0,0 +1,58 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "triu_op_acl.h"

namespace jittor
{
    TriuOpRunner::TriuOpRunner() : BaseOpRunner("Triu")
    {
    }

    void TriuOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<TriuAttr *>(op_attr.get());
        // aclnnTriu takes the diagonal offset as an int64_t
        ret = aclnnTriuGetWorkspaceSize(inputTensors[0], attr->diagonal, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnTriu(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnTriu failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

}
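The diagonal offset forwarded to aclnnTriu follows the usual triu convention; a quick numpy illustration of what the runner computes for diagonal=1:

import numpy as np

x = np.arange(9).reshape(3, 3)
print(np.triu(x, k=1))
# [[0 1 2]
#  [0 0 5]
#  [0 0 0]]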
@ -0,0 +1,16 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class TriuOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        TriuOpRunner();
    };

}
@ -0,0 +1,59 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"

#include "unary_op_acl.h"

namespace jittor
{
    UnaryOpRunner::UnaryOpRunner() : BaseOpRunner("unary")
    {
        use_nchw = false;
        is_group_op = true;
    }

    void UnaryOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        // Cast additionally needs the target dtype; every other unary op is a
        // plain one-input one-output call dispatched through the function table.
        if (name == "Cast")
            ret = it->second.getWorkspaceSizeFuncCast(inputTensors[0], get_dtype(out_[0]->dtype()), outputTensors[0], &workspaceSize, &executor);
        else
            ret = it->second.getWorkspaceSizeFuncUnaryNonzero(inputTensors[0], outputTensors[0], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = it->second.executeFunc(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnn unary op failed. ERROR: %d\n", name.c_str(), ret); return);
        // syncRun();
        return;
    }
}
@ -0,0 +1,14 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    struct UnaryOpRunner : public BaseOpRunner
    {
        UnaryOpRunner();

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
    };
}