Compare commits


5 Commits

Author       SHA1        Message                                  Date
lidongyang   845c24c9f8  update to 1.3.10.0                       2025-07-28 18:50:28 +08:00
DongYang Li  d892a83d1c  Merge pull request #654 from Jittor/hyx  2025-07-28 18:36:24 +08:00
                         (merge hw backend)
lidongyang   4017b161d2  fix master                               2025-07-28 18:33:35 +08:00
uyzhang      c78db2a794  enable cuda and acl                      2025-07-19 11:05:30 +08:00
uyzhang      f8e44de79d  merge by JittorHW                        2025-07-19 08:59:51 +08:00
128 changed files with 11161 additions and 2449 deletions


@@ -9,7 +9,7 @@
 # file 'LICENSE.txt', which is part of this source code package.
 # ***************************************************************
-__version__ = '1.3.9.14'
+__version__ = '1.3.10.0'
 from jittor_utils import lock
 with lock.lock_scope():
     ori_int = int


@@ -611,6 +611,26 @@ def setup_nccl():
     nccl_ops = nccl.ops
     LOG.vv("Get nccl_ops: "+str(dir(nccl_ops)))

+def setup_hccl():
+    global hccl_ops
+    hccl_src_dir = os.path.join(jittor_path, "extern", "acl", "hccl")
+    hccl_src_files = []
+    for r, _, f in os.walk(hccl_src_dir):
+        for fname in f:
+            hccl_src_files.append(os.path.join(r, fname))
+    hccl_include_path = os.path.join(os.environ.get("ASCEND_TOOLKIT_HOME"), "aarch64-linux/include/hccl")
+    hccl_lib_name = os.path.join(os.environ.get("ASCEND_TOOLKIT_HOME"), "aarch64-linux/lib64/libhccl.so")
+    ctypes.CDLL(hccl_lib_name, dlopen_flags)
+    hccl = compile_custom_ops(hccl_src_files,
+        extra_flags=f" -I\"{hccl_include_path}\" {mpi_compile_flags} ",
+        return_module=True, dlopen_flags=os.RTLD_GLOBAL | os.RTLD_NOW,
+        gen_name_="jittor_hccl_core")
+    hccl_ops = hccl.ops
+    LOG.vv("Get hccl_ops: "+str(dir(hccl_ops)))
+
 def manual_link(flags):
     lib_dirs = []
     libs = []

@@ -708,8 +728,14 @@ cudnn = cublas = curand = cufft = cusparse = None
 setup_mpi()
 rank = mpi.world_rank() if in_mpi else 0
 world_size = mpi.world_size() if in_mpi else 1
+# if has_acl:
+#     setup_hccl()
+# elif has_cuda:
+#     setup_nccl()
+#     setup_cutt()
+#     setup_cutlass()
 setup_nccl()
 setup_cutt()
 setup_cutlass()


@@ -1188,7 +1188,22 @@ make_cache_dir(ck_path)
 # build cache_compile
 cc_flags += f" -I\"{os.path.join(jittor_path, 'src')}\" "
 cc_flags += f" -I\"{os.path.join(jittor_path, 'extern')}\" "
+ascend_toolkit_home = os.getenv('ASCEND_TOOLKIT_HOME')
+if ascend_toolkit_home:
+    cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include')}\" "
+    cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include/acl')}\" "
+    cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include/aclnn')}\" "
+    cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include/aclnnop')}\" "
+    cc_flags += f" -L\"{os.path.join(ascend_toolkit_home, 'lib64')}\" "
+    cc_flags += " -llibascendcl "
+    cc_flags += " -llibnnopbase "
+    cc_flags += " -llibopapi "
 cc_flags += py_include
 check_cache_compile()
 LOG.v(f"Get cache_compile: {jit_utils.cc}")

File diff suppressed because it is too large.


@@ -1,6 +1,6 @@
 // ***************************************************************
-// Copyright (c) 2023 Jittor. All Rights Reserved.
-// Maintainers: Dun Liang <randonlang@gmail.com>.
+// Copyright (c) 2023 Jittor. All Rights Reserved.
+// Maintainers: Dun Liang <randonlang@gmail.com>.
 // This file is subject to the terms and conditions defined in
 // file 'LICENSE.txt', which is part of this source code package.
 // ***************************************************************
@@ -11,23 +11,27 @@ using std::unordered_map;
 typedef int aclError;
-static inline unordered_map<aclError,string> gen_map(string s) {
-    unordered_map<aclError,string> smap;
-    for (int i=0; i<s.size(); i++) {
-        if (s[i] == ';') {
-            int j=s.rfind(" ", i);
-            int code = std::stoi(s.substr(j+1, i-j-1));
-            int k = s.rfind(" ", j-1);
-            int l = s.rfind(" ACL_", k-1);
-            smap[code] = s.substr(l+1, k-l-1);
+static inline unordered_map<aclError, string> gen_map(string s)
+{
+    unordered_map<aclError, string> smap;
+    for (int i = 0; i < s.size(); i++)
+    {
+        if (s[i] == ';')
+        {
+            int j = s.rfind(" ", i);
+            int code = std::stoi(s.substr(j + 1, i - j - 1));
+            int k = s.rfind(" ", j - 1);
+            int l = s.rfind(" ACL_", k - 1);
+            smap[code] = s.substr(l + 1, k - l - 1);
         }
     }
     return smap;
 }
-string acl_error_to_string(aclError error) {
+string acl_error_to_string(aclError error)
+{
-    static unordered_map<aclError,string> acl_error_map = gen_map(R"(
+    static unordered_map<aclError, string> acl_error_map = gen_map(R"(
 // from acl_base.h
 static const int ACL_ERROR_INVALID_PARAM = 100000;
 static const int ACL_ERROR_UNINITIALIZE = 100001;
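
The parsing trick above is self-contained: gen_map scans a raw-string dump of `static const int ACL_ERROR_* = <code>;` declarations and, at each ';', walks backwards to recover the numeric code and the preceding " ACL_..." identifier. A minimal standalone sketch (gen_map's body mirrors the diff; the main driver is illustrative only, not part of the commit):

#include <iostream>
#include <string>
#include <unordered_map>
using std::string;
using std::unordered_map;

typedef int aclError;

static unordered_map<aclError, string> gen_map(string s)
{
    unordered_map<aclError, string> smap;
    for (size_t i = 0; i < s.size(); i++)
    {
        if (s[i] == ';')
        {
            size_t j = s.rfind(" ", i);          // space before the numeric code
            int code = std::stoi(s.substr(j + 1, i - j - 1));
            size_t k = s.rfind(" ", j - 1);      // space before the '='
            size_t l = s.rfind(" ACL_", k - 1);  // space before the identifier
            smap[code] = s.substr(l + 1, k - l - 1);
        }
    }
    return smap;
}

int main()
{
    auto m = gen_map(R"(
static const int ACL_ERROR_INVALID_PARAM = 100000;
static const int ACL_ERROR_UNINITIALIZE = 100001;
)");
    std::cout << m[100000] << "\n";  // prints: ACL_ERROR_INVALID_PARAM
}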


@@ -1,6 +1,6 @@
 // ***************************************************************
-// Copyright (c) 2023 Jittor. All Rights Reserved.
-// Maintainers: Dun Liang <randonlang@gmail.com>.
+// Copyright (c) 2023 Jittor. All Rights Reserved.
+// Maintainers: Dun Liang <randonlang@gmail.com>.
 // This file is subject to the terms and conditions defined in
 // file 'LICENSE.txt', which is part of this source code package.
 // ***************************************************************
@@ -10,267 +10,311 @@
#include "utils/str_utils.h"
#include <chrono>
#include <thread>
#include "aclnn/aclnn.h"
namespace jittor {
namespace jittor
{
uint64_t acl_jittor_tid;
int acl_jittor_thread_running=0;
aclrtContext acl_jittor_context;
aclrtStream aclstream;
uint64_t acl_jittor_tid;
int acl_jittor_thread_running = 0;
aclrtStream aclstream;
void *workspaceAddr = nullptr;
uint64_t nowWorkSpaceSize = 0;
#define CHECK_ACL(x) ASSERTop(x,==,0)
#define CHECK_ACL(x) ASSERTop(x, ==, 0)
static void* acl_jittor_process_callback(void*) {
acl_jittor_thread_running = 1;
int deviceId = 0;
CHECK_ACL(aclrtSetCurrentContext(acl_jittor_context));
while (acl_jittor_thread_running) {
// LOGir << "acl_jittor_process_callback";
auto ret = aclrtProcessReport(1000);
if (ret) {
if (acl_jittor_thread_running && ret != ACL_ERROR_RT_REPORT_TIMEOUT && ret != ACL_ERROR_RT_THREAD_SUBSCRIBE)
LOGir << "aclrtProcessReport:" << ret << acl_error_to_string(ret);
break;
void mallocWorkSpace(uint64_t size)
{
uint64_t alloc_size = size + 32;
alloc_size = ((alloc_size - 1) / 32 + 1) * 32;
if (alloc_size > nowWorkSpaceSize)
{
aclrtFree(workspaceAddr);
nowWorkSpaceSize = alloc_size;
auto ret = aclrtMalloc(&workspaceAddr, nowWorkSpaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("allocate workspace failed. ERROR: %d\n", ret); return);
}
}
acl_jittor_thread_running = 0;
return (void*)0;
}
static void *acl_jittor_process_callback(void *)
{
acl_jittor_thread_running = 1;
// void aaa(void*) {
// LOGir << "haha";
// }
struct acl_jittor_initer {
acl_jittor_initer() {
CHECK_ACL(aclInit(nullptr));
uint device_count = 0;
// Get the number of available devices
CHECK_ACL(aclrtGetDeviceCount(&device_count));
LOGi << "Found ACL device number:" << device_count;
CHECK_ACL(aclrtSetDevice(0));
CHECK_ACL(aclrtCreateContext(&acl_jittor_context, 0));
CHECK_ACL(aclrtSetCurrentContext(acl_jittor_context));
pthread_create(&acl_jittor_tid, nullptr, acl_jittor_process_callback, 0);
// subscribe for default stream
CHECK_ACL(aclrtSubscribeReport(acl_jittor_tid,0));
// simple callback test
CHECK_ACL(aclrtCreateStream(&aclstream));
// CHECK_ACL(aclrtSubscribeReport(acl_jittor_tid,aclstream));
// CHECK_ACL(aclrtLaunchCallback((aclrtCallback)&aaa, 0, ACL_CALLBACK_NO_BLOCK, aclstream));
// CHECK_ACL(aclrtLaunchCallback((aclrtCallback)&aaa, 0, ACL_CALLBACK_NO_BLOCK, 0));
}
~acl_jittor_initer() {
acl_jittor_thread_running = 0;
CHECK_ACL(aclrtUnSubscribeReport(acl_jittor_tid,0));
CHECK_ACL(aclrtDestroyContext(acl_jittor_context));
CHECK_ACL(aclFinalize());
}
} _acl_jittor_initer;
string process_acl(const string& src, const string& name, const map<string,string>& kargs) {
if (endswith(name, "_jittor.cc"))
return src;
// static vector<string> dont_compile = {"fp16_emu.cc"};
// for (auto& s : dont_compile)
// if (endswith(name, s))
// return " ";
static unordered_set<string> cuda_headers = {
"cuda_runtime", "cudnn", "driver_types",
"cuda_fp16", "cuda_runtime_api", "fp16_emu",
"cudnn_rnn_descriptor", "cublas_v2", "cublas_wrapper",
"curand", "curand_wrapper", "cufft", "cufftXt",
"CudaUtils", "cutt", "cudnn_wrapper", "cuda_bf16"
};
static unordered_set<string> fake_class = {
"cudnnHandle_t", "cudnnConvolutionBwdFilterAlgo_t",
"cudnnConvolutionBwdDataAlgo_t", "cudnnConvolutionFwdAlgo_t",
"cufftHandle"
};
try {
auto tokens = token_split(src);
int edit = 0;
for (int i=0; i<tokens.size(); i++) {
auto& token = tokens[i];
if (cuda_headers.count(token)) token = "acl_jittor", edit ++; else
if (fake_class.count(token)) token = "int", edit ++; else
if (token == "CUDA") token = "ACL", edit ++; else
if (startswith(token, "cuda")) {
if (token.size()>=5 && token[4] >= 'A' && token[4] <= 'Z') {
if (token == "cudaGetDeviceCount") {
token_replace(tokens, i, "($1);", "((uint*)$1);");
} else if (token == "cudaLaunchHostFunc") {
// ACL_CALLBACK_BLOCK for 310
token_replace(tokens, i, "LaunchHostFunc($1,$2,$3)",
"LaunchCallback($2,$3,ACL_CALLBACK_NO_BLOCK,$1)");
} else if (token == "cudaMemcpy")
token_replace(tokens, i, "cudaMemcpy($1,$2,$3,",
"aclrtMemcpy($1,$3,$2,$3,");
else if (token == "cudaMemcpyAsync")
token_replace(tokens, i, "cudaMemcpyAsync($1,$2,$3,",
"aclrtMemcpyAsync($1,$3,$2,$3,");
else if (token == "cudaMemcpyDeviceToHost") token = "ACL_MEMCPY_DEVICE_TO_HOST";
else if (token == "cudaMemcpyDefault") token = "ACL_MEMCPY_HOST_TO_DEVICE";
else if (token == "cudaMemcpyHostToDevice") token = "ACL_MEMCPY_HOST_TO_DEVICE";
else if (token == "cudaMemcpyDeviceToDevice") token = "ACL_MEMCPY_DEVICE_TO_DEVICE";
else if (token == "cudaMallocManaged" || token == "cudaMalloc") {
// unified address not supported
token = "aclrtMalloc";
token_replace(tokens, i, "($1,$2)",
"($1,$2,ACL_MEM_MALLOC_HUGE_FIRST)");
} else if (token == "cudaMemGetInfo")
token_replace(tokens, i, "cudaMemGetInfo($1,$2)",
"aclrtGetMemInfo(ACL_DDR_MEM,$1,$2)");
else if (token == "cudaGetLastError")
token_replace(tokens, i, "cudaGetLastError()", "0");
else if (token == "cudaStreamCreateWithFlags")
token_replace(tokens, i-1,
"(cudaStreamCreateWithFlags($1,$2));",
"(aclrtCreateStream($1)); checkAclErrors(aclrtSubscribeReport(acl_jittor_tid,*$1));");
else if (token == "cudaEventCreate")
token_replace(tokens, i,
"cudaEventCreate($1,$2)",
"aclrtCreateEvent($1)");
else if (token == "cudaDeviceSynchronize")
token = "aclrtSynchronizeDevice";
else if (token == "cudaStreamDestroy")
token_replace(tokens, i, "cudaStreamDestroy($1)",
"(aclrtUnSubscribeReport(acl_jittor_tid,$1), aclrtDestroyStream($1))");
else if (token == "cudaEventDestroy")
token = "aclrtDestroyEvent";
else if (token == "cudaEventRecord")
token = "aclrtRecordEvent";
else if (token == "cudaStreamWaitEvent")
token_replace(tokens, i,
"cudaStreamWaitEvent($1,$2,$3)",
"aclrtStreamWaitEvent($1,$2)");
if (token.size() && token[0] == 'c')
token = "aclrt" + token.substr(4);
if (endswith(token, "_t"))
token = token.substr(0, token.size()-2);
edit ++;
}
} else
if (token == "_cudaGetErrorEnum") {
token_replace(tokens, i, "_cudaGetErrorEnum($1)", "(acl_error_to_string($1))");
edit ++;
} else
if (token == "checkCudaErrors")
token = "checkAclErrors";
else if (token == "JPU") {
edit ++;
string new_code;
if (tokens[i+2] == "op_compiler")
token_replace(tokens, i,
"JPU(op_compiler($1,$2,$3))",
"acl_jittor_op_compiler($1,$2,$3)");
else if (tokens[i+2] == "header")
new_code = "#include \"acl_jittor.h\"";
if (new_code.size())
token_replace(tokens, i, "JPU($1)", new_code);
} else if (token == "use_cuda_managed_allocator" && tokens[i+1][0]==',') {
tokens[i+2] = "0"; // disable unified address
}
}
if (!edit) return src;
string new_src = join(tokens, "");
// if (name == "executor.cc") {
// new_src = string("#include <Python.h>\n#include <pystate.h>\n#include <common.h>\n")+
// "namespace jittor { void acl_op_exec(Op*); }\n" +
// replace(new_src, "op->do_run_after_prepare(jkl);",
// R"({
// acl_op_exec(op);
// })");
// }
if (name == "profiler.cc") {
new_src = token_replace_all(new_src, ".cc", ".tikcc");
}
// LOGir << name << (name == "pass_manager.cc");
if (name == "pass_manager.cc") {
LOGir << "replace" << name;
new_src = token_replace_all(new_src, "run_pass<FloatAtomicFixPass>();", "WTF");
}
// ????????
return new_src;
} catch (const std::exception& e) {
LOGe << "process acl error:" << e.what();
LOGe << "name:" << name;
throw;
}
}
void acl_jittor_op_compiler(string& filename, string& src, bool is_acl, string& extra_flags) {
if (!is_acl) return;
// extra_flags += " --tik-soc-version=Ascend910 ";
// filename = replace(filename, ".cc", ".tikcc");
// LOGir << filename;
string new_src = process_acl(src, "", {});
new_src = replace(new_src, R"(#include "misc/cuda_atomic.h")", "");
new_src = replace(new_src, R"(#include "misc/cuda_limits.h")", "");
new_src = replace(new_src, "__global__", "__ai_device_entry__");
new_src = token_replace_all(new_src, "__launch_bounds__($1)", "");
new_src = token_replace_all(new_src, "int thread_num = $1;", "int thread_num = 1;");
new_src = token_replace_all(new_src, "tn0=std::max(tn0, $1);", "");
new_src = token_replace_all(new_src, "<<<$1>>>", "<<<1,0>>>");
new_src = token_replace_all(new_src, "int thread_id = $1;", "int thread_id = 1;");
// for inc error
new_src = token_replace_all(new_src, "for ($1+=$2)", "for ($1++)");
// bit op error
new_src = token_replace_all(new_src, "int tnum$1;", "");
new_src = token_replace_all(new_src, "int p1$1;", "");
new_src = token_replace_all(new_src, "int p2$1;", "");
new_src = token_replace_all(new_src, "int tn$1=$2;", "int tn$1=0;");
new_src = token_replace_all(new_src, "int tid$1=$2;", "int tid$1=0;");
src = new_src;
new_src = token_replace_all(new_src, "atomicAdd(&$1,$2);", "$1=$1+$2;");
// new_src = token_replace_all(new_src, "bool", "int8");
new_src = token_replace_all(new_src, "::numeric_min<float32>()", "-1e30");
new_src = token_replace_all(new_src, "::numeric_max<float32>()", "1e30");
// TODO: support max
unordered_map<string,string> opmap = {
// {"::max","tikcc::scalar_max"},
{"::sqrtf", "tikcc::scalar_sqrt"}
};
auto ss = split(new_src, ";");
for (auto &s : ss) {
if (s.find("?") != string::npos) {
s = token_replace_all(s+";", "auto $1=$2?$3:$4;", "auto $1=$3;if (!($2)) $1=$4;");
}
if (s.find("::max") != string::npos) {
if (s.find("auto") == string::npos) {
s = token_replace_all(s+";", " $1=$4::max($2,$3);", " $1=$2;if ($2 < $3) $1=$3;");
} else {
s = token_replace_all(s+";", "auto $1=$4::max($2,$3);", "auto $1=$2;if ($2 < $3) $1=$3;");
while (acl_jittor_thread_running)
{
// LOGir << "acl_jittor_process_callback";
auto ret = aclrtProcessReport(1000);
if (ret)
{
if (acl_jittor_thread_running && ret != ACL_ERROR_RT_REPORT_TIMEOUT && ret != ACL_ERROR_RT_THREAD_SUBSCRIBE)
LOGir << "aclrtProcessReport:" << ret << acl_error_to_string(ret);
break;
}
}
for (auto& kv : opmap) {
if (s.find(kv.first) != string::npos) {
if (s.find("auto") == string::npos) {
// $1 = op($2) --> op($1, $2)
s = token_replace_all(s+";", " $1= "+kv.first+"($2);", kv.second+"($1, $2);");
} else {
// auto $1 = op($2) --> float32 $1; op($1, $2);
s = token_replace_all(s+";", "auto $1= "+kv.first+"($2);", "float32 $1; " + kv.second+"($1, $2);");
acl_jittor_thread_running = 0;
return (void *)0;
}
struct acl_jittor_initer
{
int32_t deviceId;
acl_jittor_initer()
{
CHECK_ACL(aclInit(nullptr));
uint device_count = 0;
deviceId = 0;
// Get the number of available devices
CHECK_ACL(aclrtGetDeviceCount(&device_count));
LOGi << "Found ACL device number:" << device_count;
CHECK_ACL(aclrtSetDevice(deviceId));
CHECK_ACL(aclrtCreateStream(&aclstream));
// pthread_create(&acl_jittor_tid, nullptr, acl_jittor_process_callback, 0);
}
~acl_jittor_initer()
{
acl_jittor_thread_running = 0;
// CHECK_ACL(aclrtUnSubscribeReport(acl_jittor_tid, 0));
aclrtDestroyStream(aclstream);
aclrtResetDevice(deviceId);
CHECK_ACL(aclFinalize());
if (nowWorkSpaceSize > 0)
{
aclrtFree(workspaceAddr);
}
}
} _acl_jittor_initer;
string process_acl(const string &src, const string &name, const map<string, string> &kargs)
{
if (endswith(name, "_jittor.cc"))
return src;
// static vector<string> dont_compile = {"fp16_emu.cc"};
// for (auto& s : dont_compile)
// if (endswith(name, s))
// return " ";
static unordered_set<string> cuda_headers = {
"cuda_runtime", "cudnn", "driver_types",
"cuda_fp16", "cuda_runtime_api", "fp16_emu",
"cudnn_rnn_descriptor", "cublas_v2", "cublas_wrapper",
"curand", "curand_wrapper", "cufft", "cufftXt",
"CudaUtils", "cutt", "cudnn_wrapper", "cuda_bf16"};
static unordered_set<string> fake_class = {
"cudnnHandle_t", "cudnnConvolutionBwdFilterAlgo_t",
"cudnnConvolutionBwdDataAlgo_t", "cudnnConvolutionFwdAlgo_t",
"cufftHandle"};
try
{
auto tokens = token_split(src);
int edit = 0;
for (int i = 0; i < tokens.size(); i++)
{
auto &token = tokens[i];
if (cuda_headers.count(token))
token = "acl_jittor", edit++;
else if (fake_class.count(token))
token = "int", edit++;
else if (token == "CUDA")
token = "ACL", edit++;
else if (startswith(token, "cuda"))
{
if (token.size() >= 5 && token[4] >= 'A' && token[4] <= 'Z')
{
if (token == "cudaGetDeviceCount")
{
token_replace(tokens, i, "($1);", "((uint*)$1);");
}
else if (token == "cudaLaunchHostFunc")
{
// ACL_CALLBACK_BLOCK for 310
token_replace(tokens, i, "LaunchHostFunc($1,$2,$3)",
"LaunchCallback($2,$3,ACL_CALLBACK_NO_BLOCK,$1)");
}
else if (token == "cudaMemcpy")
token_replace(tokens, i, "cudaMemcpy($1,$2,$3,",
"aclrtMemcpy($1,$3,$2,$3,");
else if (token == "cudaMemcpyAsync")
token_replace(tokens, i, "cudaMemcpyAsync($1,$2,$3,",
"aclrtMemcpyAsync($1,$3,$2,$3,");
else if (token == "cudaMemcpyDeviceToHost")
token = "ACL_MEMCPY_DEVICE_TO_HOST";
else if (token == "cudaMemcpyDefault")
token = "ACL_MEMCPY_HOST_TO_DEVICE";
else if (token == "cudaMemcpyHostToDevice")
token = "ACL_MEMCPY_HOST_TO_DEVICE";
else if (token == "cudaMemcpyDeviceToDevice")
token = "ACL_MEMCPY_DEVICE_TO_DEVICE";
else if (token == "cudaMallocManaged" || token == "cudaMalloc")
{
// unified address not supported
token = "aclrtMalloc";
token_replace(tokens, i, "($1,$2)",
"($1,$2,ACL_MEM_MALLOC_HUGE_FIRST)");
}
else if (token == "cudaMemGetInfo")
token_replace(tokens, i, "cudaMemGetInfo($1,$2)",
"aclrtGetMemInfo(ACL_DDR_MEM,$1,$2)");
else if (token == "cudaGetLastError")
token_replace(tokens, i, "cudaGetLastError()", "0");
else if (token == "cudaStreamCreateWithFlags")
token_replace(tokens, i - 1,
"(cudaStreamCreateWithFlags($1,$2));",
"(aclrtCreateStream($1)); checkAclErrors(aclrtSubscribeReport(acl_jittor_tid,*$1));");
else if (token == "cudaEventCreate")
token_replace(tokens, i,
"cudaEventCreate($1,$2)",
"aclrtCreateEvent($1)");
else if (token == "cudaDeviceSynchronize")
token = "aclrtSynchronizeDevice";
else if (token == "cudaStreamDestroy")
token_replace(tokens, i, "cudaStreamDestroy($1)",
"(aclrtUnSubscribeReport(acl_jittor_tid,$1), aclrtDestroyStream($1))");
else if (token == "cudaEventDestroy")
token = "aclrtDestroyEvent";
else if (token == "cudaEventRecord")
token = "aclrtRecordEvent";
else if (token == "cudaStreamWaitEvent")
token_replace(tokens, i,
"cudaStreamWaitEvent($1,$2,$3)",
"aclrtStreamWaitEvent($1,$2)");
if (token.size() && token[0] == 'c')
token = "aclrt" + token.substr(4);
if (endswith(token, "_t"))
token = token.substr(0, token.size() - 2);
edit++;
}
}
else if (token == "_cudaGetErrorEnum")
{
token_replace(tokens, i, "_cudaGetErrorEnum($1)", "(acl_error_to_string($1))");
edit++;
}
else if (token == "checkCudaErrors")
token = "checkAclErrors";
else if (token == "JPU")
{
edit++;
string new_code;
if (tokens[i + 2] == "op_compiler")
token_replace(tokens, i,
"JPU(op_compiler($1,$2,$3))",
"acl_jittor_op_compiler($1,$2,$3)");
else if (tokens[i + 2] == "header")
new_code = "#include \"acl_jittor.h\"";
if (new_code.size())
token_replace(tokens, i, "JPU($1)", new_code);
}
else if (token == "use_cuda_managed_allocator" && tokens[i + 1][0] == ',')
{
tokens[i + 2] = "0"; // disable unified address
}
}
if (!edit)
return src;
string new_src = join(tokens, "");
// if (name == "executor.cc") {
// new_src = string("#include <Python.h>\n#include <pystate.h>\n#include <common.h>\n")+
// "namespace jittor { void acl_op_exec(Op*); }\n" +
// replace(new_src, "op->do_run_after_prepare(jkl);",
// R"({
// acl_op_exec(op);
// })");
// }
if (name == "profiler.cc")
{
new_src = token_replace_all(new_src, ".cc", ".tikcc");
}
// LOGir << name << (name == "pass_manager.cc");
if (name == "pass_manager.cc")
{
LOGir << "replace" << name;
new_src = token_replace_all(new_src, "run_pass<FloatAtomicFixPass>();", "WTF");
}
// ????????
return new_src;
}
catch (const std::exception &e)
{
LOGe << "process acl error:" << e.what();
LOGe << "name:" << name;
throw;
}
// s = token_replace_all(s+";", "auto $1=$2?$3:$4;", "auto $1=$3;if (!($2)) $1=$4;");
// s = token_replace_all(s+";", "auto $1=$2?$3:$4;", "auto $1=$3;if (!($2)) $1=$4;");
// if (s.find("::max") != string::npos) {
// s = token_replace_all(s+";", " $1= ::max($2);", "tikcc::scalar_max($1, $2);");
// }
}
new_src = join(ss, ";");
src = new_src;
}
void acl_jittor_op_compiler(string &filename, string &src, bool is_acl, string &extra_flags)
{
if (!is_acl)
return;
string new_src = process_acl(src, "", {});
new_src = replace(new_src, R"(#include "misc/cuda_atomic.h")", "");
new_src = replace(new_src, R"(#include "misc/cuda_limits.h")", "");
new_src = replace(new_src, "__global__", "__ai_device_entry__");
new_src = token_replace_all(new_src, "__launch_bounds__($1)", "");
new_src = token_replace_all(new_src, "int thread_num = $1;", "int thread_num = 1;");
new_src = token_replace_all(new_src, "tn0=std::max(tn0, $1);", "");
new_src = token_replace_all(new_src, "<<<$1>>>", "<<<1,0>>>");
new_src = token_replace_all(new_src, "int thread_id = $1;", "int thread_id = 1;");
// for inc error
new_src = token_replace_all(new_src, "for ($1+=$2)", "for ($1++)");
// bit op error
new_src = token_replace_all(new_src, "int tnum$1;", "");
new_src = token_replace_all(new_src, "int p1$1;", "");
new_src = token_replace_all(new_src, "int p2$1;", "");
new_src = token_replace_all(new_src, "int tn$1=$2;", "int tn$1=0;");
new_src = token_replace_all(new_src, "int tid$1=$2;", "int tid$1=0;");
src = new_src;
new_src = token_replace_all(new_src, "atomicAdd(&$1,$2);", "$1=$1+$2;");
// new_src = token_replace_all(new_src, "bool", "int8");
new_src = token_replace_all(new_src, "::numeric_min<float32>()", "-1e30");
new_src = token_replace_all(new_src, "::numeric_max<float32>()", "1e30");
// TODO: support max
unordered_map<string, string> opmap = {
// {"::max","tikcc::scalar_max"},
{"::sqrtf", "tikcc::scalar_sqrt"}};
auto ss = split(new_src, ";");
for (auto &s : ss)
{
if (s.find("?") != string::npos)
{
s = token_replace_all(s + ";", "auto $1=$2?$3:$4;", "auto $1=$3;if (!($2)) $1=$4;");
}
if (s.find("::max") != string::npos)
{
if (s.find("auto") == string::npos)
{
s = token_replace_all(s + ";", " $1=$4::max($2,$3);", " $1=$2;if ($2 < $3) $1=$3;");
}
else
{
s = token_replace_all(s + ";", "auto $1=$4::max($2,$3);", "auto $1=$2;if ($2 < $3) $1=$3;");
}
}
for (auto &kv : opmap)
{
if (s.find(kv.first) != string::npos)
{
if (s.find("auto") == string::npos)
{
// $1 = op($2) --> op($1, $2)
s = token_replace_all(s + ";", " $1= " + kv.first + "($2);", kv.second + "($1, $2);");
}
else
{
// auto $1 = op($2) --> float32 $1; op($1, $2);
s = token_replace_all(s + ";", "auto $1= " + kv.first + "($2);", "float32 $1; " + kv.second + "($1, $2);");
}
}
}
// s = token_replace_all(s+";", "auto $1=$2?$3:$4;", "auto $1=$3;if (!($2)) $1=$4;");
// s = token_replace_all(s+";", "auto $1=$2?$3:$4;", "auto $1=$3;if (!($2)) $1=$4;");
// if (s.find("::max") != string::npos) {
// s = token_replace_all(s+";", " $1= ::max($2);", "tikcc::scalar_max($1, $2);");
// }
}
new_src = join(ss, ";");
src = new_src;
}
}
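
Most of acl_jittor.cc above implements a source-to-source pass: process_acl re-tokenizes Jittor's generated CUDA code and maps each CUDA runtime name onto its ACL equivalent, reordering arguments where the signatures differ. A hedged before/after sketch of just the cudaMemcpy rule (copy_back is a hypothetical wrapper, not from the diff; each half is #ifdef-guarded since it needs the corresponding SDK to compile):

#include <cstddef>

#ifdef ORIGINAL_CUDA_SOURCE
// What the generated code looks like before rewriting:
void copy_back(void *dst, const void *src, size_t n)
{
    checkCudaErrors(cudaMemcpy(dst, src, n, cudaMemcpyDeviceToHost));
}
#endif

#ifdef REWRITTEN_ACL_SOURCE
// After the rules "cudaMemcpy($1,$2,$3," -> "aclrtMemcpy($1,$3,$2,$3,",
// "cudaMemcpyDeviceToHost" -> "ACL_MEMCPY_DEVICE_TO_HOST", and
// "checkCudaErrors" -> "checkAclErrors" are applied. aclrtMemcpy takes an
// explicit destination-buffer size, hence the byte count appears twice:
void copy_back(void *dst, const void *src, size_t n)
{
    checkAclErrors(aclrtMemcpy(dst, n, src, n, ACL_MEMCPY_DEVICE_TO_HOST));
}
#endif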


@@ -1,20 +1,700 @@
// ***************************************************************
// Copyright (c) 2023 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// Copyright (c) 2023 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#pragma once
#include "common.h"
#include "aclnn/aclnn.h"
#include <acl/acl.h>
std::string acl_error_to_string(aclError error);
namespace jittor {
namespace jittor
{
EXTERN_LIB uint64_t acl_jittor_tid;
EXTERN_LIB aclrtStream aclstream;
EXTERN_LIB uint64_t acl_jittor_tid;
EXTERN_LIB aclrtStream aclstream;
EXTERN_LIB void *workspaceAddr;
void acl_jittor_op_compiler(string& filename, string& src, bool is_acl, string& extra_flags);
void mallocWorkSpace(uint64_t size);
}
void acl_jittor_op_compiler(string &filename, string &src, bool is_acl, string &extra_flags);
struct AclOpFunctions
{
// for Unary and Nonzero
std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncUnaryNonzero;
// for Cast
std::function<aclnnStatus(aclTensor *, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncCast;
// for Binary
std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncBinary;
// for Add and Sub
std::function<aclnnStatus(aclTensor *, aclTensor *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncAdd;
// for Expand, permute, flip
std::function<aclnnStatus(aclTensor *, aclIntArray *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncExpand;
// for bmm and matmul
std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, int8_t, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncMatmul;
// for conv
std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclIntArray *, int64_t, aclTensor *, int8_t, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncConv;
// for reducesum, mean
std::function<aclnnStatus(aclTensor *, aclIntArray *, bool, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncReduceSum;
// for amax and amin
std::function<aclnnStatus(aclTensor *, aclIntArray *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncAmax;
// for conv backward
std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclIntArray *, int, aclBoolArray *, int8_t, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncConvBackward;
// for proddim
std::function<aclnnStatus(aclTensor *, float, float, float, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncProdDim;
// for select, where
std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSelect;
// for random_uniform and random_normal
std::function<aclnnStatus(aclTensor *, int64_t, int64_t, int64_t, int64_t, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncRandom;
// for maxpool
std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncMaxPool;
// for maxpool backward
std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncMaxPoolBackward;
// for avgpool
std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, bool, int64_t, int8_t, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncAvgPool;
// for avgpool backward
std::function<aclnnStatus(aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, bool, int64_t, int8_t, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncAvgPoolBackward;
// for concat
std::function<aclnnStatus(aclTensorList *, uint64_t, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncConcat;
// for gather
std::function<aclnnStatus(aclTensor *, uint64_t, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncGather;
// for cumsum
std::function<aclnnStatus(aclTensor *, uint64_t, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncCumsum;
// for scatter
std::function<aclnnStatus(aclTensor *, uint64_t, aclTensor *, aclTensor *, uint64_t, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncScatter;
// for index
std::function<aclnnStatus(aclTensor *, aclTensorList *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncIndex;
// for stridesliceassignv2
std::function<aclnnStatus(aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncStridedSliceAssignV2;
// for slicev2
std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSliceV2;
// for indexputimpl
std::function<aclnnStatus(aclTensor *, aclTensorList *, aclTensor *, bool, bool, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncIndexPutImpl;
// for range
std::function<aclnnStatus(aclScalar *, aclScalar *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncRange;
// for leaky_relu
std::function<aclnnStatus(aclTensor *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncLeakyRelu;
// for leaky_relu backward
std::function<aclnnStatus(aclTensor *, aclTensor *, aclScalar *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncLeakyReluBackward;
// for dropout
std::function<aclnnStatus(aclTensor *, double, bool, int64_t, int64_t, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncDropout;
// for dropout backward
std::function<aclnnStatus(aclTensor *, aclTensor *, double, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncDropoutBackward;
// for split with size
std::function<aclnnStatus(aclTensor *, aclIntArray *, int64_t, aclTensorList *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSplitWithSize;
// for silu
// std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSilu;
// for silu backward
// std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSiluBackward;
// for sigmoid
// std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSigmoid;
// for sigmoid backward
// std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSigmoidBackward;
// for embedding
// std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncEmbedding;
// for embedding backward
std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t, uint64_t, bool, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncEmbeddingBackward;
// for InplaceMaskedScatter MaskedSelect
// std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncInplaceMaskedScatter;
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, aclrtStream)> executeFunc;
// for flashattention
std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *,
aclIntArray *, aclIntArray *, aclIntArray *, double, double, int64_t, int64_t, int64_t, char *, int64_t, int64_t, int64_t,
aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
getWorkspaceSizeFuncFalshAttention;
// for flashattention backward
std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *,
aclIntArray *, aclIntArray *, aclIntArray *, double, double, int64_t, int64_t, int64_t, char *, int64_t, int64_t, int64_t,
aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
getWorkspaceSizeFuncFalshAttentionBackward;
// for batchnorm
std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, bool, double, double, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncBatchNorm;
// for batchnorm backward
std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, bool, double, aclBoolArray *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncBatchNormBackward;
// for layernorm
std::function<aclnnStatus(aclTensor *, aclIntArray *, aclTensor *, aclTensor *, double, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncLayerNorm;
// for ROPE
std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, int64_t, uint64_t *, aclOpExecutor **)>
getWorkspaceSizeFuncRotaryPosEmb;
// Provide a default constructor
AclOpFunctions() = default;
// for Unary and Nonzero
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, aclrtStream)> execf)
: getWorkspaceSizeFuncUnaryNonzero(gwsf), executeFunc(execf) {}
// for Cast
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, aclrtStream)> execf)
: getWorkspaceSizeFuncCast(gwsf), executeFunc(execf) {}
// for Binary
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncBinary(gwsf), executeFunc(execf) {}
// for Add and Sub
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncAdd(gwsf), executeFunc(execf) {}
// for Expand, flip
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncExpand(gwsf), executeFunc(execf) {}
// for Matmul
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, int8_t, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncMatmul(gwsf), executeFunc(execf) {}
// for conv
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclIntArray *, int64_t, aclTensor *, int8_t, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncConv(gwsf), executeFunc(execf) {}
// for reducesum, mean
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, bool, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncReduceSum(gwsf), executeFunc(execf) {}
// for amax amin
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncAmax(gwsf), executeFunc(execf) {}
// for conv backward
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclIntArray *, int, aclBoolArray *, int8_t, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncConvBackward(gwsf), executeFunc(execf) {}
// for proddim
AclOpFunctions(std::function<aclnnStatus(const aclTensor *, float, float, float, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncProdDim(gwsf), executeFunc(execf) {}
// for select, where
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncSelect(gwsf), executeFunc(execf) {}
// for random_normal
AclOpFunctions(std::function<aclnnStatus(aclTensor *, int64_t, int64_t, int64_t, int64_t, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncRandom(gwsf), executeFunc(execf) {}
// for maxpool
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncMaxPool(gwsf), executeFunc(execf) {}
// for maxpool backward
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncMaxPoolBackward(gwsf), executeFunc(execf) {}
// for avgpool
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, bool, int64_t, int8_t, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncAvgPool(gwsf), executeFunc(execf) {}
// for avgpool backward
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, bool, int64_t, int8_t, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncAvgPoolBackward(gwsf), executeFunc(execf) {}
// for concat
AclOpFunctions(std::function<aclnnStatus(aclTensorList *, int64_t, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncConcat(gwsf), executeFunc(execf) {}
// for gather
AclOpFunctions(std::function<aclnnStatus(aclTensor *, int64_t, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncGather(gwsf), executeFunc(execf) {}
// for cumsum
AclOpFunctions(std::function<aclnnStatus(aclTensor *, int64_t, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncCumsum(gwsf), executeFunc(execf) {}
// for scatter
AclOpFunctions(std::function<aclnnStatus(aclTensor *, uint64_t, aclTensor *, aclTensor *, uint64_t, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncScatter(gwsf), executeFunc(execf) {}
// for index
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensorList *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncIndex(gwsf), executeFunc(execf) {}
// for stridesliceassignv2
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncStridedSliceAssignV2(gwsf), executeFunc(execf) {}
// for slicev2
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncSliceV2(gwsf), executeFunc(execf) {}
// for indexputimpl
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensorList *, aclTensor *, bool, bool, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncIndexPutImpl(gwsf), executeFunc(execf) {}
// for range
AclOpFunctions(std::function<aclnnStatus(aclScalar *, aclScalar *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncRange(gwsf), executeFunc(execf) {}
// for leaky_relu
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncLeakyRelu(gwsf), executeFunc(execf) {}
// for leaky_relu backward
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclScalar *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncLeakyReluBackward(gwsf), executeFunc(execf) {}
// for dropout
AclOpFunctions(std::function<aclnnStatus(aclTensor *, double, bool, int64_t, int64_t, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncDropout(gwsf), executeFunc(execf) {}
// for dropout backward
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, double, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncDropoutBackward(gwsf), executeFunc(execf) {}
// for embedding backward
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t, uint64_t, bool, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncEmbeddingBackward(gwsf), executeFunc(execf) {}
// for split with size
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, int64_t, aclTensorList *, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncSplitWithSize(gwsf), executeFunc(execf) {}
// for flash attention
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *,
aclIntArray *, aclIntArray *, aclIntArray *, double, double, int64_t, int64_t, int64_t, char *, int64_t, int64_t, int64_t,
aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncFalshAttention(gwsf), executeFunc(execf) {}
// for flash attention backward
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *,
aclIntArray *, aclIntArray *, aclIntArray *, double, double, int64_t, int64_t, int64_t, char *, int64_t, int64_t, int64_t,
aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncFalshAttentionBackward(gwsf), executeFunc(execf) {}
// for batchnorm
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, bool, double, double, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncBatchNorm(gwsf), executeFunc(execf) {}
// for batchnorm backward
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, bool, double, aclBoolArray *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncBatchNormBackward(gwsf), executeFunc(execf) {}
// for layernorm
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, aclTensor *, aclTensor *, double, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncLayerNorm(gwsf), executeFunc(execf) {}
// for ROPE
AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, const aclTensor *, const aclTensor *, int64_t, uint64_t *, aclOpExecutor **)> gwsf,
std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
: getWorkspaceSizeFuncRotaryPosEmb(gwsf), executeFunc(execf) {}
};
static std::unordered_map<std::string, AclOpFunctions> aclOpFuncMap = {
{"Abs", AclOpFunctions(aclnnAbsGetWorkspaceSize, aclnnAbs)},
{"Exp", AclOpFunctions(aclnnExpGetWorkspaceSize, aclnnExp)},
{"Log", AclOpFunctions(aclnnLogGetWorkspaceSize, aclnnLog)},
{"Sqrt", AclOpFunctions(aclnnSqrtGetWorkspaceSize, aclnnSqrt)},
{"Ceil", AclOpFunctions(aclnnCeilGetWorkspaceSize, aclnnCeil)},
{"Floor", AclOpFunctions(aclnnFloorGetWorkspaceSize, aclnnFloor)},
{"Round", AclOpFunctions(aclnnRoundGetWorkspaceSize, aclnnRound)},
{"Sin", AclOpFunctions(aclnnSinGetWorkspaceSize, aclnnSin)},
{"Cos", AclOpFunctions(aclnnCosGetWorkspaceSize, aclnnCos)},
{"Tan", AclOpFunctions(aclnnTanGetWorkspaceSize, aclnnTan)},
{"Asin", AclOpFunctions(aclnnAsinGetWorkspaceSize, aclnnAsin)},
{"Acos", AclOpFunctions(aclnnAcosGetWorkspaceSize, aclnnAcos)},
{"Atan", AclOpFunctions(aclnnAtanGetWorkspaceSize, aclnnAtan)},
{"Sinh", AclOpFunctions(aclnnSinhGetWorkspaceSize, aclnnSinh)},
{"Cosh", AclOpFunctions(aclnnCoshGetWorkspaceSize, aclnnCosh)},
{"Tanh", AclOpFunctions(aclnnTanhGetWorkspaceSize, aclnnTanh)},
{"Asinh", AclOpFunctions(aclnnAsinhGetWorkspaceSize, aclnnAsinh)},
{"Acosh", AclOpFunctions(aclnnAcoshGetWorkspaceSize, aclnnAcosh)},
{"Atanh", AclOpFunctions(aclnnAtanhGetWorkspaceSize, aclnnAtanh)},
{"Sigmoid", AclOpFunctions(aclnnSigmoidGetWorkspaceSize, aclnnSigmoid)},
{"Erf", AclOpFunctions(aclnnErfGetWorkspaceSize, aclnnErf)},
{"Erfinv", AclOpFunctions(aclnnErfinvGetWorkspaceSize, aclnnErfinv)},
{"LogicalNot", AclOpFunctions(aclnnLogicalNotGetWorkspaceSize, aclnnLogicalNot)},
{"BitwiseNot", AclOpFunctions(aclnnBitwiseNotGetWorkspaceSize, aclnnBitwiseNot)},
{"Neg", AclOpFunctions(aclnnNegGetWorkspaceSize, aclnnNeg)},
{"Cast", AclOpFunctions(aclnnCastGetWorkspaceSize, aclnnCast)},
{"Maximum", AclOpFunctions(aclnnMaximumGetWorkspaceSize, aclnnMaximum)},
{"Minimum", AclOpFunctions(aclnnMinimumGetWorkspaceSize, aclnnMinimum)},
{"Add", AclOpFunctions(aclnnAddGetWorkspaceSize, aclnnAdd)},
{"Sub", AclOpFunctions(aclnnSubGetWorkspaceSize, aclnnSub)},
{"Mul", AclOpFunctions(aclnnMulGetWorkspaceSize, aclnnMul)},
{"RealDiv", AclOpFunctions(aclnnDivGetWorkspaceSize, aclnnDiv)},
{"FloorDiv", AclOpFunctions(aclnnFloorDivideGetWorkspaceSize, aclnnFloorDivide)},
{"LessEqual", AclOpFunctions(aclnnLeTensorGetWorkspaceSize, aclnnLeTensor)},
{"Less", AclOpFunctions(aclnnLtTensorGetWorkspaceSize, aclnnLtTensor)},
{"GreaterEqual", AclOpFunctions(aclnnGeTensorGetWorkspaceSize, aclnnGeTensor)},
{"Greater", AclOpFunctions(aclnnGtTensorGetWorkspaceSize, aclnnGtTensor)},
{"Equal", AclOpFunctions(aclnnEqTensorGetWorkspaceSize, aclnnEqTensor)},
{"NotEqual", AclOpFunctions(aclnnNeTensorGetWorkspaceSize, aclnnNeTensor)},
{"LogicalAnd", AclOpFunctions(aclnnLogicalAndGetWorkspaceSize, aclnnLogicalAnd)},
{"LogicalOr", AclOpFunctions(aclnnLogicalOrGetWorkspaceSize, aclnnLogicalOr)},
{"LogicalXor", AclOpFunctions(aclnnLogicalXorGetWorkspaceSize, aclnnLogicalXor)},
{"BitwiseAnd", AclOpFunctions(aclnnBitwiseAndTensorGetWorkspaceSize, aclnnBitwiseAndTensor)},
{"BitwiseOr", AclOpFunctions(aclnnBitwiseOrTensorGetWorkspaceSize, aclnnBitwiseOrTensor)},
{"BitwiseXor", AclOpFunctions(aclnnBitwiseXorTensorGetWorkspaceSize, aclnnBitwiseXorTensor)},
{"Pow", AclOpFunctions(aclnnPowTensorTensorGetWorkspaceSize, aclnnPowTensorTensor)},
{"Expand", AclOpFunctions(aclnnExpandGetWorkspaceSize, aclnnExpand)},
{"MatMul", AclOpFunctions(aclnnMatmulGetWorkspaceSize, aclnnMatmul)},
{"BatchMatMul", AclOpFunctions(aclnnBatchMatMulGetWorkspaceSize, aclnnBatchMatMul)},
{"ReduceMax", AclOpFunctions(aclnnAmaxGetWorkspaceSize, aclnnAmax)},
{"ReduceMin", AclOpFunctions(aclnnAminGetWorkspaceSize, aclnnAmin)},
{"ReduceSum", AclOpFunctions(aclnnReduceSumGetWorkspaceSize, aclnnReduceSum)},
{"Triu", AclOpFunctions(aclnnTriuGetWorkspaceSize, aclnnTriu)},
{"Conv2d", AclOpFunctions(aclnnConvolutionGetWorkspaceSize, aclnnConvolution)},
{"Conv2dBackward", AclOpFunctions(aclnnConvolutionBackwardGetWorkspaceSize, aclnnConvolutionBackward)},
{"ReduceMean", AclOpFunctions(aclnnMeanGetWorkspaceSize, aclnnMean)},
// {"ReduceProd", AclOpFunctions(aclnnProdDimGetWorkspaceSize, aclnnProdDim)},
{"Select", AclOpFunctions(aclnnSWhereGetWorkspaceSize, aclnnSWhere)},
{"RandomUniform", AclOpFunctions(aclnnInplaceUniformGetWorkspaceSize, aclnnInplaceUniform)},
{"RandomNormal", AclOpFunctions(aclnnInplaceNormalGetWorkspaceSize, aclnnInplaceNormal)},
{"Transpose", AclOpFunctions(aclnnPermuteGetWorkspaceSize, aclnnPermute)},
{"Maxpool", AclOpFunctions(aclnnMaxPool2dWithIndicesGetWorkspaceSize, aclnnMaxPool2dWithIndices)},
{"MaxpoolBackward", AclOpFunctions(aclnnMaxPool2dWithIndicesBackwardGetWorkspaceSize, aclnnMaxPool2dWithIndicesBackward)},
{"Avgpool", AclOpFunctions(aclnnAvgPool2dGetWorkspaceSize, aclnnAvgPool2d)},
{"AvgpoolBackward", AclOpFunctions(aclnnAvgPool2dBackwardGetWorkspaceSize, aclnnAvgPool2dBackward)},
{"Flip", AclOpFunctions(aclnnFlipGetWorkspaceSize, aclnnFlip)},
{"Concat", AclOpFunctions(aclnnCatGetWorkspaceSize, aclnnCat)},
{"Gather", AclOpFunctions(aclnnGatherGetWorkspaceSize, aclnnGather)},
{"Cumsum", AclOpFunctions(aclnnCumsumGetWorkspaceSize, aclnnCumsum)},
{"Index", AclOpFunctions(aclnnIndexGetWorkspaceSize, aclnnIndex)},
{"Scatter", AclOpFunctions(aclnnScatterGetWorkspaceSize, aclnnScatter)},
{"Nonzero", AclOpFunctions(aclnnNonzeroGetWorkspaceSize, aclnnNonzero)},
{"Where", AclOpFunctions(aclnnSWhereGetWorkspaceSize, aclnnSWhere)},
{"Floor", AclOpFunctions(aclnnFloorGetWorkspaceSize, aclnnFloor)},
{"StridedSliceAssignV2", AclOpFunctions(aclnnStridedSliceAssignV2GetWorkspaceSize, aclnnStridedSliceAssignV2)},
{"SliceV2", AclOpFunctions(aclnnSliceV2GetWorkspaceSize, aclnnSliceV2)},
{"IndexPutImpl", AclOpFunctions(aclnnIndexPutImplGetWorkspaceSize, aclnnIndexPutImpl)},
{"IndexPutImplAccumulate", AclOpFunctions(aclnnIndexPutImplGetWorkspaceSize, aclnnIndexPutImpl)},
{"Range", AclOpFunctions(aclnnRangeGetWorkspaceSize, aclnnRange)},
{"ReLU", AclOpFunctions(aclnnReluGetWorkspaceSize, aclnnRelu)},
{"LeakyReLU", AclOpFunctions(aclnnLeakyReluGetWorkspaceSize, aclnnLeakyRelu)},
{"LeakyReLUBackward", AclOpFunctions(aclnnLeakyReluBackwardGetWorkspaceSize, aclnnLeakyReluBackward)},
{"Dropout", AclOpFunctions(aclnnDropoutGetWorkspaceSize, aclnnDropout)},
{"DropoutBackward", AclOpFunctions(aclnnDropoutBackwardGetWorkspaceSize, aclnnDropoutBackward)},
{"SiLU", AclOpFunctions(aclnnSiluGetWorkspaceSize, aclnnSilu)},
{"SiLUBackward", AclOpFunctions(aclnnSiluBackwardGetWorkspaceSize, aclnnSiluBackward)},
{"Sigmoid", AclOpFunctions(aclnnSigmoidGetWorkspaceSize, aclnnSigmoid)},
{"SigmoidBackward", AclOpFunctions(aclnnSigmoidBackwardGetWorkspaceSize, aclnnSigmoidBackward)},
{"Embedding", AclOpFunctions(aclnnEmbeddingGetWorkspaceSize, aclnnEmbedding)},
{"EmbeddingBackward", AclOpFunctions(aclnnEmbeddingDenseBackwardGetWorkspaceSize, aclnnEmbeddingDenseBackward)},
{"InplaceMaskedScatter", AclOpFunctions(aclnnInplaceMaskedScatterGetWorkspaceSize, aclnnInplaceMaskedScatter)},
{"MaskedSelect", AclOpFunctions(aclnnMaskedSelectGetWorkspaceSize, aclnnMaskedSelect)},
{"SplitWithSize", AclOpFunctions(aclnnSplitWithSizeGetWorkspaceSize, aclnnSplitWithSize)},
{"Softmax", AclOpFunctions(aclnnSoftmaxGetWorkspaceSize, aclnnSoftmax)},
{"SoftmaxBackward", AclOpFunctions(aclnnSoftmaxBackwardGetWorkspaceSize, aclnnSoftmaxBackward)},
{"FlashAttention", AclOpFunctions(aclnnFlashAttentionScoreV2GetWorkspaceSize, aclnnFlashAttentionScoreV2)},
{"FlashAttentionBackward", AclOpFunctions(aclnnFlashAttentionScoreGradV2GetWorkspaceSize, aclnnFlashAttentionScoreGradV2)},
{"BatchNorm", AclOpFunctions(aclnnBatchNormGetWorkspaceSize, aclnnBatchNorm)},
{"BatchNormBackward", AclOpFunctions(aclnnBatchNormBackwardGetWorkspaceSize, aclnnBatchNormBackward)},
{"LayerNorm", AclOpFunctions(aclnnLayerNormGetWorkspaceSize, aclnnLayerNorm)},
{"RotaryPosEmb", AclOpFunctions(aclnnApplyRotaryPosEmbGetWorkspaceSize, aclnnApplyRotaryPosEmb)},
{"Stack", AclOpFunctions(aclnnStackGetWorkspaceSize, aclnnStack)},
{"NanToNum", AclOpFunctions(aclnnNanToNumGetWorkspaceSize, aclnnNanToNum)},
};
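
Every entry in aclOpFuncMap pairs a ...GetWorkspaceSize query with its execute function, following the standard two-phase aclnn convention: query the scratch-memory requirement, grow the shared workspace via mallocWorkSpace (which pads by 32 bytes and rounds up to a 32-byte multiple, so a 100-byte request allocates 160 bytes), then launch on the global aclstream. A hedged sketch of the unary path (run_unary is a hypothetical helper, not from the diff; error handling follows the CHECK_RET/LOG_PRINT pattern used elsewhere in this commit):

// Hypothetical dispatch helper for a unary op such as "Abs".
// Assumes `in` and `out` are valid aclTensor* and the device is initialized.
static void run_unary(const std::string &name, aclTensor *in, aclTensor *out)
{
    auto &funcs = aclOpFuncMap.at(name);
    uint64_t workspace_size = 0;
    aclOpExecutor *executor = nullptr;
    // Phase 1: ask the op how much workspace it needs.
    auto ret = funcs.getWorkspaceSizeFuncUnaryNonzero(in, out, &workspace_size, &executor);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("workspace query failed. ERROR: %d\n", ret); return);
    // Grow the cached workspace if the current one is too small.
    mallocWorkSpace(workspace_size);
    // Phase 2: launch asynchronously on the global ACL stream.
    ret = funcs.executeFunc(workspaceAddr, workspace_size, executor, aclstream);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("execute failed. ERROR: %d\n", ret); return);
}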
struct AclOpAttr
{
virtual ~AclOpAttr() {}
};
struct ConvAttr : AclOpAttr
{
vector<int64_t> convStrides;
vector<int64_t> convPads;
vector<int64_t> convOutPads;
vector<int64_t> convDilations;
bool convWithBias;
bool is_transposed;
int64_t group;
// Destructor
~ConvAttr()
{
convStrides.clear();
convPads.clear();
convOutPads.clear();
convDilations.clear();
}
};
struct ReduceAttr : AclOpAttr
{
vector<int64_t> axes;
// for proddim
int64_t prod_dim;
bool keepdims;
~ReduceAttr()
{
axes.clear();
}
};
struct RandomAttr : AclOpAttr
{
int64_t seed, offset;
~RandomAttr()
{
}
};
struct TriuAttr : AclOpAttr
{
int64_t diagonal;
~TriuAttr()
{
}
};
struct PoolAttr : AclOpAttr
{
vector<int64_t> kernel_size;
vector<int64_t> poolStrides;
vector<int64_t> poolPads;
vector<int64_t> poolDilations;
bool poolCeil;
bool countIncludePad;
// divisorOverride (const int64_t, compute input): the divisor used when averaging. Data type INT64. The default value 0 disables the feature.
// https://www.hiascend.com/document/detail/zh/canncommercial/80RC2/apiref/appdevgapi/context/aclnnAvgPool2d.md
int64_t divisorOverride = 0;
// cubeMathType (int8_t, compute input): host-side integer selecting which compute mode the Cube unit uses. Data type INT8. Unless noted otherwise, computation keeps the original input data type. Supported enum values:
// 0: KEEP_DTYPE - compute in the input's data type. For FLOAT input, Atlas training-series and Atlas inference-series (Ascend 310P) products do not support this; passing 0 raises an error.
// 1: ALLOW_FP32_DOWN_PRECISION - allow computing at reduced precision. For FLOAT input, Atlas training-series and Atlas inference-series (Ascend 310P) products may convert to FLOAT16.
// 2: USE_FP16 - allow converting to FLOAT16 for computation. FLOAT inputs are computed as FLOAT16.
// 3: USE_HF32 - allow converting to HFLOAT32 for computation. For FLOAT input, Atlas training-series, Atlas inference-series (Ascend 310P), and Atlas A2 training-series / Atlas 800I A2 inference products do not support this; passing 3 raises an error.
// https://www.hiascend.com/document/detail/zh/canncommercial/80RC2/apiref/appdevgapi/context/aclnnAvgPool2d.md
int8_t cubeMathType = 0;
// Destructor
~PoolAttr()
{
kernel_size.clear();
poolStrides.clear();
poolPads.clear();
poolDilations.clear();
}
};
struct ConcatAttr : AclOpAttr
{
int64_t tensorNum;
int64_t dim;
~ConcatAttr()
{
}
};
struct GatherAttr : AclOpAttr
{
int64_t dim;
~GatherAttr()
{
}
};
struct ScatterAttr : AclOpAttr
{
int64_t axis;
int64_t reduction;
~ScatterAttr()
{
}
};
struct StrideAttr : AclOpAttr
{
vector<int64_t> begins;
vector<int64_t> ends;
vector<int64_t> steps;
vector<int64_t> axes;
~StrideAttr()
{
begins.clear();
ends.clear();
steps.clear();
axes.clear();
}
};
struct RangeAttr : AclOpAttr
{
int64_t start;
int64_t end;
int64_t step;
~RangeAttr()
{
}
};
struct LeakyReluAttr : AclOpAttr
{
float negativeSlope;
bool selfIsResult;
~LeakyReluAttr()
{
}
};
struct DropoutAttr : AclOpAttr
{
float p;
bool train;
int64_t seed;
int64_t offset;
float scale;
~DropoutAttr()
{
}
};
struct EmbeddingAttr : AclOpAttr
{
int64_t numEmbeddings;
// int64_t embeddingDim;
int64_t paddingIdx;
bool scaleGradByFreq;
// bool sparse;
// bool isSparse;
// bool isDense;
~EmbeddingAttr()
{
}
};
struct SplitWithSizeAttr : AclOpAttr
{
vector<int64_t> splitSize;
int64_t dim;
~SplitWithSizeAttr()
{
splitSize.clear();
}
};
struct SoftmaxAttr : AclOpAttr
{
int64_t dim;
~SoftmaxAttr()
{
}
};
struct BatchNormAttr : AclOpAttr
{
bool is_train;
float momentum;
float eps;
~BatchNormAttr()
{
}
};
struct LayerNormAttr : AclOpAttr
{
float eps;
vector<int64_t> normalizedShape;
int64_t size;
~LayerNormAttr()
{
normalizedShape.clear();
}
};
struct FlashAttentionAttr : AclOpAttr
{
vector<int64_t> prefix;
vector<int64_t> qStartIdx;
vector<int64_t> kvStartIdx;
float scale;
float keepProb;
int64_t preToken;
int64_t nextToken;
int64_t headNum;
string inputLayout;
int64_t innerPrecise;
int64_t sparseMode;
int64_t psetype;
bool hasRealshift;
bool hasDropmask;
bool hasPaddingmask;
bool hasAttentmask;
~FlashAttentionAttr()
{
prefix.clear();
qStartIdx.clear();
kvStartIdx.clear();
}
};
struct NanToNumAttr : AclOpAttr
{
float nan;
float posinf;
float neginf;
~NanToNumAttr()
{
}
};
}
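
The attr structs above are populated from Python by splicing a small C++ snippet into the generated kernel (the attr_code pattern used by the wrappers later in this diff). A minimal sketch for PoolAttr, assuming the same op.op_attr.reset(...) convention; the jt_name and field values here are illustrative only:

attr_code = """
op.jt_name = "avgpool2d";
PoolAttr *attr = new PoolAttr();
attr->kernel_size = {2, 2};
attr->poolStrides = {2, 2};
attr->poolPads = {0, 0};
attr->poolDilations = {1, 1};
attr->poolCeil = false;
attr->countIncludePad = true;
op.op_attr.reset(attr);
"""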

File diff suppressed because it is too large

python/jittor/extern/acl/aclnn/aclnn.cc vendored Normal file

@ -0,0 +1,58 @@
#include <iostream>
#include <vector>
#include "aclnn.h"
int64_t GetShapeSize(const std::vector<int64_t>& shape) {
int64_t shapeSize = 1;
for (auto i : shape) {
shapeSize *= i;
}
return shapeSize;
}
void PrintOutResult(std::vector<int64_t> &shape, void** deviceAddr) {
auto size = GetShapeSize(shape);
std::vector<int> resultData(size, 0);
auto ret = aclrtMemcpy(resultData.data(), resultData.size() * sizeof(resultData[0]),
*deviceAddr, size * sizeof(resultData[0]), ACL_MEMCPY_DEVICE_TO_HOST);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("copy result from device to host failed. ERROR: %d\n", ret); return);
for (int64_t i = 0; i < size; i++) {
LOG_PRINT("mean result[%ld] is: %d\n", i, resultData[i]);
}
}
/*int Init(int32_t deviceId) {
// Standard boilerplate: initialize AscendCL
auto ret = aclInit(nullptr);
CHECK_RET(ret == ACL_SUCCESS or ret == 100002, LOG_PRINT("aclInit failed. ERROR: %d\n", ret); return ret);
ret = aclrtSetDevice(deviceId);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret); return ret);
//ret = aclrtCreateStream(stream);
//CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtCreateStream failed. ERROR: %d\n", ret); return ret);
return 0;
}*/
/*
template <typename T>
int CreateAclTensor(const std::vector<T>& hostData, const std::vector<int64_t>& shape, void** deviceAddr,
aclDataType dataType, aclTensor** tensor) {
auto size = GetShapeSize(shape) * sizeof(T);
// Call aclrtMalloc to allocate device-side memory
auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret); return ret);
// Call aclrtMemcpy to copy host-side data to device memory
ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size, ACL_MEMCPY_HOST_TO_DEVICE);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMemcpy failed. ERROR: %d\n", ret); return ret);
// Compute the strides of a contiguous tensor
std::vector<int64_t> strides(shape.size(), 1);
for (int64_t i = shape.size() - 2; i >= 0; i--) {
strides[i] = shape[i + 1] * strides[i + 1];
}
// Call aclCreateTensor to create an aclTensor
*tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
shape.data(), shape.size(), *deviceAddr);
return 0;
}*/

python/jittor/extern/acl/aclnn/aclnn.h vendored Normal file

@ -0,0 +1,134 @@
#include <iostream>
#include <vector>
#include "acl.h"
// unary
#include "aclnnop/aclnn_abs.h"
#include "aclnnop/aclnn_neg.h"
#include "aclnnop/aclnn_exp.h"
#include "aclnnop/aclnn_log.h"
#include "aclnnop/aclnn_sqrt.h"
#include "aclnnop/aclnn_ceil.h"
#include "aclnnop/aclnn_floor.h"
#include "aclnnop/aclnn_round.h"
#include "aclnnop/aclnn_sin.h"
#include "aclnnop/aclnn_cos.h"
#include "aclnnop/aclnn_tan.h"
#include "aclnnop/aclnn_asin.h"
#include "aclnnop/aclnn_acos.h"
#include "aclnnop/aclnn_atan.h"
#include "aclnnop/aclnn_sinh.h"
#include "aclnnop/aclnn_cosh.h"
#include "aclnnop/aclnn_tanh.h"
#include "aclnnop/aclnn_asinh.h"
#include "aclnnop/aclnn_acosh.h"
#include "aclnnop/aclnn_atanh.h"
#include "aclnnop/aclnn_sigmoid.h"
#include "aclnnop/aclnn_erf.h"
#include "aclnnop/aclnn_erfinv.h"
#include "aclnnop/aclnn_logical_not.h"
#include "aclnnop/aclnn_bitwise_not.h"
#include "aclnnop/aclnn_cast.h"
#include "aclnnop/aclnn_nonzero.h"
// binary
#include "aclnnop/aclnn_maximum.h"
#include "aclnnop/aclnn_minimum.h"
#include "aclnnop/aclnn_add.h"
#include "aclnnop/aclnn_sub.h"
#include "aclnnop/aclnn_mul.h"
#include "aclnnop/aclnn_div.h"
#include "aclnnop/aclnn_floor_divide.h"
#include "aclnnop/aclnn_le_tensor.h"
#include "aclnnop/aclnn_lt_tensor.h"
#include "aclnnop/aclnn_ge_tensor.h"
#include "aclnnop/aclnn_gt_tensor.h"
#include "aclnnop/aclnn_eq_tensor.h"
#include "aclnnop/aclnn_ne_tensor.h"
#include "aclnnop/aclnn_logical_and.h"
#include "aclnnop/aclnn_logical_or.h"
#include "aclnnop/aclnn_logical_xor.h"
#include "aclnnop/aclnn_bitwise_and_tensor.h"
#include "aclnnop/aclnn_bitwise_or_tensor.h"
#include "aclnnop/aclnn_bitwise_xor_tensor.h"
#include "aclnnop/aclnn_pow_tensor_tensor.h"
#include "aclnnop/aclnn_expand.h"
#include "aclnnop/aclnn_matmul.h"
#include "aclnnop/aclnn_batch_matmul.h"
#include "aclnnop/aclnn_convolution.h"
#include "aclnnop/aclnn_convolution_backward.h"
#include "aclnnop/aclnn_reduce_sum.h"
#include "aclnnop/aclnn_amax.h"
#include "aclnnop/aclnn_amin.h"
#include "aclnnop/aclnn_mean.h"
#include "aclnnop/aclnn_prod.h"
#include "aclnnop/aclnn_triu.h"
#include "aclnnop/aclnn_s_where.h"
#include "aclnnop/aclnn_random.h"
#include "aclnnop/aclnn_normal.h"
#include "aclnnop/aclnn_permute.h"
#include "aclnnop/aclnn_max_pool2d_with_indices.h"
#include "aclnnop/aclnn_max_pool2d_with_indices_backward.h"
#include "aclnnop/aclnn_avgpool2d.h"
#include "aclnnop/aclnn_avgpool2d_backward.h"
#include "aclnnop/aclnn_flip.h"
#include "aclnnop/aclnn_cat.h"
#include "aclnnop/aclnn_gather.h"
#include "aclnnop/aclnn_cumsum.h"
#include "aclnnop/aclnn_index.h"
#include "aclnnop/aclnn_scatter.h"
#include "aclnnop/aclnn_index.h"
#include "aclnnop/aclnn_strided_slice_assign_v2.h"
#include "aclnnop/aclnn_slice_v2.h"
#include "aclnnop/aclnn_index_put_impl.h"
#include "aclnnop/aclnn_range.h"
#include "aclnnop/aclnn_relu.h"
#include "aclnnop/aclnn_dropout.h"
#include "aclnnop/aclnn_dropout_backward.h"
#include "aclnnop/aclnn_leaky_relu.h"
#include "aclnnop/aclnn_leaky_relu_backward.h"
#include "aclnnop/aclnn_uniform.h"
#include "aclnnop/aclnn_silu.h"
#include "aclnnop/aclnn_silu_backward.h"
#include "aclnnop/aclnn_sigmoid.h"
#include "aclnnop/aclnn_sigmoid_backward.h"
#include "aclnnop/aclnn_embedding.h"
#include "aclnnop/aclnn_embedding_dense_backward.h"
#include "aclnnop/aclnn_masked_scatter.h"
#include "aclnnop/aclnn_masked_select.h"
#include "aclnnop/aclnn_split_with_size.h"
#include "aclnnop/aclnn_flash_attention_score.h"
#include "aclnnop/aclnn_flash_attention_score_grad.h"
#include "aclnnop/aclnn_softmax.h"
#include "aclnnop/aclnn_softmax_backward.h"
#include "aclnnop/aclnn_batch_norm.h"
#include "aclnnop/aclnn_batch_norm_backward.h"
#include "aclnnop/aclnn_layer_norm.h"
#include "aclnnop/aclnn_apply_rotary_pos_emb.h"
#include "aclnnop/aclnn_stack.h"
#include "aclnnop/aclnn_nan_to_num.h"
#define CHECK_RET(cond, return_expr) \
do \
{ \
if (!(cond)) \
{ \
return_expr; \
} \
} while (0)
#define LOG_PRINT(message, ...) \
do \
{ \
printf(message, ##__VA_ARGS__); \
} while (0)
int64_t GetShapeSize(const std::vector<int64_t> &shape);
void PrintOutResult(std::vector<int64_t> &shape, void **deviceAddr);
//int Init(int32_t deviceId);
/*
template <typename T>
int CreateAclTensor(const std::vector<T>& hostData, const std::vector<int64_t>& shape, void** deviceAddr,
aclDataType dataType, aclTensor** tensor);
*/


@ -0,0 +1,33 @@
#pragma once
#include <acl/aclops/binary_op_acl.h>
#include <acl/aclops/unary_op_acl.h>
#include <acl/aclops/conv_op_acl.h>
#include <acl/aclops/ternary_op_acl.h>
#include <acl/aclops/reduce_op_acl.h>
#include <acl/aclops/expand_op_acl.h>
#include <acl/aclops/getitem_op_acl.h>
#include <acl/aclops/setitem_op_acl.h>
#include <acl/aclops/matmul_op_acl.h>
#include <acl/aclops/random_op_acl.h>
#include <acl/aclops/bmm_op_acl.h>
#include <acl/aclops/pool_op_acl.h>
#include <acl/aclops/flip_op_acl.h>
#include <acl/aclops/concat_op_acl.h>
#include <acl/aclops/gather_scatter_op_acl.h>
#include <acl/aclops/cumsum_op_acl.h>
#include <acl/aclops/index_op_acl.h>
#include <acl/aclops/where_op_acl.h>
#include <acl/aclops/floor_op_acl.h>
#include <acl/aclops/transpose_op_acl.h>
#include <acl/aclops/flashattention_op_acl.h>
#include <acl/aclops/relu_op_acl.h>
#include <acl/aclops/dropout_op_acl.h>
#include <acl/aclops/silu_op_acl.h>
#include <acl/aclops/sigmoid_op_acl.h>
#include <acl/aclops/softmax_op_acl.h>
#include <acl/aclops/stack_op_acl.h>
#include <acl/aclops/nantonum_op_acl.h>
#include <acl/aclops/rope_op_acl.h>
#include <acl/aclops/triu_op_acl.h>
#include <acl/aclops/embedding_op_acl.h>
#include <acl/aclops/norms_op_acl.h>


@ -0,0 +1,56 @@
#pragma once
#include "utils.h"
#include "acl_jittor.h"
namespace jittor
{
extern int sync_run;
class BaseOpRunner
{
protected:
vector<Var *> in_;
vector<Var *> out_;
int ret = -1;
uint64_t workspaceSize = 0;
aclOpExecutor *executor;
bool is_group_op = false;
std::vector<std::vector<int64_t>> inputShapes;
std::vector<std::vector<int64_t>> outputShapes;
std::vector<aclTensor *> inputTensors;
std::vector<aclTensor *> outputTensors;
public:
string name;
string jt_name;
std::unique_ptr<AclOpAttr> op_attr;
bool use_nchw = false;
BaseOpRunner(const string &name = "") : name(name) {}
virtual ~BaseOpRunner() = default;
// Common functionality for adding input/output variables
void add(Var *v, bool is_input);
virtual void setupInputDesc();
void cleanupDesc();
virtual void setupOutputDesc();
virtual void syncRun();
void checkRet(aclnnStatus ret);
// Base run method with common operator lookup logic
void run();
protected:
// Virtual method for specific operator execution
virtual void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) = 0;
void cleanupAttr();
};
}


@ -0,0 +1,152 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "binary_op_acl.h"
#include "base_op.h"
namespace jittor
{
extern int sync_run;
// Common functionality for adding input/output variables
void BaseOpRunner::add(Var *v, bool is_input)
{
if (is_input)
{
in_.push_back(v);
}
else
{
out_.push_back(v);
}
return;
}
void BaseOpRunner::setupInputDesc()
{
auto input_num = in_.size();
for (int input_idx = 0; input_idx < input_num; input_idx++)
{
std::vector<int64_t> shape;
for (int j = 0; j < in_[input_idx]->shape.size(); j++)
{
shape.push_back(in_[input_idx]->shape[j]);
}
inputShapes.push_back(shape);
}
for (int idx = 0; idx < input_num; idx++)
{
inputTensors.push_back(nullptr);
auto ret = CreateAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
CHECK_RET(ret == ACL_SUCCESS, return);
}
}
void BaseOpRunner::cleanupDesc()
{
auto input_num = in_.size();
auto output_num = out_.size();
for (int idx = 0; idx < input_num; idx++)
{
aclDestroyTensor(inputTensors[idx]);
}
for (int idx = 0; idx < output_num; idx++)
{
aclDestroyTensor(outputTensors[idx]);
}
}
void BaseOpRunner::setupOutputDesc()
{
auto output_num = out_.size();
for (int output_idx = 0; output_idx < output_num; output_idx++)
{
std::vector<int64_t> shape;
for (int j = 0; j < out_[output_idx]->shape.size(); j++)
{
shape.push_back(out_[output_idx]->shape[j]);
}
outputShapes.push_back(shape);
}
for (int idx = 0; idx < output_num; idx++)
{
outputTensors.push_back(nullptr);
auto ret = CreateAclTensor(outputShapes[idx], out_[idx]->mem_ptr, out_[idx]->size, get_dtype(out_[idx]->dtype()), &outputTensors[idx], use_nchw);
CHECK_RET(ret == ACL_SUCCESS, return);
}
}
void BaseOpRunner::syncRun()
{
if (sync_run)
{
// ret = aclrtSynchronizeStream(aclstream);
// CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclrtSynchronizeStream failed. ERROR: %d\n", name.c_str(), ret); return);
}
}
void BaseOpRunner::checkRet(aclnnStatus ret)
{
if (ret != ACL_SUCCESS)
{
auto tmp_err_msg = aclGetRecentErrMsg();
LOGir << name << ", " << tmp_err_msg;
}
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnxxxGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
}
// Base run method with common operator lookup logic
void BaseOpRunner::run()
{
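    // Group runners (unary/binary/...) dispatch through the shared aclOpFuncMap
    // table and require a valid entry; named runners override executeOp and may
    // ignore the iterator, so the lookup is left unchecked in the else branch.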
if (is_group_op)
{
auto it = aclOpFuncMap.find(name);
if (it == aclOpFuncMap.end())
{
LOGir << "op not found in aclOpFuncMap: " << name;
throw std::runtime_error("Unsupported operation type.");
}
setupInputDesc();
setupOutputDesc();
executeOp(it);
cleanupDesc();
}
else
{
auto it = aclOpFuncMap.find(name);
setupInputDesc();
setupOutputDesc();
executeOp(it);
cleanupDesc();
}
}
}


@ -0,0 +1,124 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "binary_op_acl.h"
namespace jittor
{
BinaryOpRunner::BinaryOpRunner() : BaseOpRunner("binary")
{
use_nchw = false;
is_group_op = true;
}
void BinaryOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
aclScalar *alpha = nullptr;
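    // aclnnAdd/aclnnSub take an extra scalar: out = self +/- alpha * other.
    // We want a plain add/sub, so alpha is fixed to 1 in the input dtype.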
if (name == string("Add") || name == string("Sub"))
{
if (get_dtype(in_[0]->dtype()) == ACL_FLOAT)
{
float alphaValue = 1.0;
alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
}
else if (get_dtype(in_[0]->dtype()) == ACL_FLOAT16)
{
__fp16 alphaValue = 1.0;
alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
}
else if (get_dtype(in_[0]->dtype()) == ACL_INT64)
{
int64_t alphaValue = 1;
alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
}
else if (get_dtype(in_[0]->dtype()) == ACL_INT32)
{
int alphaValue = 1;
alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
}
else if (get_dtype(in_[0]->dtype()) == ACL_INT8)
{
int8_t alphaValue = 1;
alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
}
else if (get_dtype(in_[0]->dtype()) == ACL_INT16)
{
int16_t alphaValue = 1;
alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
}
else if (get_dtype(in_[0]->dtype()) == ACL_UINT8)
{
uint8_t alphaValue = 1;
alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
}
else if (get_dtype(in_[0]->dtype()) == ACL_UINT16)
{
uint16_t alphaValue = 1;
alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
}
else if (get_dtype(in_[0]->dtype()) == ACL_UINT32)
{
uint32_t alphaValue = 1;
alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
}
else if (get_dtype(in_[0]->dtype()) == ACL_BOOL)
{
bool alphaValue = true;
alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
}
else
{
LOGf << "Unsupported dtype: " << in_[0]->dtype();
}
CHECK_RET(alpha != nullptr, return);
ret = it->second.getWorkspaceSizeFuncAdd(inputTensors[0], inputTensors[1], alpha, outputTensors[0], &workspaceSize, &executor);
}
else
{
ret = it->second.getWorkspaceSizeFuncBinary(inputTensors[0], inputTensors[1], outputTensors[0], &workspaceSize, &executor);
}
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = it->second.executeFunc(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnxxx failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
aclDestroyScalar(alpha);
return;
}
}


@ -0,0 +1,14 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
struct BinaryOpRunner : public BaseOpRunner
{
BinaryOpRunner();
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
};
}


@ -0,0 +1,128 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def acl_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None,
extra_data: dict = {}):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
    {name}OpRunner op;
{input_code}
op.add(out0, false);
{attr_code}
op.run();""",
data=extra_data)
class BmmACL(jt.Function):
def __init__(self, trans_x2=False):
super(BmmACL, self).__init__()
self.trans_x2 = trans_x2
def execute(self, x1, x2):
self.input = [x1, x2]
result = acl_cmd("BatchMatMul", [x1, x2],
output_dtypes=[x1.dtype],
output_shapes=[
x1.shape[:-1] + x2.shape[-2:-1] if self.trans_x2
else x1.shape[:-1] + x2.shape[-1:]
],
attr_code="op.jt_name=\"bmm_trans_1\";"
if self.trans_x2 else "op.jt_name=\"bmm\";")[0]
return result
def grad(self, grad_output):
x1, x2 = self.input
        reshape_grad_x2 = len(x1) != len(x2)
grad_x1 = acl_cmd(
"BatchMatMul", [grad_output, x2],
output_dtypes=[x1.dtype],
output_shapes=[
grad_output.shape[:-1] + x2.shape[-2:-1] if not self.trans_x2
else grad_output.shape[:-1] + x1.shape[-1:]
],
attr_code="op.jt_name=\"bmm_trans_1\";"
if not self.trans_x2 else "op.jt_name=\"bmm\";")[0]
if self.trans_x2:
if reshape_grad_x2:
output_shape = grad_output.shape[1:-2] + grad_output.shape[
-1:] + x1.shape[-1:]
grad_x2 = acl_cmd("BatchMatMul", [
grad_output.reshape(-1, grad_output.shape[-1]),
x1.reshape(-1, x1.shape[-1])
],
output_dtypes=[x2.dtype],
output_shapes=[output_shape],
attr_code="op.jt_name=\"bmm_trans_0\";")[0]
else:
output_shape = grad_output.shape[:-2] + grad_output.shape[
-1:] + x1.shape[-1:]
grad_x2 = acl_cmd("BatchMatMul", [grad_output, x1],
output_dtypes=[x2.dtype],
output_shapes=[output_shape],
attr_code="op.jt_name=\"bmm_trans_0\";")[0]
else:
if reshape_grad_x2:
output_shape = x1.shape[1:-2] + x1.shape[
-1:] + grad_output.shape[-1:]
grad_x2 = acl_cmd("BatchMatMul", [
x1.reshape(-1, x1.shape[-1]),
grad_output.reshape(-1, grad_output.shape[-1])
],
output_dtypes=[x2.dtype],
output_shapes=[output_shape],
attr_code="op.jt_name=\"bmm_trans_0\";")[0]
else:
output_shape = x1.shape[:-2] + x1.shape[
-1:] + grad_output.shape[-1:]
grad_x2 = acl_cmd("BatchMatMul", [x1, grad_output],
output_dtypes=[x2.dtype],
output_shapes=[output_shape],
attr_code="op.jt_name=\"bmm_trans_0\";")[0]
if len(grad_x1.shape) > len(x1.shape):
grad_x1 = grad_x1.sum(0)
if len(grad_x2.shape) > len(x2.shape):
grad_x2 = grad_x2.sum(0)
return grad_x1, grad_x2
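
A short usage sketch for BmmACL, assuming an Ascend/ACL device is active and following the trans_x2=False path above:

import jittor as jt

bmm = BmmACL()
x1 = jt.rand(4, 5, 6)
x2 = jt.rand(4, 6, 7)
y = bmm(x1, x2)                       # (4, 5, 7) via aclnnBatchMatMul
g1, g2 = jt.grad(y.sum(), [x1, x2])   # grads match the shapes of x1 and x2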


@ -0,0 +1,77 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "bmm_op_acl.h"
namespace jittor
{
BatchMatMulOpRunner::BatchMatMulOpRunner() : BaseOpRunner("BatchMatMul")
{
}
void BatchMatMulOpRunner::setupInputDesc()
{
auto input_num = in_.size();
for (int input_idx = 0; input_idx < input_num; input_idx++)
{
std::vector<int64_t> shape;
for (int j = 0; j < in_[input_idx]->shape.size(); j++)
{
shape.push_back(in_[input_idx]->shape[j]);
}
inputShapes.push_back(shape);
}
for (int idx = 0; idx < input_num; idx++)
{
inputTensors.push_back(nullptr);
if ((jt_name == "bmm_trans_1" && idx == 1) || (jt_name == "bmm_trans_0" && idx == 0))
{
auto ret = CreateFakeTransAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
CHECK_RET(ret == ACL_SUCCESS, return);
}
else
{
auto ret = CreateAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
CHECK_RET(ret == ACL_SUCCESS, return);
}
}
}
void BatchMatMulOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
ret = aclnnBatchMatMulGetWorkspaceSize(inputTensors[0], inputTensors[1], outputTensors[0], 1, &workspaceSize, &executor);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnBatchMatMulGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnBatchMatMul failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
}
}


@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class BatchMatMulOpRunner : public BaseOpRunner
{
protected:
void setupInputDesc() override;
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
BatchMatMulOpRunner();
};
}


@ -0,0 +1,186 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def concat_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class ConcatACL(jt.Function):
def __init__(self):
super(ConcatACL, self).__init__()
def __call__(self, *args):
assert isinstance(args[0], (list, tuple))
assert isinstance(args[1], int)
if jt.flags.no_grad:
return self.execute(*args)
backup = args
args = list(args)
taped_inputs = []
taped_outputs = []
input_mask = [-1] * (len(args[0]) + 1)
newargs = [list(), args[1]]
for i, v in enumerate(args[0]):
if isinstance(v, jt.Var):
if v.is_stop_grad():
# -2 in input_mask represents it is stop_grad
input_mask[i] = -2
newargs[0].append(v)
continue
v = v.tape()
newargs[0].append(v)
input_mask[i] = len(taped_inputs)
taped_inputs.append(v)
ori_res = self.execute(*newargs)
if not isinstance(ori_res, Sequence):
res = [ori_res]
else:
res = list(ori_res)
output_mask = [-1] * len(res)
for i, v in enumerate(res):
if isinstance(v, jt.Var):
v = v.tape()
output_mask[i] = len(taped_outputs)
res[i] = v
taped_outputs.append(v)
self.input_mask = input_mask
self.output_mask = output_mask
# tape output and input together so
# backward treat them as one operator
jt.tape_together(taped_inputs, taped_outputs, self._grad)
if isinstance(ori_res, Sequence):
return res
else:
return res[0]
def execute(self, input_tensors, dim=0):
        for t in input_tensors:
            if not (-t.ndim <= dim < t.ndim):
                raise ValueError(
                    f"dim {dim} out of range for tensor with shape {t.shape}")
if dim < 0:
dim += input_tensors[0].ndim
self.input = input_tensors
self.dim = dim
for i in range(len(input_tensors)):
if input_tensors[i].dtype != input_tensors[0].dtype:
raise ValueError("All input tensors must have the same dtype")
if input_tensors[i].shape[:dim] != input_tensors[
0].shape[:dim] or input_tensors[i].shape[
dim + 1:] != input_tensors[0].shape[dim + 1:]:
raise ValueError("All input tensors must have the same shape")
attr_code = f"""
op.jt_name = "concat";
ConcatAttr *attr = new ConcatAttr();
attr->tensorNum = {len(input_tensors)};
attr->dim = {dim};
op.op_attr.reset(attr);
"""
result = concat_cmd(
"Concat",
input_tensors,
output_dtypes=[input_tensors[0].dtype],
output_shapes=[
                self.calculate_output_shape(input_tensors, dim)
],
attr_code=attr_code)[0]
return result
def _grad(self, *args):
new_args = ((args[i] if i >= 0 else None) for i in self.output_mask)
ret = self.grad(*new_args)
new_ret = []
for i, r in enumerate(ret):
j = self.input_mask[i]
if j < 0:
# -2 in input_mask represents it is stop_grad
assert r is None or j==-2, f"{type(self)}'s {i}-th returned grad should be None, "\
"because the input value is not jittor variable."
else:
new_ret.append(r)
return new_ret
def grad(self, grad_output):
grad_inputs = self.split_grad(grad_output, self.input, self.dim)
return grad_inputs
def calculate_output_shape(self, input_tensors, axis):
shape = list(input_tensors[0].shape)
for tensor in input_tensors[1:]:
shape[axis] += tensor.shape[axis]
return tuple(shape)
def split_grad(self, grad_output, input_tensors, axis):
offset = []
shapeVec = []
dtypeVec = []
for tensor in input_tensors:
offset.append(tensor.shape[axis])
dtypeVec.append(tensor.dtype)
shapeVec.append(tensor.shape)
attr_code = f"""
op.jt_name = "splitwithsize";
auto *attr = new SplitWithSizeAttr();
attr->splitSize = {{ {", ".join(map(str, offset))} }};
attr->dim = {axis};
op.op_attr.reset(attr);
"""
result = concat_cmd("SplitWithSize", [grad_output],
output_dtypes=dtypeVec,
output_shapes=shapeVec,
attr_code=attr_code)
return result
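
A short usage sketch for ConcatACL, assuming an Ascend/ACL device is active:

import jittor as jt

concat = ConcatACL()
a = jt.rand(2, 3)
b = jt.rand(2, 5)
y = concat([a, b], 1)              # (2, 8)
ga, gb = jt.grad(y.sum(), [a, b])  # SplitWithSize splits the grad back to (2, 3) and (2, 5)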


@ -0,0 +1,89 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "concat_op_acl.h"
namespace jittor
{
ConcatOpRunner::ConcatOpRunner() : BaseOpRunner("Concat")
{
}
void ConcatOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto input_num = in_.size();
std::vector<aclTensor *> concatTensorList = {};
for (int i = 0; i < input_num; i++)
{
concatTensorList.push_back(inputTensors[i]);
}
auto concatTensorListInput = aclCreateTensorList(&concatTensorList[0], input_num);
auto attr = dynamic_cast<ConcatAttr *>(op_attr.get());
ret = aclnnCatGetWorkspaceSize(concatTensorListInput, attr->dim, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnCat(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnCat failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
SplitWithSizeOpRunner::SplitWithSizeOpRunner() : BaseOpRunner("SplitWithSize")
{
}
void SplitWithSizeOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto output_num = out_.size();
auto attr = dynamic_cast<SplitWithSizeAttr *>(op_attr.get());
auto splitSize = aclCreateIntArray(attr->splitSize.data(), attr->splitSize.size());
auto tensorList = aclCreateTensorList(&outputTensors[0], output_num);
ret = aclnnSplitWithSizeGetWorkspaceSize(inputTensors[0], splitSize, attr->dim, tensorList, &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnSplitWithSize(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSplitWithSize failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}


@ -0,0 +1,26 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class ConcatOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
ConcatOpRunner();
};
class SplitWithSizeOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
SplitWithSizeOpRunner();
};
}


@ -0,0 +1,160 @@
import os
import jittor_utils
from jittor_utils import env_or_try_find
import ctypes
import glob
import jittor as jt
import jittor.compiler as compiler
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def _ntuple(n):
def parse(x):
if isinstance(x, Iterable):
return x
return tuple([x] * n)
return parse
_pair = _ntuple(2)
def conv_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class ConvACL(jt.Function):
def execute(self,
x,
weight,
bias=None,
stride=1,
padding=0,
dilation=1,
groups=1):
self.input = x
self.weight = weight
self.bias = bias
padding = _pair(padding)
stride = _pair(stride)
dilation = _pair(dilation)
out_channels = weight.shape[0]
if groups <= 0:
raise ValueError("groups must be a positive integer")
self.padding = padding
self.stride = stride
self.dilation = dilation
self.groups = groups
attr_code = f"""
op.jt_name = "conv2d";
ConvAttr *attr = new ConvAttr();
attr->convStrides = {{ {stride[0]}, {stride[1]} }};
attr->convPads = {{ {padding[0]}, {padding[1]} }};
attr->convDilations = {{ {dilation[0]}, {dilation[1]} }};
attr->group = {groups};
attr->convOutPads = {{1,1}};
op.op_attr.reset(attr);
"""
input_height, input_width = x.shape[-2:]
kernel_height, kernel_width = weight.shape[-2:]
output_height = (input_height + 2 * padding[0] - dilation[0] *
(kernel_height - 1) - 1) // stride[0] + 1
output_width = (input_width + 2 * padding[1] - dilation[1] *
(kernel_width - 1) - 1) // stride[1] + 1
output_shape = (x.shape[0], out_channels, output_height, output_width)
inputs = [x, weight]
if bias is not None:
inputs.append(bias)
result = conv_cmd(
"Conv2d",
inputs,
output_dtypes=[x.dtype],
output_shapes=[output_shape],
attr_code=attr_code,
)[0]
return result
def grad(self, grad_output):
x = self.input
weight = self.weight
bias = self.bias
inputs = [grad_output, x, weight]
if bias is not None:
inputs.append(bias)
output_shapes = [x.shape, weight.shape]
output_dtypes = [x.dtype, weight.dtype]
if bias is not None:
output_shapes.append(bias.shape)
output_dtypes.append(bias.dtype)
else:
output_shapes.append([weight.shape[0]])
output_dtypes.append(x.dtype)
padding = self.padding
stride = self.stride
dilation = self.dilation
groups = self.groups
attr_code = f"""
op.jt_name = "conv2dbackward";
ConvAttr *attr = new ConvAttr();
attr->convStrides = {{ {stride[0]}, {stride[1]} }};
attr->convPads = {{ {padding[0]}, {padding[1]} }};
attr->convDilations = {{ {dilation[0]}, {dilation[1]} }};
attr->group = {groups};
attr->convOutPads = {{ 1,1}};
op.op_attr.reset(attr);
"""
results = conv_cmd("Conv2dBackward",
inputs,
output_dtypes=output_dtypes,
output_shapes=output_shapes,
attr_code=attr_code)
if self.bias is None:
return results[0], results[1]
return results
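
A short usage sketch for ConvACL (positional arguments, since jt.Function forwards *args), assuming an Ascend/ACL device is active:

import jittor as jt

conv = ConvACL()
x = jt.rand(1, 3, 8, 8)
w = jt.rand(16, 3, 3, 3)
b = jt.rand(16)
y = conv(x, w, b, 1, 1)                   # stride=1, padding=1 -> (1, 16, 8, 8)
gx, gw, gb = jt.grad(y.sum(), [x, w, b])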


@ -0,0 +1,152 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "conv_op_acl.h"
namespace jittor
{
Conv2dOpRunner::Conv2dOpRunner() : BaseOpRunner("Conv2d")
{
use_nchw = true;
}
void Conv2dOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
// for conv
aclIntArray *strides = nullptr;
aclIntArray *pads = nullptr;
aclIntArray *outPads = nullptr;
aclIntArray *dilations = nullptr;
auto attr = dynamic_cast<ConvAttr *>(op_attr.get());
strides = aclCreateIntArray(attr->convStrides.data(), 2);
pads = aclCreateIntArray(attr->convPads.data(), 2);
outPads = aclCreateIntArray(attr->convOutPads.data(), 2);
dilations = aclCreateIntArray(attr->convDilations.data(), 2);
aclTensor *bias = nullptr;
auto input_num = in_.size();
if (input_num == 3)
bias = inputTensors[2];
ret = aclnnConvolutionGetWorkspaceSize(inputTensors[0], inputTensors[1], bias, strides, pads, dilations, false, outPads, attr->group, outputTensors[0], 0, &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnConvolution(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnConvolution failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
aclDestroyIntArray(strides);
aclDestroyIntArray(pads);
aclDestroyIntArray(outPads);
aclDestroyIntArray(dilations);
return;
}
Conv2dBackwardOpRunner::Conv2dBackwardOpRunner() : BaseOpRunner("Conv2dBackward")
{
use_nchw = true;
}
void Conv2dBackwardOpRunner::setupOutputDesc()
{
auto output_num = out_.size();
for (int output_idx = 0; output_idx < output_num; output_idx++)
{
std::vector<int64_t> shape;
for (int j = 0; j < out_[output_idx]->shape.size(); j++)
{
shape.push_back(out_[output_idx]->shape[j]);
}
outputShapes.push_back(shape);
}
for (int idx = 0; idx < 2; idx++)
{
outputTensors.push_back(nullptr);
auto ret = CreateAclTensor(outputShapes[idx], out_[idx]->mem_ptr, out_[idx]->size, get_dtype(out_[idx]->dtype()), &outputTensors[idx], use_nchw);
CHECK_RET(ret == ACL_SUCCESS, return);
}
// biasgrad nd format
{
outputTensors.push_back(nullptr);
auto ret = CreateAclTensor(outputShapes[2], out_[2]->mem_ptr, out_[2]->size, get_dtype(out_[2]->dtype()), &outputTensors[2], false);
CHECK_RET(ret == ACL_SUCCESS, return);
}
}
void Conv2dBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
// for conv
aclIntArray *strides = nullptr;
aclIntArray *pads = nullptr;
aclIntArray *outPads = nullptr;
aclIntArray *dilations = nullptr;
auto attr = dynamic_cast<ConvAttr *>(op_attr.get());
strides = aclCreateIntArray(attr->convStrides.data(), 2);
pads = aclCreateIntArray(attr->convPads.data(), 2);
outPads = aclCreateIntArray(attr->convOutPads.data(), 2);
dilations = aclCreateIntArray(attr->convDilations.data(), 2);
bool outputMask[3] = {true, true, true};
auto input_num = in_.size();
if (input_num == 3)
{
outputMask[2] = false;
}
aclBoolArray *outMask = aclCreateBoolArray(outputMask, 3);
auto biasSizes = aclCreateIntArray(&outputShapes[2][0], outputShapes[2].size());
ret = aclnnConvolutionBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], biasSizes, strides, pads, dilations, false, outPads, attr->group, outMask, 0, outputTensors[0], outputTensors[1], outputTensors[2], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnConvolutionBackward(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnConvolutionBackward failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
aclDestroyIntArray(strides);
aclDestroyIntArray(pads);
aclDestroyIntArray(outPads);
aclDestroyIntArray(dilations);
return;
}
}


@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class Conv2dOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
Conv2dOpRunner();
};
class Conv2dBackwardOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
void setupOutputDesc() override;
public:
Conv2dBackwardOpRunner();
};
}


@ -0,0 +1,101 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def cumsum_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class CumsumACL(jt.Function):
def __init__(self):
super(CumsumACL, self).__init__()
def execute(self, input, dim=-1):
self.dim = dim
attr_code = f"""
op.jt_name = "cumsum";
GatherAttr *attr = new GatherAttr();
attr->dim = {dim};
op.op_attr.reset(attr);
"""
result = cumsum_cmd("Cumsum", [input],
output_dtypes=[input.dtype],
output_shapes=[input.shape],
attr_code=attr_code)[0]
return result
def grad(self, grad_output):
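        # The Jacobian of cumsum is triangular ones, so
        # grad_input[i] = sum_{j >= i} grad_output[j] = flip(cumsum(flip(g))).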
cumsum_attr_code = f"""
op.jt_name = "cumsum";
GatherAttr *attr = new GatherAttr();
attr->dim = {self.dim};
op.op_attr.reset(attr);
"""
flip_attr_code = f"""
op.jt_name = "flip";
ReduceAttr *attr = new ReduceAttr();
attr->axes = {{{self.dim}}};
attr->prod_dim = {{{1}}};
op.op_attr.reset(attr);
"""
flipped_grad_output = cumsum_cmd("Flip", [grad_output],
output_dtypes=[grad_output.dtype],
output_shapes=[grad_output.shape],
attr_code=flip_attr_code)[0]
cumulative_grad = cumsum_cmd("Cumsum", [flipped_grad_output],
output_dtypes=[grad_output.dtype],
output_shapes=[grad_output.shape],
attr_code=cumsum_attr_code)[0]
grad_input = cumsum_cmd("Flip", [cumulative_grad],
output_dtypes=[grad_output.dtype],
output_shapes=[grad_output.shape],
attr_code=flip_attr_code)[0]
return grad_input
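
A short usage sketch for CumsumACL, assuming an Ascend/ACL device is active:

import jittor as jt

cumsum = CumsumACL()
x = jt.rand(3, 4)
y = cumsum(x, 1)                 # row-wise prefix sums
(gx,) = jt.grad(y.sum(), [x])    # gx[i, j] == 4 - j (number of outputs each x[i, j] feeds)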


@ -0,0 +1,57 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "cumsum_op_acl.h"
namespace jittor
{
CumsumOpRunner::CumsumOpRunner() : BaseOpRunner("Cumsum")
{
}
void CumsumOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<GatherAttr *>(op_attr.get());
ret = aclnnCumsumGetWorkspaceSize(inputTensors[0], attr->dim, get_dtype(out_[0]->dtype()), outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnCumsum(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnCumsum failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}


@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class CumsumOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
CumsumOpRunner();
};
}


@ -0,0 +1,94 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def dropout_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class DropoutACL(jt.Function):
def __init__(self):
super(DropoutACL, self).__init__()
def execute(self, x, p=0.5, is_train=False):
self.input = x
num_elements = x.numel()
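        # The ACL dropout mask is a bitmask (1 bit per element): pad the element
        # count up to a multiple of 128, then store it as uint8 bytes (hence //8).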
aligned_elements = (num_elements + 127) // 128 * 128
mask_shape = (aligned_elements // 8, )
attr_code = f"""
op.jt_name = "dropout";
DropoutAttr *attr = new DropoutAttr();
attr->p = {p};
attr->train = {"true" if is_train else "false"};
attr->seed = 0;
attr->offset = 0;
op.op_attr.reset(attr);
"""
result = dropout_cmd("Dropout", [x],
output_dtypes=[x.dtype, "uint8"],
output_shapes=[x.shape, mask_shape],
attr_code=attr_code)
self.maskout = result[1]
return result[0]
def grad(self, grad_output):
attr_code = f"""
op.jt_name = "dropoutbackward";
DropoutAttr *attr = new DropoutAttr();
attr->scale = 1.0;
op.op_attr.reset(attr);
"""
grad_input = dropout_cmd("DropoutBackward",
[grad_output, self.maskout],
output_dtypes=[grad_output.dtype],
output_shapes=[grad_output.shape],
attr_code=attr_code)[0]
return grad_input
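
A short usage sketch for DropoutACL, assuming an Ascend/ACL device is active:

import jittor as jt

dropout = DropoutACL()
x = jt.rand(16, 32)
y = dropout(x, 0.5, True)        # training mode: roughly half the entries zeroed
(gx,) = jt.grad(y.sum(), [x])    # backward reuses the saved bitmask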


@ -0,0 +1,82 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "dropout_op_acl.h"
namespace jittor
{
DropoutOpRunner::DropoutOpRunner() : BaseOpRunner("Dropout")
{
}
void DropoutOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<DropoutAttr *>(op_attr.get());
ret = aclnnDropoutGetWorkspaceSize(inputTensors[0], attr->p, attr->train, attr->seed, attr->offset, outputTensors[0], outputTensors[1], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnDropout(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnDropout failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
DropoutBackwardOpRunner::DropoutBackwardOpRunner() : BaseOpRunner("DropoutBackward")
{
}
void DropoutBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<DropoutAttr *>(op_attr.get());
ret = aclnnDropoutBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], attr->scale, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnDropoutBackward(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnDropoutBackward failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}


@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class DropoutOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
DropoutOpRunner();
};
class DropoutBackwardOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
DropoutBackwardOpRunner();
};
}


@ -0,0 +1,91 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def embedding_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class EmbeddingACL(jt.Function):
def __init__(self):
super(EmbeddingACL, self).__init__()
def execute(
self,
indices,
weight,
):
inputs = [weight, indices]
self.indices = indices
self.weight_shape = weight.shape
output_shape = list(indices.shape) + list(weight.shape[1:])
outputs = [jt.empty(output_shape, weight.dtype)]
attr_code = f"""
op.jt_name = "embedding";
"""
result = embedding_cmd("Embedding",
inputs=inputs,
outputs=outputs,
attr_code=attr_code)[0]
return result
def grad(self, grad_output):
inputs = [grad_output, self.indices]
outputs = [jt.empty(self.weight_shape, grad_output.dtype)]
attr_code = f"""
op.jt_name = "embeddingbackward";
EmbeddingAttr *attr = new EmbeddingAttr();
attr->numEmbeddings = {self.weight_shape[0]};
op.op_attr.reset(attr);
"""
grad_weight = embedding_cmd("EmbeddingBackward",
inputs=inputs,
outputs=outputs,
attr_code=attr_code)[0]
return None, grad_weight
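
A short usage sketch for EmbeddingACL, assuming an Ascend/ACL device is active (integer indices are stop-grad, so only the weight receives a gradient):

import jittor as jt

embedding = EmbeddingACL()
weight = jt.rand(10, 4)                  # 10 embeddings of dim 4
indices = jt.array([1, 3, 3]).int32()
out = embedding(indices, weight)         # (3, 4)
(gw,) = jt.grad(out.sum(), [weight])     # row 3 accumulates twice, row 1 once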


@ -0,0 +1,82 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "embedding_op_acl.h"
namespace jittor
{
EmbeddingOpRunner::EmbeddingOpRunner() : BaseOpRunner("Embedding")
{
}
void EmbeddingOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
ret = aclnnEmbeddingGetWorkspaceSize(inputTensors[0], inputTensors[1], outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnEmbedding(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnEmbedding failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
EmbeddingBackwardOpRunner::EmbeddingBackwardOpRunner() : BaseOpRunner("EmbeddingBackward")
{
}
void EmbeddingBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<EmbeddingAttr *>(op_attr.get());
auto numEmbeddings = attr->numEmbeddings;
ret = aclnnEmbeddingDenseBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], numEmbeddings, 0, false, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnEmbeddingDenseBackward(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnEmbeddingDenseBackward failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}


@ -0,0 +1,25 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class EmbeddingOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
EmbeddingOpRunner();
};
class EmbeddingBackwardOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
EmbeddingBackwardOpRunner();
};
}


@ -0,0 +1,58 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "expand_op_acl.h"
namespace jittor
{
ExpandOpRunner::ExpandOpRunner() : BaseOpRunner("ternary")
{
use_nchw = false;
}
void ExpandOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
aclIntArray *size = nullptr;
size = aclCreateIntArray(&outputShapes[0][0], outputShapes[0].size());
ret = aclnnExpandGetWorkspaceSize(inputTensors[0], size, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnExpand(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnExpand failed. ERROR: %d\n", name.c_str(), ret); return);
aclDestroyIntArray(size);
return;
}
}


@ -0,0 +1,14 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
struct ExpandOpRunner : public BaseOpRunner
{
ExpandOpRunner();
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
};
}


@ -0,0 +1,209 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def flashattention_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class FlashAttentionACL(jt.Function):
def __init__(self,
headnum,
layout="BNSD",
prefix=None,
qstart=None,
kvstart=None,
scale=1.0,
prob=1.0,
pretokens=2147483647,
nexttokens=2147483647,
innerprecise=0,
sparsemode=0,
psetype=1):
self.headnum = headnum
self.layout = layout
self.scale = scale
self.prob = prob
self.pretokens = pretokens
self.nexttokens = nexttokens
self.innerprecise = innerprecise
self.sparsemode = sparsemode
self.psetype = psetype
self.prefix = prefix
self.qstart = qstart
self.kvstart = kvstart
def execute(
self,
q,
k,
v,
realshift=None,
dropMask=None,
paddingMask=None,
attenMask=None,
):
if self.layout == 'BSH':
B, SQ, H = q.shape
SKV = k.shape[1]
N = self.headnum
            D = H // N
elif self.layout == 'SBH':
SQ, B, H = q.shape
SKV = k.shape[0]
N = self.headnum
            D = H // N
elif self.layout == 'BSND':
B, SQ, N, D = q.shape
SKV = k.shape[1]
elif self.layout == 'BNSD':
B, N, SQ, D = q.shape
SKV = k.shape[2]
else:
raise ValueError(f"got invalid input layout {self.layout}")
output_shape = (B, N, SQ, 8)
self.q = q
self.k = k
self.v = v
self.prefix = self.prefix if self.prefix else [0 for _ in range(B)]
self.qstart = self.qstart if self.qstart else [0 for _ in range(B)]
self.kvstart = self.kvstart if self.kvstart else [0 for _ in range(B)]
        self.hasRealshift = realshift is not None
        self.hasDropmask = dropMask is not None
        self.hasPaddingmask = paddingMask is not None
        self.hasAttenmask = attenMask is not None
        # Placeholders for optional masks; currently passed as nullptr on the C++ side.
        self.realshift = realshift if realshift is not None else jt.zeros(B, N, SQ, SKV)
        self.dropMask = dropMask if dropMask is not None else jt.ones(B, N, SQ, SKV)
        self.paddingMask = paddingMask if paddingMask is not None else jt.zeros(
            B, N, SQ, SKV)
        self.attenMask = attenMask if attenMask is not None else jt.zeros(SQ, SKV)
attr_code = f"""
op.jt_name = "flashattention";
FlashAttentionAttr *attr = new FlashAttentionAttr();
attr->scale = {self.scale};
attr->keepProb = {self.prob};
attr->preToken = {self.pretokens};
attr->nextToken = {self.nexttokens};
attr->headNum = {self.headnum};
attr->inputLayout = "{self.layout}";
attr->innerPrecise = {self.innerprecise};
attr->sparseMode = {self.sparsemode};
attr->psetype = {self.psetype};
attr->prefix = {{ {", ".join(map(str, self.prefix))} }};
attr->qStartIdx = {{ {", ".join(map(str, self.qstart))} }};
attr->kvStartIdx = {{ {", ".join(map(str, self.kvstart))} }};
attr->hasRealshift = {"true" if self.hasRealshift else "false"};
attr->hasDropmask = {"true" if self.hasDropmask else "false"};
attr->hasPaddingmask = {"true" if self.hasPaddingmask else "false"};
attr->hasAttentmask = {"true" if self.hasAttenmask else "false"};
op.op_attr.reset(attr);
"""
inputs = [
q, k, v, self.realshift, self.dropMask, self.paddingMask,
self.attenMask
]
result = flashattention_cmd(
"FlashAttention",
inputs,
output_dtypes=["float", "float", q.dtype],
output_shapes=[output_shape, output_shape, q.shape],
attr_code=attr_code)
self.maxout = result[0]
self.sumout = result[1]
self.attenout = result[2]
return self.attenout
def grad(self, dy):
attr_code = f"""
op.jt_name = "flashattentionbackward";
FlashAttentionAttr *attr = new FlashAttentionAttr();
attr->scale = {self.scale};
attr->keepProb = {self.prob};
attr->preToken = {self.pretokens};
attr->nextToken = {self.nexttokens};
attr->headNum = {self.headnum};
attr->inputLayout = "{self.layout}";
attr->innerPrecise = {self.innerprecise};
attr->sparseMode = {self.sparsemode};
attr->psetype = {self.psetype};
attr->prefix = {{ {", ".join(map(str, self.prefix))} }};
attr->qStartIdx = {{ {", ".join(map(str, self.qstart))} }};
attr->kvStartIdx = {{ {", ".join(map(str, self.kvstart))} }};
attr->hasRealshift = {"true" if self.hasRealshift else "false"};
attr->hasDropmask = {"true" if self.hasDropmask else "false"};
attr->hasPaddingmask = {"true" if self.hasPaddingmask else "false"};
attr->hasAttentmask = {"true" if self.hasAttenmask else "false"};
op.op_attr.reset(attr);
"""
inputs = [
self.q, self.k, self.v, dy, self.realshift, self.dropMask,
self.paddingMask, self.attenMask, self.maxout, self.sumout,
self.attenout
]
result = flashattention_cmd(
"FlashAttentionBackward",
inputs,
output_dtypes=[self.q.dtype, self.k.dtype, self.v.dtype],
output_shapes=[self.q.shape, self.k.shape, self.v.shape],
attr_code=attr_code)
return result
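
# Minimal usage sketch (not part of the op implementation): assumes an Ascend
# device with the ACL backend enabled; shapes follow the BNSD layout handled
# above.
if __name__ == "__main__":
    B, N, SQ, SKV, D = 2, 4, 16, 16, 32
    q = jt.randn(B, N, SQ, D)
    k = jt.randn(B, N, SKV, D)
    v = jt.randn(B, N, SKV, D)
    attn = FlashAttentionACL(headnum=N, layout="BNSD", scale=1.0 / math.sqrt(D))
    out = attn(q, k, v)                      # (B, N, SQ, D)
    gq, gk, gv = jt.grad(out.sum(), [q, k, v])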

View File

@ -0,0 +1,88 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "flashattention_op_acl.h"
namespace jittor
{
FlashAttentionOpRunner::FlashAttentionOpRunner() : BaseOpRunner("FlashAttention")
{
}
void FlashAttentionOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<FlashAttentionAttr *>(op_attr.get());
auto prefix = aclCreateIntArray(attr->prefix.data(), attr->prefix.size());
auto qstart = aclCreateIntArray(attr->qStartIdx.data(), attr->qStartIdx.size());
auto kvstart = aclCreateIntArray(attr->kvStartIdx.data(), attr->kvStartIdx.size());
char *layout = const_cast<char *>(attr->inputLayout.data());
ret = aclnnFlashAttentionScoreV2GetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], attr->hasRealshift ? inputTensors[3] : nullptr, attr->hasDropmask ? inputTensors[4] : nullptr, nullptr, attr->hasAttentmask ? inputTensors[6] : nullptr, prefix, qstart, kvstart, attr->scale, attr->keepProb, attr->preToken, attr->nextToken, attr->headNum, layout, attr->innerPrecise, attr->sparseMode, attr->psetype, outputTensors[0], outputTensors[1], nullptr, outputTensors[2], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnFlashAttentionScoreV2(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnFlashAttentionScoreV2 failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
FlashAttentionBackwardOpRunner::FlashAttentionBackwardOpRunner() : BaseOpRunner("FlashAttentionBackward")
{
}
void FlashAttentionBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<FlashAttentionAttr *>(op_attr.get());
auto prefix = aclCreateIntArray(attr->prefix.data(), attr->prefix.size());
auto qstart = aclCreateIntArray(attr->qStartIdx.data(), attr->qStartIdx.size());
auto kvstart = aclCreateIntArray(attr->kvStartIdx.data(), attr->kvStartIdx.size());
char *layout = const_cast<char *>(attr->inputLayout.data());
ret = aclnnFlashAttentionScoreGradV2GetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], inputTensors[3], attr->hasRealshift ? inputTensors[4] : nullptr, attr->hasDropmask ? inputTensors[5] : nullptr, nullptr, attr->hasAttentmask ? inputTensors[7] : nullptr, inputTensors[8], inputTensors[9], nullptr, inputTensors[10], prefix, qstart, kvstart, attr->scale, attr->keepProb, attr->preToken, attr->nextToken, attr->headNum, layout, attr->innerPrecise, attr->sparseMode, attr->psetype, outputTensors[0], outputTensors[1], outputTensors[2], nullptr, &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnFlashAttentionScoreGradV2(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnFlashAttentionScoreGradV2 failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}

View File

@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class FlashAttentionOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
FlashAttentionOpRunner();
};
class FlashAttentionBackwardOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
FlashAttentionBackwardOpRunner();
};
}

View File

@ -0,0 +1,85 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def flip_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class FlipACL(jt.Function):
def __init__(self):
super(FlipACL, self).__init__()
def execute(self, input, dim):
if type(dim) is tuple:
dim = list(dim)
if type(dim) is not list:
dim = [dim]
attr_code = f"""
op.jt_name = "flip";
ReduceAttr *attr = new ReduceAttr();
attr->axes = {{{', '.join(map(str, (list(dim))))}}};
attr->prod_dim = {len(dim)};
op.op_attr.reset(attr);
"""
self.attr_code = attr_code
result = flip_cmd("Flip", [input],
output_dtypes=[input.dtype],
output_shapes=[input.shape],
attr_code=self.attr_code)[0]
return result
def grad(self, grad_output):
grad_input = flip_cmd("Flip", [grad_output],
output_dtypes=[grad_output.dtype],
output_shapes=[grad_output.shape],
attr_code=self.attr_code)[0]
return grad_input
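
# Minimal usage sketch (assumes the ACL backend is active): the backward pass
# simply flips the incoming gradient along the same dims.
if __name__ == "__main__":
    x = jt.array(np.arange(6).reshape(2, 3).astype(np.float32))
    y = FlipACL()(x, (0, 1))                 # reverse both axes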

View File

@ -0,0 +1,58 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "flip_op_acl.h"
namespace jittor
{
FlipOpRunner::FlipOpRunner() : BaseOpRunner("Flip")
{
}
void FlipOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<ReduceAttr *>(op_attr.get());
auto dim = aclCreateIntArray(attr->axes.data(), attr->axes.size());
ret = aclnnFlipGetWorkspaceSize(inputTensors[0], dim, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnFlip(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnFlip failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}

View File

@ -0,0 +1,16 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class FlipOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
FlipOpRunner();
};
}

View File

@ -0,0 +1,70 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def floor_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class FloorIntACL(jt.Function):
def __init__(self):
super(FloorIntACL, self).__init__()
def execute(self, input):
self.shape = input.shape
result = floor_cmd("Floor", [input],
output_dtypes=[input.dtype],
output_shapes=[input.shape],
attr_code="op.jt_name=\"floor\";")[0]
return result
def grad(self, grad_output):
return jt.zeros(self.shape, dtype=grad_output.dtype)
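
# Minimal usage sketch (assumes the ACL backend is active): floor is
# piecewise-constant, hence the all-zero gradient above.
if __name__ == "__main__":
    x = jt.array(np.array([-1.5, 0.3, 2.7], dtype=np.float32))
    y = FloorIntACL()(x)                     # [-2., 0., 2.]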

View File

@ -0,0 +1,56 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "floor_op_acl.h"
namespace jittor
{
FloorOpRunner::FloorOpRunner() : BaseOpRunner("Floor")
{
}
void FloorOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
ret = aclnnFloorGetWorkspaceSize(inputTensors[0], outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnFloor(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnFloor failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}

View File

@ -0,0 +1,16 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class FloorOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
FloorOpRunner();
};
}

View File

@ -0,0 +1,126 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def gather_scatter_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class GatherACL(jt.Function):
def __init__(self):
super(GatherACL, self).__init__()
def execute(self, input, dim, index):
self.dim = dim
self.index = index
attr_code = f"""
op.jt_name = "gather";
GatherAttr *attr = new GatherAttr();
attr->dim = {dim};
op.op_attr.reset(attr);
"""
result = gather_scatter_cmd("Gather", [input, index],
output_dtypes=[input.dtype],
output_shapes=[index.shape],
attr_code=attr_code)[0]
return result
def grad(self, grad_output):
tmp = jt.zeros(self.index.shape, dtype=grad_output.dtype)
attr_code = f"""
op.jt_name = "scatter";
ScatterAttr *attr = new ScatterAttr();
attr->axis = {self.dim};
attr->reduction = {1};
op.op_attr.reset(attr);
"""
grad_input = gather_scatter_cmd("Scatter",
[tmp, self.index, grad_output],
output_dtypes=[grad_output.dtype],
output_shapes=[tmp.shape],
attr_code=attr_code)[0]
return grad_input
class ScatterACL(jt.Function):
def __init__(self):
super(ScatterACL, self).__init__()
def execute(self, input, dim, index, src, reduce='void'):
self.dim = dim
self.index = index
self.reduce = reduce
attr_code = f"""
op.jt_name = "scatter";
ScatterAttr *attr = new ScatterAttr();
attr->axis = {dim};
attr->reduction = {1 if reduce == 'add' else 2 if reduce == 'mul' else 0};
op.op_attr.reset(attr);
"""
result = gather_scatter_cmd("Scatter", [input, self.index, src],
output_dtypes=[input.dtype],
output_shapes=[input.shape],
attr_code=attr_code)[0]
return result
def grad(self, grad_output):
attr_code = f"""
op.jt_name = "gather";
GatherAttr *attr = new GatherAttr();
attr->dim = {self.dim};
op.op_attr.reset(attr);
"""
grad_input = gather_scatter_cmd("Gather", [grad_output, self.index],
output_dtypes=[grad_output.dtype],
output_shapes=[self.index.shape],
attr_code=attr_code)[0]
return grad_output, None, None, grad_input
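
# Minimal usage sketch (assumes the ACL backend is active): gather reads
# values along dim via index, and scatter is its adjoint, which is why each
# grad() above dispatches to the opposite op.
if __name__ == "__main__":
    x = jt.array(np.array([[1., 2.], [3., 4.]], dtype=np.float32))
    idx = jt.array(np.array([[0, 0], [1, 0]], dtype=np.int32))
    g = GatherACL()(x, 1, idx)               # [[1., 1.], [4., 3.]]
    s = ScatterACL()(jt.zeros((2, 2)), 1, idx, g, 'add')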

View File

@ -0,0 +1,80 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "gather_scatter_op_acl.h"
namespace jittor
{
GatherOpRunner::GatherOpRunner() : BaseOpRunner("Gather")
{
}
void GatherOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<GatherAttr *>(op_attr.get());
ret = aclnnGatherGetWorkspaceSize(inputTensors[0], attr->dim, inputTensors[1], outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnGather(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnGather failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
ScatterOpRunner::ScatterOpRunner() : BaseOpRunner("Scatter")
{
}
void ScatterOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<ScatterAttr *>(op_attr.get());
ret = aclnnScatterGetWorkspaceSize(inputTensors[0], attr->axis, inputTensors[1], inputTensors[2], attr->reduction, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnScatter(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnScatter failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}

View File

@ -0,0 +1,26 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class GatherOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
GatherOpRunner();
};
class ScatterOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
ScatterOpRunner();
};
}

View File

@ -0,0 +1,419 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def getitem_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
def getitem_forward(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None,
extra_data: dict = {}):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
op.add(out0, false);
{attr_code}
op.run();""",
data=extra_data)
def calculate_shape(tensors):
    if isinstance(tensors, jt.Var):
        return list(tensors.shape)
    elif isinstance(tensors, (int, float)):
        return []
    elif isinstance(tensors, (list, tuple)):
        sub_shape = calculate_shape(tensors[0])
        return [len(tensors)] + sub_shape
    else:
        assert False, f"not implemented for {type(tensors)}"
def can_broadcast_and_shape(shape1, shape2):
"""
检查两个张量是否可以广播并返回广播后的形状
参数:
- shape1: 第一个张量的形状tuple list
- shape2: 第二个张量的形状tuple list
返回:
- can_broadcast: 布尔值表示是否可以广播
- broadcast_shape: 如果可以广播返回广播后的形状否则返回 None
"""
    # Convert the shapes to tuples in case the inputs are lists
shape1 = tuple(shape1)
shape2 = tuple(shape2)
    # Pad the shorter shape with leading 1s so both have the same length
len1, len2 = len(shape1), len(shape2)
if len1 < len2:
shape1 = (1, ) * (len2 - len1) + shape1
elif len2 < len1:
shape2 = (1, ) * (len1 - len2) + shape2
broadcast_shape = []
    # Check each dimension, starting from the last
for dim1, dim2 in zip(shape1, shape2):
if dim1 == dim2:
broadcast_shape.append(dim1)
elif dim1 == 1:
broadcast_shape.append(dim2)
elif dim2 == 1:
broadcast_shape.append(dim1)
else:
            # Incompatible along this dimension: not broadcastable
return False, None
return True, tuple(broadcast_shape)
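
# Example (pure Python, no device needed):
#   can_broadcast_and_shape((3, 1), (1, 4))  -> (True, (3, 4))
#   can_broadcast_and_shape((2, 3), (4, 3))  -> (False, None)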
class GetItemACL(jt.Function):
def __init__(self):
self.type_ = 'notype'
def stride(self, x, dim):
stride = 1
for i in range(dim + 1, len(x.shape)):
stride *= x.shape[i]
return stride
def execute(self, x, slices, return_x=None):
if isinstance(slices, jt.Var) and slices.dtype == 'bool':
            # TODO: optimize this boolean-mask path
assert x.shape == slices.shape, "shape not match"
output_len = slices.sum().item()
# output = jt.empty((output_len,),dtype=x.dtype)
x_len = x.numel()
output = jt.empty((x_len), dtype=x.dtype)
outputs = [output]
inputs = [x, slices]
self.mask = slices
self.type_ = 'mask'
attr_code = f"""
op.jt_name = "maskedselect";
"""
result = getitem_cmd("MaskedSelect",
inputs=inputs,
outputs=outputs,
attr_code=attr_code)[0]
result = result[:output_len]
result.sync()
return result
self.x_shape = x.shape
if not isinstance(slices, tuple):
slices = (slices, )
slices = list(slices)
for i, s in enumerate(slices):
if isinstance(s, int) and s < 0:
slices[i] = s + x.shape[i]
slices = tuple(slices)
slices_list = list(slices)
        # check whether slices contains any slice or Ellipsis entries
contains_slice = False
for s in slices:
if not isinstance(s, jt.Var) and (isinstance(s, slice)
or s == Ellipsis):
contains_slice = True
break
if not contains_slice:
indices = []
output_shape = []
            slices_len = len(slices)
            broadcast_shape = calculate_shape(slices_list[0])
            for ii in range(1, len(slices)):
                ok, broadcast_shape = can_broadcast_and_shape(
                    broadcast_shape, calculate_shape(slices_list[ii]))
                assert ok, "can not broadcast"
            output_shape = list(broadcast_shape) + list(x.shape[slices_len:])
            if output_shape == []:
                output_shape = [1]
for ii in slices:
indices.append(jt.Var(ii).int32())
if isinstance(slices[0],
jt.Var) or isinstance(slices[0], int) or isinstance(
slices[0], list) or isinstance(slices[0], tuple):
self.indices = indices
inputs = [x] + indices
attr_code = f"""
op.jt_name = "index";
"""
self.type_ = 'index'
result = getitem_cmd("Index",
inputs=inputs,
output_dtypes=[x.dtype],
output_shapes=[output_shape],
attr_code=attr_code)[0]
result.sync()
return result
assert contains_slice, "slice type error"
x_dim = len(x.shape)
slices = list(slices)
for s in slices:
if not isinstance(s, jt.Var) and s == Ellipsis:
slices = slices[:slices.index(s)] + [
slice(None, None, None)
] * (x_dim - len(slices) + 1) + slices[slices.index(s) + 1:]
break
slices = tuple(slices)
if len(slices) < x_dim:
slices += (slice(None, None, None), ) * (x_dim - len(slices))
inputs = [x]
sizes = []
begins = []
ends = []
steps = []
dims = []
squeeze_dims = []
extra_data = {}
if len(slices):
extra_data["a"] = len(slices)
for dim, s in enumerate(slices):
if isinstance(s, int):
s = slice(s, s + 1, 1)
squeeze_dims.append(dim)
if isinstance(s, jt.Var):
assert False, "jt.Var not supported"
start, stop, step = s.indices(x.size(dim))
size = (stop - start - 1) // step + 1
# stride = self.stride(x, dim) * step
sizes.append(size)
extra_data[str(dim * 3)] = start
extra_data[str(dim * 3 + 1)] = stop
extra_data[str(dim * 3 + 2)] = step
steps.append(step)
begins.append(start)
ends.append(stop)
dims.append(dim)
else:
extra_data["a"] = -1
sizes = [1]
steps = [1]
self.type_ = 'slicev2'
# for backward
self.begins = begins
self.ends = ends
self.steps = steps
self.dims = dims
self.slices = slices
attr_code = """
op.jt_name = "slicev2";
StrideAttr *attr = new StrideAttr();
int slice_dim = data["a"];
if(slice_dim == -1) {
attr->begins = {};
attr->ends = {};
attr->steps = {1};
attr->axes = {};
} else {
vector<long int> begins;
vector<long int> ends;
vector<long int> steps;
vector<long int> dims;
for(int dim = 0; dim < slice_dim; dim++) {
dims.push_back(dim);
begins.push_back(data[std::to_string(dim*3)]);
ends.push_back(data[std::to_string(dim*3+1)]);
steps.push_back(data[std::to_string(dim*3+2)]);
}
attr->begins = begins;
attr->ends = ends;
attr->steps = steps;
attr->axes = dims;
}
op.op_attr.reset(attr);
"""
result = getitem_forward("SliceV2",
inputs,
output_dtypes=[x.dtype],
output_shapes=[jt.empty(sizes).shape],
attr_code=attr_code,
extra_data=extra_data)[0]
self.squeeze_dims = squeeze_dims
for dim in squeeze_dims[::-1]:
result = jt.squeeze(result, dim)
result.sync()
return result
def grad(self, grad_output):
if self.type_ == 'index':
indices = self.indices
inputs = [grad_output] + indices
attr_code = f"""
op.jt_name = "indexputimplaccumulate";
"""
outputs = [jt.zeros(self.x_shape, dtype=grad_output.dtype)]
result = getitem_cmd("IndexPutImplAccumulate",
inputs=inputs,
outputs=outputs,
attr_code=attr_code)[0]
result.sync()
return result, None
elif self.type_ == 'slicev2':
begins = self.begins
ends = self.ends
steps = self.steps
dims = self.dims
slices = self.slices
            # The forward pass may have squeezed dims, so restore them here
for dim in self.squeeze_dims:
grad_output = jt.unsqueeze(grad_output, dim)
            # Work around the Ascend requirement that the last dimension's step must be 1
expand_dim = False
if isinstance(slices[-1], slice):
if slices[-1].step is not None and slices[-1].step != 1:
slices = slices + (slice(None, None, None), )
expand_dim = True
elif isinstance(slices[-1], int):
                # Note: the last slice entry is an integer index
slices = list(slices)
slices[-1] = slice(slices[-1], slices[-1] + 1, 1)
slices = tuple(slices)
slices = slices + (slice(None, None, None), )
expand_dim = True
else:
assert False, "not supported"
# x = x.unsqueeze(-1)
if expand_dim:
grad_output = grad_output.unsqueeze(-1)
self.x_shape = self.x_shape + (1, )
sizes = []
begins = []
ends = []
steps = []
dims = []
for dim, s in enumerate(slices):
if isinstance(s, int):
s = slice(s, s + 1, 1)
# squeeze_dims.append(dim)
if isinstance(s, jt.Var):
assert False, "jt.Var not supported"
start, stop, step = s.indices(self.x_shape[dim])
size = (stop - start - 1) // step + 1
# stride = self.stride(x, dim) * step
sizes.append(size)
steps.append(step)
begins.append(start)
ends.append(stop)
dims.append(dim)
if not sizes:
sizes = [1]
steps = [1]
attr_code = f"""
op.jt_name = "stridedsliceassignv2";
StrideAttr *attr = new StrideAttr();
attr->begins = {{ {", ".join(map(str, begins))} }};
attr->ends = {{ {", ".join(map(str, ends))} }};
attr->steps = {{ {", ".join(map(str, steps))} }};
attr->axes = {{ {", ".join(map(str, dims))} }};
op.op_attr.reset(attr);
"""
inputs = [grad_output]
outputs = [jt.zeros(self.x_shape, dtype=grad_output.dtype)]
result = getitem_cmd("StridedSliceAssignV2",
inputs=inputs,
outputs=outputs,
attr_code=attr_code)[0]
result.sync()
if expand_dim:
result = result.squeeze(-1)
return result, None
elif self.type_ == 'mask':
return self.mask.float()
pass
else:
assert False, f"grad not implemented for {self.type_}"

View File

@ -0,0 +1,165 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "getitem_op_acl.h"
namespace jittor
{
MaskedSelectOpRunner::MaskedSelectOpRunner() : BaseOpRunner("MaskedSelect")
{
}
void MaskedSelectOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
ret = aclnnMaskedSelectGetWorkspaceSize(inputTensors[0], inputTensors[1], outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnMaskedSelect(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMaskedSelect failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
IndexOpRunner::IndexOpRunner() : BaseOpRunner("Index")
{
}
void IndexOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto input_num = in_.size();
auto indexTensorList = aclCreateTensorList(&inputTensors[1], input_num - 1);
ret = aclnnIndexGetWorkspaceSize(inputTensors[0], indexTensorList, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnIndex(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnIndex failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
SliceV2OpRunner::SliceV2OpRunner() : BaseOpRunner("SliceV2")
{
}
void SliceV2OpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<StrideAttr *>(op_attr.get());
auto begins = aclCreateIntArray(attr->begins.data(), attr->begins.size());
auto ends = aclCreateIntArray(attr->ends.data(), attr->ends.size());
auto steps = aclCreateIntArray(attr->steps.data(), attr->steps.size());
auto axes = aclCreateIntArray(attr->axes.data(), attr->axes.size());
ret = aclnnSliceV2GetWorkspaceSize(inputTensors[0], begins, ends, axes, steps, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnSliceV2(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSliceV2 failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
IndexPutImplAccumulateOpRunner::IndexPutImplAccumulateOpRunner() : BaseOpRunner("IndexPutImplAccumulate")
{
}
void IndexPutImplAccumulateOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto input_num = in_.size();
std::vector<aclTensor *> indexTensorList = {};
for (int i = 1; i < input_num; i++)
{
indexTensorList.push_back(inputTensors[i]);
}
auto indexTensorListInput = aclCreateTensorList(&indexTensorList[0], input_num - 1);
ret = aclnnIndexPutImplGetWorkspaceSize(outputTensors[0], indexTensorListInput, inputTensors[0], true, true, &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnIndexPutImpl(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnIndexPutImpl failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
StridedSliceAssignV2OpRunner::StridedSliceAssignV2OpRunner() : BaseOpRunner("StridedSliceAssignV2")
{
}
void StridedSliceAssignV2OpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<StrideAttr *>(op_attr.get());
auto begins = aclCreateIntArray(attr->begins.data(), attr->begins.size());
auto ends = aclCreateIntArray(attr->ends.data(), attr->ends.size());
auto steps = aclCreateIntArray(attr->steps.data(), attr->steps.size());
auto axes = aclCreateIntArray(attr->axes.data(), attr->axes.size());
ret = aclnnStridedSliceAssignV2GetWorkspaceSize(outputTensors[0], inputTensors[0], begins, ends, steps, axes, &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnStridedSliceAssignV2(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnStridedSliceAssignV2 failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}

View File

@ -0,0 +1,57 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class MaskedSelectOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
MaskedSelectOpRunner();
};
class IndexOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
IndexOpRunner();
};
class SliceV2OpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
SliceV2OpRunner();
};
class IndexPutImplAccumulateOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
IndexPutImplAccumulateOpRunner();
};
class StridedSliceAssignV2OpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
StridedSliceAssignV2OpRunner();
};
}

View File

@ -0,0 +1,107 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def range_forward(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None,
extra_data: dict = {}):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
op.add(out0, false);
{attr_code}
op.run();""",
data=extra_data)
class IndexACL(jt.Function):
def __init__(self):
super(IndexACL, self).__init__()
def execute(self, inshape: list, dim=None, dtype="int32"):
        # build index grids over inshape; one grid per requested dim
        dim_input = dim
        if dim is None:
            dim = list(range(len(inshape)))
        elif isinstance(dim, int):
            dim = [dim]
results = []
extra_data = {}
extra_data["dim_count"] = len(dim)
for i, d in enumerate(dim):
max_len = inshape[d]
extra_data[f"dim_{i}_start"] = 0
extra_data[f"dim_{i}_end"] = max_len
extra_data[f"dim_{i}_step"] = 1
tmp = jt.zeros(max_len, dtype=dtype)
range_attr_code = f"""
op.jt_name = "range";
RangeAttr *attr = new RangeAttr();
attr->start = data["dim_{i}_start"];
attr->end = data["dim_{i}_end"];
attr->step = data["dim_{i}_step"];
op.op_attr.reset(attr);
"""
result = range_forward("Range", [],
output_dtypes=[tmp.dtype],
output_shapes=[tmp.shape],
attr_code=range_attr_code,
extra_data=extra_data)[0]
broadcast_dims = list(range(len(inshape)))
broadcast_dims.remove(d)
result = jt.broadcast(result, shape=inshape, dims=broadcast_dims)
results.append(result)
        if len(results) == 1 and dim_input is not None:
            return results[0]
        return tuple(results)
def grad(self, grad_output):
return grad_output
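
# Minimal usage sketch (assumes the ACL backend is active): with dim=None one
# index grid per dimension is returned, similar to jt.index.
if __name__ == "__main__":
    rows, cols = IndexACL()((2, 3), None, "int32")
    # rows = [[0, 0, 0], [1, 1, 1]], cols = [[0, 1, 2], [0, 1, 2]]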

View File

@ -0,0 +1,72 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "index_op_acl.h"
namespace jittor
{
RangeOpRunner::RangeOpRunner() : BaseOpRunner("Range")
{
}
void RangeOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
aclScalar *start = nullptr;
aclScalar *end = nullptr;
aclScalar *step = nullptr;
auto attr = dynamic_cast<RangeAttr *>(op_attr.get());
int64_t startValue = attr->start;
int64_t endValue = attr->end;
int64_t stepValue = attr->step;
start = aclCreateScalar(&startValue, aclDataType::ACL_INT64);
end = aclCreateScalar(&endValue, aclDataType::ACL_INT64);
step = aclCreateScalar(&stepValue, aclDataType::ACL_INT64);
ret = aclnnRangeGetWorkspaceSize(start, end, step, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnRange(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnRange failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
aclDestroyScalar(start);
aclDestroyScalar(end);
aclDestroyScalar(step);
return;
}
}

View File

@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class RangeOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
RangeOpRunner();
};
}

View File

@ -0,0 +1,130 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def matmul_forward(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None,
extra_data: dict = {}):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
MatMulOpRunner op;
{input_code}
op.add(out0, false);
{attr_code}
op.run();""",
data=extra_data)
class MatmulACL(jt.Function):
def __init__(self, trans_x2=False):
super(MatmulACL, self).__init__()
self.trans_x2 = trans_x2
def execute(self, x1, x2):
self.input = [x1, x2]
result = matmul_forward(
"MatMul", [x1, x2],
output_dtypes=[x1.dtype],
output_shapes=[
x1.shape[:-1] +
x2.shape[-2:-1] if self.trans_x2 else x1.shape[:-1] +
x2.shape[-1:]
],
attr_code="op.jt_name=\"matmul_trans_1\";"
if self.trans_x2 else "op.jt_name=\"matmul\";")[0]
return result
def grad(self, grad_output):
x1, x2 = self.input
        # collapse leading batch dims of grad_x2 when x1 and x2 have
        # different ranks (e.g. batched input times 2-D weight)
        reshape_grad_x2 = len(x1.shape) != len(x2.shape)
grad_x1 = matmul_forward(
"MatMul", [grad_output, x2],
output_dtypes=[x1.dtype],
output_shapes=[
grad_output.shape[:-1] + x2.shape[-2:-1] if not self.trans_x2
else grad_output.shape[:-1] + x2.shape[-1:]
],
attr_code="op.jt_name=\"matmul_trans_1\";"
if not self.trans_x2 else "op.jt_name=\"matmul\";")[0]
if self.trans_x2:
if reshape_grad_x2:
output_shape = grad_output.shape[1:-2] + grad_output.shape[
-1:] + x1.shape[-1:]
grad_x2 = matmul_forward(
"MatMul", [
grad_output.reshape(-1, grad_output.shape[-1]),
x1.reshape(-1, x1.shape[-1])
],
output_dtypes=[x2.dtype],
output_shapes=[output_shape],
attr_code="op.jt_name=\"matmul_trans_0\";")[0]
else:
output_shape = grad_output.shape[:-2] + grad_output.shape[
-1:] + x1.shape[-1:]
grad_x2 = matmul_forward(
"MatMul", [grad_output, x1],
output_dtypes=[x2.dtype],
output_shapes=[output_shape],
attr_code="op.jt_name=\"matmul_trans_0\";")[0]
else:
if reshape_grad_x2:
output_shape = x1.shape[1:-2] + x1.shape[
-1:] + grad_output.shape[-1:]
grad_x2 = matmul_forward(
"MatMul", [
x1.reshape(-1, x1.shape[-1]),
grad_output.reshape(-1, grad_output.shape[-1])
],
output_dtypes=[x2.dtype],
output_shapes=[output_shape],
attr_code="op.jt_name=\"matmul_trans_0\";")[0]
else:
output_shape = x1.shape[:-2] + x1.shape[
-1:] + grad_output.shape[-1:]
grad_x2 = matmul_forward(
"MatMul", [x1, grad_output],
output_dtypes=[x2.dtype],
output_shapes=[output_shape],
attr_code="op.jt_name=\"matmul_trans_0\";")[0]
return grad_x1, grad_x2
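
# Minimal usage sketch (assumes the ACL backend is active): trans_x2=True
# treats x2 as stored transposed, e.g. a linear-layer weight of shape
# (out_features, in_features).
if __name__ == "__main__":
    x = jt.randn(8, 16)
    w = jt.randn(32, 16)
    y = MatmulACL(trans_x2=True)(x, w)       # (8, 32)
    gx, gw = jt.grad(y.sum(), [x, w])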

View File

@ -0,0 +1,77 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "matmul_op_acl.h"
namespace jittor
{
MatMulOpRunner::MatMulOpRunner() : BaseOpRunner("MatMul")
{
}
void MatMulOpRunner::setupInputDesc()
{
auto input_num = in_.size();
for (int input_idx = 0; input_idx < input_num; input_idx++)
{
std::vector<int64_t> shape;
for (int j = 0; j < in_[input_idx]->shape.size(); j++)
{
shape.push_back(in_[input_idx]->shape[j]);
}
inputShapes.push_back(shape);
}
for (int idx = 0; idx < input_num; idx++)
{
inputTensors.push_back(nullptr);
if ((jt_name == "matmul_trans_1" && idx == 1) || (jt_name == "matmul_trans_0" && idx == 0))
{
auto ret = CreateFakeTransAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
CHECK_RET(ret == ACL_SUCCESS, return);
}
else
{
auto ret = CreateAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
CHECK_RET(ret == ACL_SUCCESS, return);
}
}
}
void MatMulOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
ret = aclnnMatmulGetWorkspaceSize(inputTensors[0], inputTensors[1], outputTensors[0], 1, &workspaceSize, &executor);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMatmulGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnMatmul(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMatmul failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
}
}

View File

@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class MatMulOpRunner : public BaseOpRunner
{
protected:
void setupInputDesc() override;
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
MatMulOpRunner();
};
}

View File

@ -0,0 +1,75 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def nantonum_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class NanToNumACL(jt.Function):
def __init__(self):
super(NanToNumACL, self).__init__()
def execute(self, input, nan_or_inf):
attr_code = f"""
op.jt_name = "NanToNum";
NanToNumAttr *attr = new NanToNumAttr();
attr->nan = {nan_or_inf};
attr->posinf = {-nan_or_inf};
attr->neginf = {-nan_or_inf};
op.op_attr.reset(attr);
"""
self.attr_code = attr_code
result = nantonum_cmd("NanToNum", [input],
                              output_dtypes=[input.dtype],
output_shapes=[input.shape],
attr_code=self.attr_code)[0]
return result
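
# Minimal usage sketch (assumes the ACL backend is active): nan_or_inf is the
# replacement value for NaN entries; posinf/neginf are derived from it above.
if __name__ == "__main__":
    x = jt.array(np.array([1.0, np.nan, np.inf], dtype=np.float32))
    y = NanToNumACL()(x, 0.0)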

View File

@ -0,0 +1,58 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "nantonum_op_acl.h"
namespace jittor
{
NanToNumOpRunner::NanToNumOpRunner() : BaseOpRunner("NanToNum")
{
}
void NanToNumOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<NanToNumAttr *>(op_attr.get());
ret = aclnnNanToNumGetWorkspaceSize(inputTensors[0], attr->nan, attr->posinf, attr->neginf, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnNanToNum(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnNanToNum failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}

View File

@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class NanToNumOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
NanToNumOpRunner();
};
}

View File

@ -0,0 +1,184 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def norms_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class BatchNormACL(jt.Function):
def __init__(self,
num_features,
eps=1e-05,
momentum=0.1,
affine=True,
is_train=True,
sync=True):
self.num_features = num_features
self.eps = eps
self.momentum = momentum
self.affine = affine
self.is_train = is_train
self.sync = sync
self.weight = jt.init.constant(
(num_features, ), "float32", 1.0) if affine else 1.0
self.bias = jt.init.constant(
(num_features, ), "float32", 0.0) if affine else 0.0
self.running_mean = jt.init.constant((num_features, ), "float32",
0.0).stop_grad()
self.running_var = jt.init.constant((num_features, ), "float32",
1.0).stop_grad()
def execute(self, x):
# assert self.num_features == x.shape[-1]
self.input = x.float32()
inputs = [
self.input, self.weight, self.bias, self.running_mean,
self.running_var
]
outputs = [
jt.empty(x.shape),
jt.empty(self.num_features),
jt.empty(self.num_features)
]
attr_code = f"""
op.jt_name = "batchnorm";
BatchNormAttr *attr = new BatchNormAttr();
attr->is_train = {"true" if self.is_train else "false"};
attr->momentum = {self.momentum};
attr->eps = {self.eps};
op.op_attr.reset(attr);
"""
result = norms_cmd("BatchNorm",
inputs=inputs,
outputs=outputs,
attr_code=attr_code)
self.output = result[0]
self.saveMean = result[1]
self.saveInvstd = result[2]
return self.output
def grad(self, grad_output):
attr_code = f"""
op.jt_name = "batchnorm";
BatchNormAttr *attr = new BatchNormAttr();
attr->is_train = {"true" if self.is_train else "false"};
attr->momentum = {self.momentum};
attr->eps = {self.eps};
op.op_attr.reset(attr);
"""
inputs = [
grad_output, self.input, self.weight, self.running_mean,
self.running_var, self.saveMean, self.saveInvstd
]
outputs = [
jt.empty(self.input.shape),
jt.empty(self.num_features),
jt.empty(self.num_features)
]
grad_input = norms_cmd("BatchNormBackward",
inputs=inputs,
outputs=outputs,
attr_code=attr_code)[0]
return grad_input
class LayerNormACL(jt.Function):
def __init__(self,
normalized_shape,
eps: float = 1e-5,
elementwise_affine: bool = True):
if isinstance(normalized_shape, int):
normalized_shape = (normalized_shape, )
self.normalized_shape = tuple(normalized_shape)
self.eps = eps
self.elementwise_affine = elementwise_affine
self.weight = jt.init.constant(normalized_shape, "float32",
1.0) if elementwise_affine else 1.0
self.bias = jt.init.constant(normalized_shape, "float32",
0.0) if elementwise_affine else 0.0
def execute(self, x):
self.input = x.float32()
inputs = [self.input, self.weight, self.bias]
outputs = [jt.empty(x.shape), jt.empty(x.shape), jt.empty(x.shape)]
attr_code = f"""
op.jt_name = "layernorm";
LayerNormAttr *attr = new LayerNormAttr();
attr->eps = {self.eps};
attr->normalizedShape = {{{', '.join(map(str, (list(self.normalized_shape))))}}};
attr->size = {x.shape[-1]};
op.op_attr.reset(attr);
"""
result = norms_cmd("LayerNorm",
inputs=inputs,
outputs=outputs,
attr_code=attr_code)
self.output = result[0]
self.meanout = result[1]
self.rstdout = result[2]
return self.output
    def grad(self, grad_output):
        # Minimal sketch: assumes a LayerNormBackward runner is registered on
        # the ACL side, mirroring BatchNormBackward above; it consumes the
        # mean/rstd saved by the forward pass.
        attr_code = f"""
        op.jt_name = "layernormbackward";
        LayerNormAttr *attr = new LayerNormAttr();
        attr->eps = {self.eps};
        attr->normalizedShape = {{{', '.join(map(str, (list(self.normalized_shape))))}}};
        attr->size = {self.input.shape[-1]};
        op.op_attr.reset(attr);
        """
        inputs = [grad_output, self.input, self.meanout, self.rstdout, self.weight]
        outputs = [jt.empty(self.input.shape)]
        grad_input = norms_cmd("LayerNormBackward",
                               inputs=inputs,
                               outputs=outputs,
                               attr_code=attr_code)[0]
        return grad_input
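
# Minimal usage sketch (assumes the ACL backend is active; BatchNorm expects
# NCHW input with num_features channels):
if __name__ == "__main__":
    bn = BatchNormACL(num_features=8)
    y = bn(jt.randn(4, 8, 16, 16))
    ln = LayerNormACL(normalized_shape=16)
    z = ln(jt.randn(4, 16))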

View File

@ -0,0 +1,111 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "norms_op_acl.h"
namespace jittor
{
BatchNormOpRunner::BatchNormOpRunner() : BaseOpRunner("BatchNorm")
{
}
void BatchNormOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<BatchNormAttr *>(op_attr.get());
ret = aclnnBatchNormGetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], inputTensors[3], inputTensors[4], attr->is_train, attr->momentum, attr->eps, outputTensors[0], outputTensors[1], outputTensors[2], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnBatchNorm(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnBatchNorm failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
BatchNormBackwardOpRunner::BatchNormBackwardOpRunner() : BaseOpRunner("BatchNormBackward")
{
}
void BatchNormBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<BatchNormAttr *>(op_attr.get());
bool outputMask[3] = {true, true, true};
aclBoolArray *outMask = aclCreateBoolArray(outputMask, 3);
ret = aclnnBatchNormBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], inputTensors[3], inputTensors[4], inputTensors[5], inputTensors[6], attr->is_train, attr->eps, outMask, outputTensors[0], outputTensors[1], outputTensors[2], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnBatchNormBackward(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnBatchNormBackward failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
LayerNormOpRunner::LayerNormOpRunner() : BaseOpRunner("LayerNorm")
{
}
void LayerNormOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<LayerNormAttr *>(op_attr.get());
aclIntArray *normalizedShape = nullptr;
normalizedShape = aclCreateIntArray(attr->normalizedShape.data(), attr->size);
ret = aclnnLayerNormGetWorkspaceSize(inputTensors[0], normalizedShape, inputTensors[1], inputTensors[2], attr->eps, outputTensors[0], outputTensors[1], outputTensors[2], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnLayerNorm(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnLayerNorm failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
aclDestroyIntArray(normalizedShape);
return;
}
}

View File

@ -0,0 +1,34 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class BatchNormOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
BatchNormOpRunner();
};
class BatchNormBackwardOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
BatchNormBackwardOpRunner();
};
class LayerNormOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
LayerNormOpRunner();
};
}

View File

@ -0,0 +1,176 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def pool_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class PoolACL(jt.Function):
def __init__(self,
kernel_size,
stride=None,
padding=0,
dilation=None,
return_indices=None,
ceil_mode=False,
count_include_pad=True,
op='maximum'):
self.kernel_size = kernel_size if isinstance(
kernel_size, tuple) else (kernel_size, kernel_size)
stride = stride if stride else kernel_size
self.stride = stride if isinstance(stride, tuple) else (stride, stride)
self.padding = padding if isinstance(padding, tuple) else (padding,
padding)
dilation = dilation if dilation else 1
assert dilation == 1
self.dilation = dilation if isinstance(dilation, tuple) else (dilation,
dilation)
for item in self.kernel_size:
if item <= 0:
raise RuntimeError(
f"kernel_size must be greater than zero, but got {item}")
for item in self.stride:
if item <= 0:
raise RuntimeError(
f"stride must be greater than zero, but got {item}")
for item in self.padding:
if item < 0:
raise RuntimeError(
f"padding must be non-negative, but got {item}")
self.op = op
self.return_indices = return_indices
self.ceil_mode = ceil_mode
self.count_include_pad = count_include_pad
def execute(self, input):
self.input = input
attr_code = f"""
op.jt_name = "{"avgpool" if self.op == 'mean' else "maxpool"}";
PoolAttr *attr = new PoolAttr();
attr->kernel_size = {{ {self.kernel_size[0]}, {self.kernel_size[1]} }};
attr->poolStrides = {{ {self.stride[0]}, {self.stride[1]} }};
attr->poolPads = {{ {self.padding[0]}, {self.padding[1]} }};
attr->poolDilations = {{ {self.dilation[0]}, {self.dilation[1]} }};
attr->poolCeil = {"true" if self.ceil_mode else "false"};
attr->countIncludePad = {"true" if self.count_include_pad else "false"};
op.op_attr.reset(attr);
"""
input_height, input_width = input.shape[-2:]
kernel_height, kernel_width = self.kernel_size[-2:]
output_height = (input_height + 2 * self.padding[0] -
(kernel_height - 1) - 1) // self.stride[0] + 1
output_width = (input_width + 2 * self.padding[1] -
(kernel_width - 1) - 1) // self.stride[1] + 1
output_shape = (input.shape[0], input.shape[1], output_height,
output_width)
inputs = [input]
if self.op == 'maximum':
result = pool_cmd(
"Maxpool",
inputs,
output_dtypes=[input.dtype, 'int32'],
output_shapes=[output_shape, output_shape],
attr_code=attr_code,
)
elif self.op == 'mean':
result = pool_cmd(
"Avgpool",
inputs,
output_dtypes=[input.dtype],
output_shapes=[output_shape],
attr_code=attr_code,
)
else:
raise ValueError(f'unsupported pool type: {self.op}')
if self.op == 'maximum':
self.index = result[1]
if self.return_indices:
return result[0], result[1]
else:
return result[0]
def grad(self, grad_output):
input = self.input
attr_code = f"""
op.jt_name = "{"avgpoolbackward" if self.op == 'mean' else "maxpoolbackward"}";
PoolAttr *attr = new PoolAttr();
attr->kernel_size = {{ {self.kernel_size[0]}, {self.kernel_size[1]} }};
attr->poolStrides = {{ {self.stride[0]}, {self.stride[1]} }};
attr->poolPads = {{ {self.padding[0]}, {self.padding[1]} }};
attr->poolDilations = {{ {self.dilation[0]}, {self.dilation[1]} }};
attr->poolCeil = {"true" if self.ceil_mode else "false"};
attr->countIncludePad = {"true" if self.count_include_pad else "false"};
op.op_attr.reset(attr);
"""
output_shapes = [input.shape]
output_dtypes = [input.dtype]
if self.op == 'maximum':
result = pool_cmd("MaxpoolBackward",
inputs=[grad_output, input, self.index],
output_dtypes=output_dtypes,
output_shapes=output_shapes,
attr_code=attr_code)[0]
elif self.op == 'mean':
result = pool_cmd("AvgpoolBackward",
inputs=[grad_output, input],
output_dtypes=output_dtypes,
output_shapes=output_shapes,
attr_code=attr_code)[0]
else:
raise ValueError(f'unsupported pool type: {self.op}')
return result
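
A minimal usage sketch for the wrapper above, assuming an ACL-enabled Jittor build with PoolACL in scope (the setup below is hypothetical, not part of this patch):
import jittor as jt
pool = PoolACL(kernel_size=2, stride=2, op='maximum')  # jt.Function subclasses are invoked like callables
x = jt.rand(1, 3, 8, 8)  # NCHW input
y = pool(x)
print(y.shape)  # expected (1, 3, 4, 4)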

View File

@ -0,0 +1,187 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "pool_op_acl.h"
namespace jittor
{
MaxpoolOpRunner::MaxpoolOpRunner() : BaseOpRunner("Maxpool")
{
use_nchw = true;
}
void MaxpoolOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
aclIntArray *strides = nullptr;
aclIntArray *pads = nullptr;
aclIntArray *dilations = nullptr;
aclIntArray *kernel_size = nullptr;
auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
strides = aclCreateIntArray(attr->poolStrides.data(), 2);
pads = aclCreateIntArray(attr->poolPads.data(), 2);
dilations = aclCreateIntArray(attr->poolDilations.data(), 2);
ret = aclnnMaxPool2dWithIndicesGetWorkspaceSize(inputTensors[0], kernel_size, strides, pads, dilations, attr->poolCeil, outputTensors[0], outputTensors[1], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnMaxPool2dWithIndices(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMaxPool2dWithIndices failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
aclDestroyIntArray(strides);
aclDestroyIntArray(pads);
aclDestroyIntArray(dilations);
aclDestroyIntArray(kernel_size);
return;
}
AvgpoolOpRunner::AvgpoolOpRunner() : BaseOpRunner("Avgpool")
{
use_nchw = true;
}
void AvgpoolOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
aclIntArray *strides = nullptr;
aclIntArray *pads = nullptr;
aclIntArray *kernel_size = nullptr;
auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
strides = aclCreateIntArray(attr->poolStrides.data(), 2);
pads = aclCreateIntArray(attr->poolPads.data(), 2);
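// note: in the public aclnn API the parameter after divisorOverride is int8_t cubeMathType; this call passes divisorOverride in both positions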
ret = aclnnAvgPool2dGetWorkspaceSize(inputTensors[0], kernel_size, strides, pads, attr->poolCeil, attr->countIncludePad, attr->divisorOverride, attr->divisorOverride, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnAvgPool2d(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnAvgPool2d failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
aclDestroyIntArray(strides);
aclDestroyIntArray(pads);
aclDestroyIntArray(kernel_size);
return;
}
MaxpoolBackwardOpRunner::MaxpoolBackwardOpRunner() : BaseOpRunner("MaxpoolBackward")
{
use_nchw = true;
}
void MaxpoolBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
aclIntArray *strides = nullptr;
aclIntArray *pads = nullptr;
aclIntArray *dilations = nullptr;
aclIntArray *kernel_size = nullptr;
auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
strides = aclCreateIntArray(attr->poolStrides.data(), 2);
pads = aclCreateIntArray(attr->poolPads.data(), 2);
dilations = aclCreateIntArray(attr->poolDilations.data(), 2);
ret = aclnnMaxPool2dWithIndicesBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], kernel_size, strides, pads, dilations, attr->poolCeil, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnMaxPool2dWithIndicesBackward(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMaxPool2dWithIndicesBackward failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
aclDestroyIntArray(strides);
aclDestroyIntArray(pads);
aclDestroyIntArray(dilations);
aclDestroyIntArray(kernel_size);
return;
}
AvgpoolBackwardOpRunner::AvgpoolBackwardOpRunner() : BaseOpRunner("AvgpoolBackward")
{
use_nchw = true;
}
void AvgpoolBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
aclIntArray *strides = nullptr;
aclIntArray *pads = nullptr;
aclIntArray *kernel_size = nullptr;
auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
strides = aclCreateIntArray(attr->poolStrides.data(), 2);
pads = aclCreateIntArray(attr->poolPads.data(), 2);
ret = aclnnAvgPool2dBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], kernel_size, strides, pads, attr->countIncludePad, attr->divisorOverride, attr->divisorOverride, attr->poolCeil, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnAvgPool2dBackward(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnAvgPool2dBackward failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
aclDestroyIntArray(strides);
aclDestroyIntArray(pads);
aclDestroyIntArray(kernel_size);
return;
}
}

View File

@ -0,0 +1,46 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class MaxpoolOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
MaxpoolOpRunner();
};
class AvgpoolOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
AvgpoolOpRunner();
};
class MaxpoolBackwardOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
MaxpoolBackwardOpRunner();
};
class AvgpoolBackwardOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
AvgpoolBackwardOpRunner();
};
}

View File

@ -0,0 +1,82 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "random_op_acl.h"
namespace jittor
{
RandomOpRunner::RandomOpRunner() : BaseOpRunner("RandomUniform")
{
name = "RandomUniform";
}
RandomOpRunner::RandomOpRunner(const string &_name) : BaseOpRunner(_name)
{
name = _name;
}
void RandomOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<RandomAttr *>(op_attr.get());
if (name == "RandomUniform")
{
ret = aclnnInplaceUniformGetWorkspaceSize(outputTensors[0], 0.0, 1.0, attr->seed, attr->offset, &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnInplaceUniform(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnInplaceUniform failed. ERROR: %d\n", name.c_str(), ret); return);
}
else if (name == "RandomNormal")
{
ret = aclnnInplaceNormalGetWorkspaceSize(outputTensors[0], 0.0, 1.0, attr->seed, attr->offset, &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnInplaceNormal(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnInplaceNormal failed. ERROR: %d\n", name.c_str(), ret); return);
}
else
{
LOGf << "Not supported random type : " << name;
}
syncRun();
return;
}
}

View File

@ -0,0 +1,18 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class RandomOpRunner : public BaseOpRunner
{
protected:
string name; // special to random op
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
RandomOpRunner();
RandomOpRunner(const string &name);
};
}

View File

@ -0,0 +1,127 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "reduce_op_acl.h"
namespace jittor
{
ReduceOpRunner::ReduceOpRunner() : BaseOpRunner("reduce")
{
use_nchw = false;
}
void ReduceOpRunner::setupOutputDesc()
{
auto output_num = out_.size();
for (int output_idx = 0; output_idx < output_num; output_idx++)
{
std::vector<int64_t> shape;
for (int j = 0; j < out_[output_idx]->shape.size(); j++)
{
shape.push_back(out_[output_idx]->shape[j]);
}
outputShapes.push_back(shape);
}
attr = dynamic_cast<ReduceAttr *>(op_attr.get());
dim = aclCreateIntArray(attr->axes.data(), attr->axes.size());
keepdims = attr->keepdims;
if (op_idx < 13)
{
if (attr->axes.size() == in_[0]->shape.size())
outputShapes[0] = {};
}
for (int idx = 0; idx < output_num; idx++)
{
outputTensors.push_back(nullptr);
auto ret = CreateAclTensor(outputShapes[idx], out_[idx]->mem_ptr, out_[idx]->size, get_dtype(out_[idx]->dtype()), &outputTensors[idx], use_nchw);
CHECK_RET(ret == ACL_SUCCESS, return);
}
}
void ReduceOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
switch (op_idx)
{
case 9:
{
ret = aclnnReduceSumGetWorkspaceSize(inputTensors[0], dim, keepdims, get_dtype(out_[0]->dtype()), outputTensors[0], &workspaceSize, &executor);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnReduceSumGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnReduceSum(workspaceAddr, workspaceSize, executor, aclstream);
break;
}
case 10:
{
ret = aclnnMeanGetWorkspaceSize(inputTensors[0], dim, keepdims, get_dtype(out_[0]->dtype()), outputTensors[0], &workspaceSize, &executor);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMeanGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnMean(workspaceAddr, workspaceSize, executor, aclstream);
break;
}
case 11:
{
ret = aclnnAmaxGetWorkspaceSize(inputTensors[0], dim, keepdims, outputTensors[0], &workspaceSize, &executor);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnAmaxGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnAmax(workspaceAddr, workspaceSize, executor, aclstream);
break;
}
case 12:
{
ret = aclnnAminGetWorkspaceSize(inputTensors[0], dim, keepdims, outputTensors[0], &workspaceSize, &executor);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnAminGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnAmin(workspaceAddr, workspaceSize, executor, aclstream);
break;
}
default:
{
LOGir << "unsupported reduce op_idx: " << op_idx;
exit(-1);
}
}
syncRun();
return;
}
}
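
For reference, the magic numbers in the switch above map to reduce kinds as follows; this mapping is inferred from the dispatch itself and is a convention of the calling code, not an official API:
# inferred op_idx convention used by ReduceOpRunner (assumption)
REDUCE_OP_IDX = {9: "sum", 10: "mean", 11: "amax", 12: "amin"}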

View File

@ -0,0 +1,21 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
struct ReduceOpRunner : public BaseOpRunner
{
int op_idx; // Specific to reduce operations
ReduceOpRunner();
protected:
ReduceAttr *attr;
aclIntArray *dim;
bool keepdims;
void setupOutputDesc() override;
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
};
}

View File

@ -0,0 +1,115 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def relu_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class ReLUACL(jt.Function):
def __init__(self):
super(ReLUACL, self).__init__()
def execute(self, x):
x = x.float32()
self.input = x
result = relu_cmd("Unary", [x],
output_dtypes=[x.dtype],
output_shapes=[x.shape],
attr_code="op.name=\"ReLU\";")[0]
return result
def grad(self, grad_output):
mask = relu_cmd("Binary",
[self.input, jt.zeros(self.input.shape)],
output_dtypes=[self.input.dtype],
output_shapes=[self.input.shape],
attr_code="op.name=\"Greater\";")[0]
grad_input = relu_cmd("Binary", [grad_output, mask],
output_dtypes=[grad_output.dtype],
output_shapes=[grad_output.shape],
attr_code="op.name=\"Mul\";")[0]
return grad_input
class LeakyReLUACL(jt.Function):
def __init__(self):
super(LeakyReLUACL, self).__init__()
def execute(self, x, negative_slope=0.01):
x = x.float32()
self.input = x
attr_code = f"""
op.jt_name = "leakyrelu";
LeakyReluAttr *attr = new LeakyReluAttr();
attr->negativeSlope = {negative_slope};
op.op_attr.reset(attr);
"""
result = relu_cmd("LeakyReLU", [x],
output_dtypes=[x.dtype],
output_shapes=[x.shape],
attr_code=attr_code)[0]
self.negative_slope = negative_slope
return result
def grad(self, grad_output):
attr_code = f"""
op.jt_name = "leakyrelubackward";
LeakyReluAttr *attr = new LeakyReluAttr();
attr->negativeSlope = {self.negative_slope};
attr->selfIsResult = false;
op.op_attr.reset(attr);
"""
grad_input = relu_cmd("LeakyReLUBackward", [grad_output, self.input],
output_dtypes=[grad_output.dtype],
output_shapes=[grad_output.shape],
attr_code=attr_code)[0]
return grad_input
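
A minimal call sketch for the two activations above, assuming an ACL-enabled Jittor build with these classes in scope:
import jittor as jt
relu = ReLUACL()
print(relu(jt.array([-1.0, 0.0, 2.0])))  # expected [0., 0., 2.]
leaky = LeakyReLUACL()
print(leaky(jt.array([-1.0, 2.0]), 0.1))  # expected [-0.1, 2.]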

View File

@ -0,0 +1,90 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "relu_op_acl.h"
namespace jittor
{
LeakyReLUOpRunner::LeakyReLUOpRunner() : BaseOpRunner("LeakyReLU")
{
}
void LeakyReLUOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
aclScalar *negativeSlope = nullptr;
auto attr = dynamic_cast<LeakyReluAttr *>(op_attr.get());
negativeSlope = aclCreateScalar(&attr->negativeSlope, aclDataType::ACL_FLOAT);
ret = aclnnLeakyReluGetWorkspaceSize(inputTensors[0], negativeSlope, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnLeakyRelu(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnLeakyRelu failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
aclDestroyScalar(negativeSlope);
return;
}
LeakyReLUBackwardOpRunner::LeakyReLUBackwardOpRunner() : BaseOpRunner("LeakyReLUBackward")
{
}
void LeakyReLUBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
aclScalar *negativeSlope = nullptr;
auto attr = dynamic_cast<LeakyReluAttr *>(op_attr.get());
negativeSlope = aclCreateScalar(&attr->negativeSlope, aclDataType::ACL_FLOAT);
ret = aclnnLeakyReluBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], negativeSlope, attr->selfIsResult, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnLeakyReluBackward(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnLeakyReluBackward failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
aclDestroyScalar(negativeSlope);
return;
}
}

View File

@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class LeakyReLUOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
LeakyReLUOpRunner();
};
class LeakyReLUBackwardOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
LeakyReLUBackwardOpRunner();
};
}

View File

@ -0,0 +1,84 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def rope_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class RopeACL(jt.Function):
def __init__(self):
super(RopeACL, self).__init__()
def execute(self, xq, xk, freqs_cis, freq_cos, freq_sin):
attr_code = f"""
op.jt_name = "RotaryPosEmb";
"""
if freqs_cis is not None:
freq_cos = freqs_cis[..., 0]
freq_sin = freqs_cis[..., 1]
else:
assert freq_cos is not None and freq_sin is not None
inputs = [xq, xk, freq_cos, freq_sin]
results = rope_cmd("RotaryPosEmb",
inputs,
output_dtypes=[
xq.dtype,
],
output_shapes=[
xq.shape,
],
attr_code=attr_code)
results[0].sync()
return inputs[0], inputs[1]
def grad(self, grad_output):
return grad_output
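
A minimal call sketch for RopeACL; the (batch, seq_len, n_heads, head_dim) layout below is an assumption, and note that aclnnApplyRotaryPosEmb rotates xq and xk in place:
import jittor as jt
rope = RopeACL()
xq = jt.rand(1, 16, 8, 64)
xk = jt.rand(1, 16, 8, 64)
freq_cos = jt.rand(16, 64)  # assumed broadcastable to xq/xk
freq_sin = jt.rand(16, 64)
xq, xk = rope(xq, xk, None, freq_cos, freq_sin)  # freqs_cis=None selects the cos/sin path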

View File

@ -0,0 +1,57 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "rope_op_acl.h"
namespace jittor
{
RotaryPosEmbOpRunner::RotaryPosEmbOpRunner() : BaseOpRunner("RotaryPosEmb")
{
}
void RotaryPosEmbOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
ret = aclnnApplyRotaryPosEmbGetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], inputTensors[3], (int64_t)1, &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnApplyRotaryPosEmb(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnApplyRotaryPosEmb failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}

View File

@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class RotaryPosEmbOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
RotaryPosEmbOpRunner();
};
}

View File

@ -0,0 +1,356 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def setitem_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
def setitem_forward(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None,
extra_data: dict = {}):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
op.add(out0, false);
{attr_code}
op.run();""",
data=extra_data)
def caculate_shape(tensors):
if isinstance(tensors, jt.Var):
# tensors = tensors[0]
return tensors.shape
elif isinstance(tensors, (int, float)):
return []
elif isinstance(tensors, (list, tuple)):
# return [caculate_shape(tensor) for tensor in tensors]
sub_shape = caculate_shape(tensors[0])
return [len(tensors)] + list(sub_shape)
else:
assert False, f"not implemented for {type(tensors)}"
def can_broadcast_and_shape(shape1, shape2):
"""
检查两个张量是否可以广播并返回广播后的形状
参数:
- shape1: 第一个张量的形状tuple list
- shape2: 第二个张量的形状tuple list
返回:
- can_broadcast: 布尔值表示是否可以广播
- broadcast_shape: 如果可以广播返回广播后的形状否则返回 None
"""
# Convert shapes to tuples in case the inputs are lists
shape1 = tuple(shape1)
shape2 = tuple(shape2)
# Pad the shorter shape with leading 1s so both have the same length
len1, len2 = len(shape1), len(shape2)
if len1 < len2:
shape1 = (1, ) * (len2 - len1) + shape1
elif len2 < len1:
shape2 = (1, ) * (len1 - len2) + shape2
broadcast_shape = []
# Check each dimension, starting from the last
for dim1, dim2 in zip(shape1, shape2):
if dim1 == dim2:
broadcast_shape.append(dim1)
elif dim1 == 1:
broadcast_shape.append(dim2)
elif dim2 == 1:
broadcast_shape.append(dim1)
else:
# Incompatible at this dimension, so the shapes cannot broadcast
return False, None
return True, tuple(broadcast_shape)
class SetItemACL(jt.Function):
def __init__(self):
self.type_ = 'notype'
self.value_var = True
def stride(self, x, dim):
stride = 1
for i in range(dim + 1, len(x.shape)):
stride *= x.shape[i]
return stride
def execute(self, x, slices, value):
self.x_shape = x.shape
self.input_slice = slices
if not isinstance(value, jt.Var):
self.value_var = False
if isinstance(slices, jt.Var):
if slices.dtype == "bool":
slices_len = slices.sum().item()
if slices_len == 0:
return x
if isinstance(value, int) or isinstance(value, float):
value = jt.full((slices_len, ), value, dtype=x.dtype)
assert slices.shape == x.shape, "boolean mask shape must match input shape"
assert len(value.shape) == 1, "value shape must be 1D"
assert value.shape[0] == slices_len, "value length must equal the number of True entries in the mask"
self.type_ = 'mask'
self.value_shape = value.shape
inputs = [slices, value]
outputs = [x.clone()]
attr_code = f"""
op.jt_name = "inplacemaskedscatter";
"""
result = setitem_cmd("InplaceMaskedScatter",
inputs=inputs,
outputs=outputs,
attr_code=attr_code)[0]
return result
# assert isinstance(value,jt.Var), "value must be jt.Var"
# self.value_shape = value.shape
if not isinstance(slices, tuple):
slices = (slices, )
slices = list(slices)
for i, s in enumerate(slices):
if isinstance(s, int) and s < 0:
slices[i] = x.shape[i] + s
slices = tuple(slices)
slices_list = list(slices)
# check whether slices contains a slice (or Ellipsis)
contains_slice = False
for s in slices:
if not isinstance(s, jt.Var) and (isinstance(s, slice)
or s == Ellipsis):
contains_slice = True
break
if not contains_slice:
indices = []
value_shape = []
slices_len = len(slices)
broadcast_shape = caculate_shape(slices_list[0])
for ii in range(1, len(slices)):
ok, broadcast_shape = can_broadcast_and_shape(
broadcast_shape, caculate_shape(slices_list[ii]))
assert ok, "index shapes cannot be broadcast together"
value_shape = list(broadcast_shape) + list(x.shape[slices_len:])
if value_shape == []:
value_shape = [1]
if isinstance(value, int) or isinstance(value, float):
value = jt.full(value_shape, value)
self.value_shape = value_shape
for ii in slices:
indices.append(jt.Var(ii).int32())
if isinstance(slices[0],
jt.Var) or isinstance(slices[0], int) or isinstance(
slices[0], list) or isinstance(slices[0], tuple):
self.indices = indices
self.type_ = 'index'
attr_code = f"""
op.jt_name = "indexputimpl";
"""
inputs = [value] + indices
outputs = [x.clone()]
result = setitem_cmd("IndexPutImpl",
inputs=inputs,
outputs=outputs,
attr_code=attr_code)[0]
# result.sync()
return result
assert "not support"
assert contains_slice, "slice type error"
x_dim = len(x.shape)
slices = list(slices)
for s in slices:
if not isinstance(s, jt.Var) and s == Ellipsis:
slices = slices[:slices.index(s)] + [
slice(None, None, None)
] * (x_dim - len(slices) + 1) + slices[slices.index(s) + 1:]
break
slices = tuple(slices)
self.input_slice = slices
if len(slices) < x_dim:
slices += (slice(None, None, None), ) * (x_dim - len(slices))
sizes = []
# Workaround for an Ascend constraint: the step of the last dimension must be 1
expand_dim = False
if isinstance(slices[-1], slice):
if slices[-1].step is not None and slices[-1].step != 1:
slices = slices + (slice(None, None, None), )
expand_dim = True
elif isinstance(slices[-1], int):
# Note: the last slice entry is an integer index
slices = slices + (slice(None, None, None), )
expand_dim = True
# value = value.unsqueeze(-1)
else:
assert False, "not supported"
x_shape = list(x.shape)
if expand_dim:
x_shape.append(1)
x = x.unsqueeze(-1)
value = value.unsqueeze(-1)
squeeze_dims = []
if isinstance(value, jt.Var):
for dim, s in enumerate(slices):
if isinstance(s, int):
s = slice(s, s + 1, 1)
squeeze_dims.append(dim)
for dim in squeeze_dims:
value = value.unsqueeze(dim)
extra_data = {}
if len(slices):
extra_data["a"] = len(slices)
for dim, s in enumerate(slices):
if isinstance(s, int):
s = slice(s, s + 1, 1)
if isinstance(s, jt.Var):
assert False, "jt.Var not supported"
start, stop, step = s.indices(x_shape[dim])
size = (stop - start - 1) // step + 1
sizes.append(size)
extra_data[str(dim * 3)] = start
extra_data[str(dim * 3 + 1)] = stop
extra_data[str(dim * 3 + 2)] = step
else:
extra_data["a"] = -1
sizes = [1]
steps = [1]
if isinstance(value, int) or isinstance(value, float):
value = jt.full(sizes, value)
self.type_ = 'slicev2'
attr_code = """
op.jt_name = "stridedsliceassignv2";
StrideAttr *attr = new StrideAttr();
int slice_dim = data["a"];
if(slice_dim == -1) {
attr->begins = {};
attr->ends = {};
attr->steps = {1};
attr->axes = {};
} else {
vector<long int> begins;
vector<long int> ends;
vector<long int> steps;
vector<long int> dims;
for(int dim = 0; dim < slice_dim; dim++) {
dims.push_back(dim);
begins.push_back(data[std::to_string(dim*3)]);
ends.push_back(data[std::to_string(dim*3+1)]);
steps.push_back(data[std::to_string(dim*3+2)]);
}
attr->begins = begins;
attr->ends = ends;
attr->steps = steps;
attr->axes = dims;
}
op.op_attr.reset(attr);
"""
self.value_shape = value.shape
inputs = [value]
outputs = [x.clone()]
result = setitem_forward("StridedSliceAssignV2",
inputs=inputs,
outputs=outputs,
attr_code=attr_code,
extra_data=extra_data)[0]
if expand_dim:
result = result.squeeze(-1)
# result.sync()
return result
def grad(self, grad_output):
value_grad = None
if self.value_var:
value_grad = grad_output[self.input_slice]
grad_output[self.input_slice] = jt.zeros(self.value_shape)
return grad_output, None, value_grad
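
A minimal sketch of two of the paths handled above (strided-slice assignment and integer indexing), assuming an ACL-enabled Jittor build with SetItemACL in scope:
import jittor as jt
setitem = SetItemACL()
x = jt.zeros((4, 4))
y = setitem(x, (slice(0, 2), slice(0, 4, 2)), 1.0)  # StridedSliceAssignV2 path
z = setitem(jt.zeros((4,)), jt.array([0, 2]), jt.array([5.0, 6.0]))  # IndexPutImpl path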

View File

@ -0,0 +1,84 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "setitem_op_acl.h"
namespace jittor
{
InplaceMaskedScatterOpRunner::InplaceMaskedScatterOpRunner() : BaseOpRunner("InplaceMaskedScatter")
{
}
void InplaceMaskedScatterOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
ret = aclnnInplaceMaskedScatterGetWorkspaceSize(outputTensors[0], inputTensors[0], inputTensors[1], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnInplaceMaskedScatter(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnInplaceMaskedScatter failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
IndexPutImplOpRunner::IndexPutImplOpRunner() : BaseOpRunner("IndexPutImpl")
{
}
void IndexPutImplOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto input_num = in_.size();
std::vector<aclTensor *> indexTensorList = {};
for (int i = 1; i < input_num; i++)
{
indexTensorList.push_back(inputTensors[i]);
}
auto indexTensorListInput = aclCreateTensorList(&indexTensorList[0], input_num - 1);
ret = aclnnIndexPutImplGetWorkspaceSize(outputTensors[0], indexTensorListInput, inputTensors[0], false, true, &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnIndexPutImpl(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnIndexPutImpl failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}

View File

@ -0,0 +1,26 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class InplaceMaskedScatterOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
InplaceMaskedScatterOpRunner();
};
class IndexPutImplOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
IndexPutImplOpRunner();
};
}

View File

@ -0,0 +1,85 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def sigmoid_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class SigmoidACL(jt.Function):
def __init__(self):
super(SigmoidACL, self).__init__()
def execute(self, x):
x = x.float32()
inputs = [x]
outputs = [jt.empty(x.shape, x.dtype)]
attr_code = f"""
op.jt_name = "sigmoid";
"""
result = sigmoid_cmd("Sigmoid",
inputs=inputs,
outputs=outputs,
attr_code=attr_code)[0]
self.output = result
return result
def grad(self, grad_output):
attr_code = f"""
op.jt_name = "sigmoidbackward";
"""
inputs = [grad_output, self.output]
outputs = [jt.empty(grad_output.shape, grad_output.dtype)]
grad_input = sigmoid_cmd("SigmoidBackward",
inputs=inputs,
outputs=outputs,
attr_code=attr_code)[0]
return grad_input
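
A minimal call sketch, assuming SigmoidACL is in scope on an ACL-enabled build:
import jittor as jt
sigmoid = SigmoidACL()
print(sigmoid(jt.array([0.0, 1.0])))  # expected approximately [0.5, 0.7311]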

View File

@ -0,0 +1,80 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "sigmoid_op_acl.h"
namespace jittor
{
SigmoidOpRunner::SigmoidOpRunner() : BaseOpRunner("Sigmoid")
{
}
void SigmoidOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
ret = aclnnSigmoidGetWorkspaceSize(inputTensors[0], outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnSigmoid(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSigmoid failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
SigmoidBackwardOpRunner::SigmoidBackwardOpRunner() : BaseOpRunner("SigmoidBackward")
{
}
void SigmoidBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
ret = aclnnSigmoidBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnSigmoidBackward(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSigmoidBackward failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}

View File

@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class SigmoidOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
SigmoidOpRunner();
};
class SigmoidBackwardOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
SigmoidBackwardOpRunner();
};
}

View File

@ -0,0 +1,85 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def silu_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class SiLUACL(jt.Function):
def __init__(self):
super(SiLUACL, self).__init__()
def execute(self, x):
x = x.float32()
inputs = [x]
self.input = x
outputs = [jt.empty(x.shape, x.dtype)]
attr_code = f"""
op.jt_name = "silu";
"""
result = silu_cmd("SiLU",
inputs=inputs,
outputs=outputs,
attr_code=attr_code)[0]
return result
def grad(self, grad_output):
attr_code = f"""
op.jt_name = "silubackward";
"""
inputs = [grad_output, self.input]
outputs = [jt.empty(grad_output.shape, grad_output.dtype)]
grad_input = silu_cmd("SiLUBackward",
inputs=inputs,
outputs=outputs,
attr_code=attr_code)[0]
return grad_input
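
SiLU computes x * sigmoid(x); a minimal call sketch under the same assumptions as the other wrappers:
import jittor as jt
silu = SiLUACL()
print(silu(jt.array([0.0, 1.0])))  # expected approximately [0., 0.7311]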

View File

@ -0,0 +1,80 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "silu_op_acl.h"
namespace jittor
{
SiLUOpRunner::SiLUOpRunner() : BaseOpRunner("SiLU")
{
}
void SiLUOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
ret = aclnnSiluGetWorkspaceSize(inputTensors[0], outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnSilu(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSilu failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
SiLUBackwardOpRunner::SiLUBackwardOpRunner() : BaseOpRunner("SiLUBackward")
{
}
void SiLUBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
ret = aclnnSiluBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnSiluBackward(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSiluBackward failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}

View File

@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class SiLUOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
SiLUOpRunner();
};
class SiLUBackwardOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
SiLUBackwardOpRunner();
};
}

View File

@ -0,0 +1,92 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def softmax_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class SoftmaxACL(jt.Function):
def __init__(self):
super(SoftmaxACL, self).__init__()
def execute(self, x, dim):
x = x.float32()
inputs = [x]
outputs = [jt.empty(x.shape)]
self.dim = dim
attr_code = f"""
op.jt_name = "softmax";
SoftmaxAttr *attr = new SoftmaxAttr();
attr->dim = {dim};
op.op_attr.reset(attr);
"""
result = softmax_cmd("Softmax",
inputs=inputs,
outputs=outputs,
attr_code=attr_code)[0]
self.output = result
return result
def grad(self, grad_output):
attr_code = f"""
op.jt_name = "softmax";
SoftmaxAttr *attr = new SoftmaxAttr();
attr->dim = {self.dim};
op.op_attr.reset(attr);
"""
inputs = [grad_output, self.output]
outputs = [jt.empty(grad_output.shape)]
grad_input = softmax_cmd("SoftmaxBackward",
inputs=inputs,
outputs=outputs,
attr_code=attr_code)[0]
return grad_input
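
A minimal call sketch; dim is forwarded to aclnnSoftmax, so the rows below normalize along dim 1:
import jittor as jt
softmax = SoftmaxACL()
y = softmax(jt.rand(2, 5), 1)
print(y.sum(1))  # expected approximately [1., 1.]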

View File

@ -0,0 +1,82 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "softmax_op_acl.h"
namespace jittor
{
SoftmaxOpRunner::SoftmaxOpRunner() : BaseOpRunner("Softmax")
{
}
void SoftmaxOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<SoftmaxAttr *>(op_attr.get());
ret = aclnnSoftmaxGetWorkspaceSize(inputTensors[0], attr->dim, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnSoftmax(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSoftmax failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
SoftmaxBackwardOpRunner::SoftmaxBackwardOpRunner() : BaseOpRunner("SoftmaxBackward")
{
}
void SoftmaxBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<SoftmaxAttr *>(op_attr.get());
ret = aclnnSoftmaxBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], attr->dim, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnSoftmaxBackward(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSoftmaxBackward failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}

View File

@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class SoftmaxOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
SoftmaxOpRunner();
};
class SoftmaxBackwardOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
SoftmaxBackwardOpRunner();
};
}

View File

@ -0,0 +1,115 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def stack_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class StackACL(jt.Function):
def __init__(self):
super(StackACL, self).__init__()
def execute(self, input_tensors, dim):
if type(input_tensors) is tuple:
input_tensors = list(input_tensors)
assert type(input_tensors) is list
ndim = len(input_tensors[0].shape)
assert -(ndim + 1) <= dim <= ndim, "stack dim out of range"
for i in range(len(input_tensors)):
if input_tensors[i].dtype != input_tensors[0].dtype:
raise ValueError("All input tensors must have the same dtype")
if input_tensors[i].shape != input_tensors[0].shape:
raise ValueError("All input tensors must have the same shape")
self.input = input_tensors
input_shape = list(input_tensors[0].shape)
if dim < 0:
dim = dim + len(input_shape) + 1
self.dim = dim
output_shape = input_shape[:dim] + [len(input_tensors)] + input_shape[dim:]
attr_code = f"""
op.jt_name = "stack";
ConcatAttr *attr = new ConcatAttr();
attr->tensorNum = {len(input_tensors)};
attr->dim = {dim};
op.op_attr.reset(attr);
"""
self.attr_code = attr_code
result = stack_cmd("Stack",
input_tensors,
output_dtypes=[input_tensors[0].dtype],
output_shapes=[output_shape],
attr_code=self.attr_code)[0]
return result
def grad(self, grad_output):
grad_inputs = self.split_grad(grad_output, self.input, self.dim)
return grad_inputs
def split_grad(self, grad_output, input_tensors, axis):
offset = []
shapeVec = []
dtypeVec = []
for tensor in input_tensors:
offset.append(1)  # each stacked slice has size 1 along the stacking axis
dtypeVec.append(tensor.dtype)
shapeVec.append(tensor.shape)
attr_code = f"""
op.jt_name = "splitwithsize";
auto *attr = new SplitWithSizeAttr();
attr->splitSize = {{ {", ".join(map(str, offset))} }};
attr->dim = {axis};
op.op_attr.reset(attr);
"""
result = stack_cmd("SplitWithSize", [grad_output],
output_dtypes=dtypeVec,
output_shapes=shapeVec,
attr_code=attr_code)
return result
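
A minimal call sketch for StackACL, assuming an ACL-enabled build with the class in scope:
import jittor as jt
stack = StackACL()
a, b = jt.rand(2, 3), jt.rand(2, 3)
y = stack([a, b], 0)
print(y.shape)  # expected (2, 2, 3)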

View File

@ -0,0 +1,65 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "stack_op_acl.h"
namespace jittor
{
StackOpRunner::StackOpRunner() : BaseOpRunner("Stack")
{
}
void StackOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto input_num = in_.size();
std::vector<aclTensor *> stackTensorList = {};
for (int i = 0; i < input_num; i++)
{
stackTensorList.push_back(inputTensors[i]);
}
auto stackTensorListInput = aclCreateTensorList(&stackTensorList[0], input_num);
auto attr = dynamic_cast<ConcatAttr *>(op_attr.get());
ret = aclnnStackGetWorkspaceSize(stackTensorListInput, attr->dim, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnStack(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnStack failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}

View File

@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class StackOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
StackOpRunner();
};
}

View File

@ -0,0 +1,54 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "ternary_op_acl.h"
namespace jittor
{
TernaryOpRunner::TernaryOpRunner() : BaseOpRunner("ternary")
{
use_nchw = false;
}
void TernaryOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
ret = aclnnSWhereGetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnSWhere(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSWhere failed. ERROR: %d\n", name.c_str(), ret); return);
// syncRun();
return;
}
}

View File

@ -0,0 +1,14 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
struct TernaryOpRunner : public BaseOpRunner
{
TernaryOpRunner();
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
};
}

View File

@ -0,0 +1,101 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def transpose_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class TransPoseACL(jt.Function):
def __init__(self):
super(TransPoseACL, self).__init__()
def execute(self, x, *dim):
self.input = x
if len(dim) == 1 and isinstance(dim[0], Sequence):
dim = dim[0]
elif len(dim) == 2:
axes = list(range(x.ndim))
a, b = dim
axes[a], axes[b] = axes[b], axes[a]
dim = axes
attr_code = f"""
op.jt_name = "transpose";
ReduceAttr *attr = new ReduceAttr();
attr->axes = {{ {", ".join(map(str, dim))} }};
op.op_attr.reset(attr);
"""
# calculate output shape
output_shape = [x.shape[i] for i in dim]
output = transpose_cmd("Transpose", [x],
output_dtypes=[x.dtype],
output_shapes=[jt.empty(output_shape).shape],
attr_code=attr_code)[0]
self.dim = dim
return output
def grad(self, grad_output):
dim = list(range(grad_output.ndim))
for i, p in enumerate(self.dim):
dim[p] = i
output_shape = [grad_output.shape[i] for i in dim]
attr_code = f"""
op.jt_name = "transpose";
ReduceAttr *attr = new ReduceAttr();
attr->axes = {{ {", ".join(map(str, dim))} }};
op.op_attr.reset(attr);
"""
output = transpose_cmd("Transpose", [grad_output],
output_dtypes=[grad_output.dtype],
output_shapes=[jt.empty(output_shape).shape],
attr_code=attr_code)[0]
return output
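
A usage sketch: jt.Function subclasses are invoked directly, and grad builds the inverse permutation.

import jittor as jt
x = jt.rand(2, 3, 4)
y = TransPoseACL()(x, (2, 0, 1))   # permute all axes -> y.shape == [4, 2, 3]
z = TransPoseACL()(x, 0, 2)        # two ints swap one pair of axes -> [4, 3, 2]
# In grad(), dim[p] = i inverts the forward permutation: for dim == (2, 0, 1)
# the backward pass permutes with [1, 2, 0], restoring the original layout.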

View File

@ -0,0 +1,66 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "transpose_op_acl.h"
namespace jittor
{
TransposeOpRunner::TransposeOpRunner() : BaseOpRunner("Transpose")
{
}
void TransposeOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<ReduceAttr *>(op_attr.get());
    aclIntArray *dim = aclCreateIntArray(attr->axes.data(), attr->axes.size());
ret = aclnnPermuteGetWorkspaceSize(inputTensors[0], dim, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnPermute(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnPermute failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
aclDestroyIntArray(dim);
return;
}
}

View File

@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class TransposeOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
TransposeOpRunner();
};
}

View File

@ -0,0 +1,74 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def triu_cmd(name: str,
inputs: list,
output_dtypes: list = None,
output_shapes: list = None,
attr_code: str = "",
attr_header: str = "",
outputs: list = None):
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
cuda_header = '''
#include "acl/aclops/aclops.h"
'''
outputs_ = []
if outputs is not None:
outputs_ = outputs
else:
assert output_dtypes is not None
assert output_shapes is not None
assert len(output_dtypes) == len(output_shapes)
for i in range(len(output_shapes)):
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
input_code = ''
for i in range(len(inputs)):
input_code += f"op.add(in{i}, true);\n"
output_code = ''
for i in range(len(outputs_)):
output_code += f"op.add(out{i}, false);\n"
return jt.code(outputs=outputs_,
inputs=inputs,
cuda_header=attr_header + cuda_header,
cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class TriuACL(jt.Function):
def __init__(self):
super(TriuACL, self).__init__()
    def execute(self, input, diagonal):
        self.diagonal = diagonal
attr_code = f"""
op.jt_name = "triu";
TriuAttr *attr = new TriuAttr();
attr->diagonal = {diagonal};
op.op_attr.reset(attr);
"""
result = triu_cmd("Triu", [input],
output_dtypes=[input.dtype],
output_shapes=[input.shape],
attr_code=attr_code)[0]
return result
    def grad(self, grad_output):
        # mask the gradient with the same diagonal; elements zeroed in the
        # forward pass must receive zero gradient
        attr_code = f"""
        op.jt_name = "triu";
        TriuAttr *attr = new TriuAttr();
        attr->diagonal = {self.diagonal};
        op.op_attr.reset(attr);
        """
        return triu_cmd("Triu", [grad_output],
                        output_dtypes=[grad_output.dtype],
                        output_shapes=[grad_output.shape],
                        attr_code=attr_code)[0]
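
Usage sketch, keeping the first superdiagonal and above:

import jittor as jt
x = jt.ones((3, 3))
y = TriuACL()(x, 1)
# y == [[0, 1, 1],
#       [0, 0, 1],
#       [0, 0, 0]]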

View File

@ -0,0 +1,58 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "triu_op_acl.h"
namespace jittor
{
TriuOpRunner::TriuOpRunner() : BaseOpRunner("Triu")
{
}
void TriuOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<TriuAttr *>(op_attr.get());
    // diagonal is an integer offset, not an aclDataType; pass it through directly
    ret = aclnnTriuGetWorkspaceSize(inputTensors[0], attr->diagonal, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnTriu(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnTriu failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}

View File

@ -0,0 +1,16 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
class TriuOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
TriuOpRunner();
};
}

View File

@ -0,0 +1,59 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "unary_op_acl.h"
namespace jittor
{
UnaryOpRunner::UnaryOpRunner() : BaseOpRunner("unary")
{
use_nchw = false;
is_group_op = true;
}
void UnaryOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
if (name == "Cast")
ret = it->second.getWorkspaceSizeFuncCast(inputTensors[0], get_dtype(out_[0]->dtype()), outputTensors[0], &workspaceSize, &executor);
else
ret = it->second.getWorkspaceSizeFuncUnaryNonzero(inputTensors[0], outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = it->second.executeFunc(workspaceAddr, workspaceSize, executor, aclstream);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnn execute failed. ERROR: %d\n", name.c_str(), ret); return);
// syncRun();
return;
}
}
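
This runner is a group op: one class serves every aclnn unary kernel, looked up by name in the AclOpFunctions table, with Cast special-cased because it also needs the destination dtype. Assuming the ACL backend routes ordinary unary calls here (the flag name below is an assumption), for example:

import jittor as jt
# jt.flags.use_acl = 1   # enable the ACL device first, if the flag is available
x = jt.array([-1.0, 2.0, -3.0])
y = jt.abs(x)            # dispatches via the generic unary branch
z = x.float16()          # dtype conversion goes through the "Cast" branch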

Some files were not shown because too many files have changed in this diff.