mirror of https://github.com/Jittor/Jittor
commit d892a83d1c
@@ -611,6 +611,26 @@ def setup_nccl():
    nccl_ops = nccl.ops
    LOG.vv("Get nccl_ops: "+str(dir(nccl_ops)))

def setup_hccl():
    global hccl_ops

    hccl_src_dir = os.path.join(jittor_path, "extern", "acl", "hccl")
    hccl_src_files = []
    for r, _, f in os.walk(hccl_src_dir):
        for fname in f:
            hccl_src_files.append(os.path.join(r, fname))

    hccl_include_path = os.path.join(os.environ.get("ASCEND_TOOLKIT_HOME"), "aarch64-linux/include/hccl")
    hccl_lib_name = os.path.join(os.environ.get("ASCEND_TOOLKIT_HOME"), "aarch64-linux/lib64/libhccl.so")
    ctypes.CDLL(hccl_lib_name, dlopen_flags)

    hccl = compile_custom_ops(hccl_src_files,
                              extra_flags=f" -I\"{hccl_include_path}\" {mpi_compile_flags} ",
                              return_module=True, dlopen_flags=os.RTLD_GLOBAL | os.RTLD_NOW,
                              gen_name_="jittor_hccl_core")
    hccl_ops = hccl.ops
    LOG.vv("Get hccl_ops: "+str(dir(hccl_ops)))

def manual_link(flags):
    lib_dirs = []
    libs = []
@@ -708,8 +728,14 @@ cudnn = cublas = curand = cufft = cusparse = None
setup_mpi()
rank = mpi.world_rank() if in_mpi else 0
world_size = mpi.world_size() if in_mpi else 1
# if has_acl:
#     setup_hccl()
# elif has_cuda:
#     setup_nccl()
#     setup_cutt()
#     setup_cutlass()

setup_nccl()

setup_cutt()
setup_cutlass()
@@ -1188,7 +1188,22 @@ make_cache_dir(ck_path)

# build cache_compile
cc_flags += f" -I\"{os.path.join(jittor_path, 'src')}\" "
cc_flags += f" -I\"{os.path.join(jittor_path, 'extern')}\" "

ascend_toolkit_home = os.getenv('ASCEND_TOOLKIT_HOME')

if ascend_toolkit_home:
    cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include')}\" "
    cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include/acl')}\" "
    cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include/aclnn')}\" "
    cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include/aclnnop')}\" "
    cc_flags += f" -L\"{os.path.join(ascend_toolkit_home, 'lib64')}\" "
    # the toolkit ships libascendcl.so etc., so the -l names must not repeat the "lib" prefix
    cc_flags += " -lascendcl "
    cc_flags += " -lnnopbase "
    cc_flags += " -lopapi "

cc_flags += py_include

check_cache_compile()
LOG.v(f"Get cache_compile: {jit_utils.cc}")
File diff suppressed because it is too large
@@ -1,6 +1,6 @@
// ***************************************************************
// Copyright (c) 2023 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
@@ -11,23 +11,27 @@ using std::unordered_map;

typedef int aclError;

static inline unordered_map<aclError, string> gen_map(string s)
{
    unordered_map<aclError, string> smap;
    for (int i = 0; i < s.size(); i++)
    {
        if (s[i] == ';')
        {
            int j = s.rfind(" ", i);
            int code = std::stoi(s.substr(j + 1, i - j - 1));
            int k = s.rfind(" ", j - 1);
            int l = s.rfind(" ACL_", k - 1);
            smap[code] = s.substr(l + 1, k - l - 1);
        }
    }
    return smap;
}

string acl_error_to_string(aclError error)
{
    static unordered_map<aclError, string> acl_error_map = gen_map(R"(
// from acl_base.h
static const int ACL_ERROR_INVALID_PARAM = 100000;
static const int ACL_ERROR_UNINITIALIZE = 100001;
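For reference, a self-contained sketch (not part of the commit) of the backward-scan parsing that gen_map performs: it pastes the error-code definitions from acl_base.h as raw text, then recovers each numeric code and its " ACL_..." symbol name by searching backwards from every semicolon.

#include <iostream>
#include <string>
#include <unordered_map>

int main() {
    std::string s = R"(
static const int ACL_ERROR_INVALID_PARAM = 100000;
static const int ACL_ERROR_UNINITIALIZE = 100001;
)";
    std::unordered_map<int, std::string> smap;
    for (size_t i = 0; i < s.size(); i++) {
        if (s[i] != ';') continue;
        size_t j = s.rfind(' ', i);                 // space before the numeric code
        int code = std::stoi(s.substr(j + 1, i - j - 1));
        size_t k = s.rfind(' ', j - 1);             // space before '='
        size_t l = s.rfind(" ACL_", k - 1);         // space before the symbol name
        smap[code] = s.substr(l + 1, k - l - 1);
    }
    for (auto &kv : smap)
        std::cout << kv.first << " -> " << kv.second << "\n";  // e.g. 100000 -> ACL_ERROR_INVALID_PARAM
    return 0;
}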
@@ -1,6 +1,6 @@
// ***************************************************************
// Copyright (c) 2023 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
@@ -10,267 +10,311 @@
#include "utils/str_utils.h"
#include <chrono>
#include <thread>
#include "aclnn/aclnn.h"

namespace jittor
{

uint64_t acl_jittor_tid;
int acl_jittor_thread_running = 0;
aclrtStream aclstream;
void *workspaceAddr = nullptr;
uint64_t nowWorkSpaceSize = 0;

#define CHECK_ACL(x) ASSERTop(x, ==, 0)

void mallocWorkSpace(uint64_t size)
{
    uint64_t alloc_size = size + 32;
    alloc_size = ((alloc_size - 1) / 32 + 1) * 32;
    if (alloc_size > nowWorkSpaceSize)
    {
        aclrtFree(workspaceAddr);
        nowWorkSpaceSize = alloc_size;
        auto ret = aclrtMalloc(&workspaceAddr, nowWorkSpaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("allocate workspace failed. ERROR: %d\n", ret); return);
    }
}

static void *acl_jittor_process_callback(void *)
{
    acl_jittor_thread_running = 1;
    while (acl_jittor_thread_running)
    {
        // LOGir << "acl_jittor_process_callback";
        auto ret = aclrtProcessReport(1000);
        if (ret)
        {
            if (acl_jittor_thread_running && ret != ACL_ERROR_RT_REPORT_TIMEOUT && ret != ACL_ERROR_RT_THREAD_SUBSCRIBE)
                LOGir << "aclrtProcessReport:" << ret << acl_error_to_string(ret);
            break;
        }
    }
    acl_jittor_thread_running = 0;
    return (void *)0;
}

struct acl_jittor_initer
{
    int32_t deviceId;
    acl_jittor_initer()
    {
        CHECK_ACL(aclInit(nullptr));
        uint device_count = 0;
        deviceId = 0;
        // query the number of available devices
        CHECK_ACL(aclrtGetDeviceCount(&device_count));
        LOGi << "Found ACL device number:" << device_count;
        CHECK_ACL(aclrtSetDevice(deviceId));
        CHECK_ACL(aclrtCreateStream(&aclstream));
        // pthread_create(&acl_jittor_tid, nullptr, acl_jittor_process_callback, 0);
    }

    ~acl_jittor_initer()
    {
        acl_jittor_thread_running = 0;
        // CHECK_ACL(aclrtUnSubscribeReport(acl_jittor_tid, 0));
        aclrtDestroyStream(aclstream);
        aclrtResetDevice(deviceId);
        CHECK_ACL(aclFinalize());
        if (nowWorkSpaceSize > 0)
        {
            aclrtFree(workspaceAddr);
        }
    }

} _acl_jittor_initer;

string process_acl(const string &src, const string &name, const map<string, string> &kargs)
{
    if (endswith(name, "_jittor.cc"))
        return src;
    // static vector<string> dont_compile = {"fp16_emu.cc"};
    // for (auto& s : dont_compile)
    //     if (endswith(name, s))
    //         return " ";
    static unordered_set<string> cuda_headers = {
        "cuda_runtime", "cudnn", "driver_types",
        "cuda_fp16", "cuda_runtime_api", "fp16_emu",
        "cudnn_rnn_descriptor", "cublas_v2", "cublas_wrapper",
        "curand", "curand_wrapper", "cufft", "cufftXt",
        "CudaUtils", "cutt", "cudnn_wrapper", "cuda_bf16"};
    static unordered_set<string> fake_class = {
        "cudnnHandle_t", "cudnnConvolutionBwdFilterAlgo_t",
        "cudnnConvolutionBwdDataAlgo_t", "cudnnConvolutionFwdAlgo_t",
        "cufftHandle"};
    try
    {
        auto tokens = token_split(src);
        int edit = 0;
        for (int i = 0; i < tokens.size(); i++)
        {
            auto &token = tokens[i];
            if (cuda_headers.count(token))
                token = "acl_jittor", edit++;
            else if (fake_class.count(token))
                token = "int", edit++;
            else if (token == "CUDA")
                token = "ACL", edit++;
            else if (startswith(token, "cuda"))
            {
                if (token.size() >= 5 && token[4] >= 'A' && token[4] <= 'Z')
                {
                    if (token == "cudaGetDeviceCount")
                    {
                        token_replace(tokens, i, "($1);", "((uint*)$1);");
                    }
                    else if (token == "cudaLaunchHostFunc")
                    {
                        // ACL_CALLBACK_BLOCK for 310
                        token_replace(tokens, i, "LaunchHostFunc($1,$2,$3)",
                                      "LaunchCallback($2,$3,ACL_CALLBACK_NO_BLOCK,$1)");
                    }
                    else if (token == "cudaMemcpy")
                        token_replace(tokens, i, "cudaMemcpy($1,$2,$3,",
                                      "aclrtMemcpy($1,$3,$2,$3,");
                    else if (token == "cudaMemcpyAsync")
                        token_replace(tokens, i, "cudaMemcpyAsync($1,$2,$3,",
                                      "aclrtMemcpyAsync($1,$3,$2,$3,");
                    else if (token == "cudaMemcpyDeviceToHost")
                        token = "ACL_MEMCPY_DEVICE_TO_HOST";
                    else if (token == "cudaMemcpyDefault")
                        token = "ACL_MEMCPY_HOST_TO_DEVICE";
                    else if (token == "cudaMemcpyHostToDevice")
                        token = "ACL_MEMCPY_HOST_TO_DEVICE";
                    else if (token == "cudaMemcpyDeviceToDevice")
                        token = "ACL_MEMCPY_DEVICE_TO_DEVICE";
                    else if (token == "cudaMallocManaged" || token == "cudaMalloc")
                    {
                        // unified address not supported
                        token = "aclrtMalloc";
                        token_replace(tokens, i, "($1,$2)",
                                      "($1,$2,ACL_MEM_MALLOC_HUGE_FIRST)");
                    }
                    else if (token == "cudaMemGetInfo")
                        token_replace(tokens, i, "cudaMemGetInfo($1,$2)",
                                      "aclrtGetMemInfo(ACL_DDR_MEM,$1,$2)");
                    else if (token == "cudaGetLastError")
                        token_replace(tokens, i, "cudaGetLastError()", "0");
                    else if (token == "cudaStreamCreateWithFlags")
                        token_replace(tokens, i - 1,
                                      "(cudaStreamCreateWithFlags($1,$2));",
                                      "(aclrtCreateStream($1)); checkAclErrors(aclrtSubscribeReport(acl_jittor_tid,*$1));");
                    else if (token == "cudaEventCreate")
                        token_replace(tokens, i,
                                      "cudaEventCreate($1,$2)",
                                      "aclrtCreateEvent($1)");
                    else if (token == "cudaDeviceSynchronize")
                        token = "aclrtSynchronizeDevice";
                    else if (token == "cudaStreamDestroy")
                        token_replace(tokens, i, "cudaStreamDestroy($1)",
                                      "(aclrtUnSubscribeReport(acl_jittor_tid,$1), aclrtDestroyStream($1))");
                    else if (token == "cudaEventDestroy")
                        token = "aclrtDestroyEvent";
                    else if (token == "cudaEventRecord")
                        token = "aclrtRecordEvent";
                    else if (token == "cudaStreamWaitEvent")
                        token_replace(tokens, i,
                                      "cudaStreamWaitEvent($1,$2,$3)",
                                      "aclrtStreamWaitEvent($1,$2)");

                    if (token.size() && token[0] == 'c')
                        token = "aclrt" + token.substr(4);
                    if (endswith(token, "_t"))
                        token = token.substr(0, token.size() - 2);
                    edit++;
                }
            }
            else if (token == "_cudaGetErrorEnum")
            {
                token_replace(tokens, i, "_cudaGetErrorEnum($1)", "(acl_error_to_string($1))");
                edit++;
            }
            else if (token == "checkCudaErrors")
                token = "checkAclErrors";
            else if (token == "JPU")
            {
                edit++;
                string new_code;
                if (tokens[i + 2] == "op_compiler")
                    token_replace(tokens, i,
                                  "JPU(op_compiler($1,$2,$3))",
                                  "acl_jittor_op_compiler($1,$2,$3)");
                else if (tokens[i + 2] == "header")
                    new_code = "#include \"acl_jittor.h\"";
                if (new_code.size())
                    token_replace(tokens, i, "JPU($1)", new_code);
            }
            else if (token == "use_cuda_managed_allocator" && tokens[i + 1][0] == ',')
            {
                tokens[i + 2] = "0"; // disable unified address
            }
        }
        if (!edit)
            return src;
        string new_src = join(tokens, "");
        // if (name == "executor.cc") {
        //     new_src = string("#include <Python.h>\n#include <pystate.h>\n#include <common.h>\n")+
        //         "namespace jittor { void acl_op_exec(Op*); }\n" +
        //         replace(new_src, "op->do_run_after_prepare(jkl);",
        //             R"({
        //                 acl_op_exec(op);
        //             })");
        // }
        if (name == "profiler.cc")
        {
            new_src = token_replace_all(new_src, ".cc", ".tikcc");
        }
        // LOGir << name << (name == "pass_manager.cc");
        if (name == "pass_manager.cc")
        {
            LOGir << "replace" << name;
            new_src = token_replace_all(new_src, "run_pass<FloatAtomicFixPass>();", "WTF");
        }
        // ????????
        return new_src;
    }
    catch (const std::exception &e)
    {
        LOGe << "process acl error:" << e.what();
        LOGe << "name:" << name;
        throw;
    }
}

void acl_jittor_op_compiler(string &filename, string &src, bool is_acl, string &extra_flags)
{
    if (!is_acl)
        return;
    string new_src = process_acl(src, "", {});
    new_src = replace(new_src, R"(#include "misc/cuda_atomic.h")", "");
    new_src = replace(new_src, R"(#include "misc/cuda_limits.h")", "");
    new_src = replace(new_src, "__global__", "__ai_device_entry__");
    new_src = token_replace_all(new_src, "__launch_bounds__($1)", "");
    new_src = token_replace_all(new_src, "int thread_num = $1;", "int thread_num = 1;");
    new_src = token_replace_all(new_src, "tn0=std::max(tn0, $1);", "");
    new_src = token_replace_all(new_src, "<<<$1>>>", "<<<1,0>>>");
    new_src = token_replace_all(new_src, "int thread_id = $1;", "int thread_id = 1;");
    // for inc error
    new_src = token_replace_all(new_src, "for ($1+=$2)", "for ($1++)");
    // bit op error
    new_src = token_replace_all(new_src, "int tnum$1;", "");
    new_src = token_replace_all(new_src, "int p1$1;", "");
    new_src = token_replace_all(new_src, "int p2$1;", "");
    new_src = token_replace_all(new_src, "int tn$1=$2;", "int tn$1=0;");
    new_src = token_replace_all(new_src, "int tid$1=$2;", "int tid$1=0;");
    src = new_src;

    new_src = token_replace_all(new_src, "atomicAdd(&$1,$2);", "$1=$1+$2;");
    // new_src = token_replace_all(new_src, "bool", "int8");
    new_src = token_replace_all(new_src, "::numeric_min<float32>()", "-1e30");
    new_src = token_replace_all(new_src, "::numeric_max<float32>()", "1e30");
    // TODO: support max
    unordered_map<string, string> opmap = {
        // {"::max","tikcc::scalar_max"},
        {"::sqrtf", "tikcc::scalar_sqrt"}};
    auto ss = split(new_src, ";");
    for (auto &s : ss)
    {
        if (s.find("?") != string::npos)
        {
            s = token_replace_all(s + ";", "auto $1=$2?$3:$4;", "auto $1=$3;if (!($2)) $1=$4;");
        }
        if (s.find("::max") != string::npos)
        {
            if (s.find("auto") == string::npos)
            {
                s = token_replace_all(s + ";", " $1=$4::max($2,$3);", " $1=$2;if ($2 < $3) $1=$3;");
            }
            else
            {
                s = token_replace_all(s + ";", "auto $1=$4::max($2,$3);", "auto $1=$2;if ($2 < $3) $1=$3;");
            }
        }
        for (auto &kv : opmap)
        {
            if (s.find(kv.first) != string::npos)
            {
                if (s.find("auto") == string::npos)
                {
                    // $1 = op($2) --> op($1, $2)
                    s = token_replace_all(s + ";", " $1= " + kv.first + "($2);", kv.second + "($1, $2);");
                }
                else
                {
                    // auto $1 = op($2) --> float32 $1; op($1, $2);
                    s = token_replace_all(s + ";", "auto $1= " + kv.first + "($2);", "float32 $1; " + kv.second + "($1, $2);");
                }
            }
        }
        // s = token_replace_all(s+";", "auto $1=$2?$3:$4;", "auto $1=$3;if (!($2)) $1=$4;");
        // if (s.find("::max") != string::npos) {
        //     s = token_replace_all(s+";", " $1= ::max($2);", "tikcc::scalar_max($1, $2);");
        // }
    }
    new_src = join(ss, ";");
    src = new_src;
}

}
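A standalone sketch (not from this commit) of the grow-only workspace policy in mallocWorkSpace above: the request is padded by 32 bytes, rounded up to a 32-byte multiple, and the buffer is reallocated only when it must grow; plain malloc/free stand in for aclrtMalloc/aclrtFree so the sketch runs without the ACL SDK.

#include <cstdint>
#include <cstdio>
#include <cstdlib>

static void *workspace = nullptr;
static uint64_t workspaceSize = 0;

void ensureWorkspace(uint64_t size) {
    uint64_t alloc_size = size + 32;
    alloc_size = ((alloc_size - 1) / 32 + 1) * 32;  // round up to a 32-byte multiple
    if (alloc_size > workspaceSize) {               // grow-only: never shrink
        free(workspace);                            // stand-in for aclrtFree
        workspaceSize = alloc_size;
        workspace = malloc(workspaceSize);          // stand-in for aclrtMalloc
    }
}

int main() {
    ensureWorkspace(100);  // 100 + 32 = 132, rounds up to 160
    printf("%llu\n", (unsigned long long)workspaceSize);
    ensureWorkspace(50);   // smaller request: no reallocation
    printf("%llu\n", (unsigned long long)workspaceSize);
    return 0;
}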
@@ -1,20 +1,700 @@
// ***************************************************************
// Copyright (c) 2023 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#pragma once
#include "common.h"
#include "aclnn/aclnn.h"
#include <acl/acl.h>

std::string acl_error_to_string(aclError error);

namespace jittor
{

EXTERN_LIB uint64_t acl_jittor_tid;
EXTERN_LIB aclrtStream aclstream;
EXTERN_LIB void *workspaceAddr;

void mallocWorkSpace(uint64_t size);
void acl_jittor_op_compiler(string &filename, string &src, bool is_acl, string &extra_flags);
struct AclOpFunctions
{
    // for Unary and Nonzero
    std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncUnaryNonzero;
    // for Cast
    std::function<aclnnStatus(aclTensor *, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncCast;
    // for Binary
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncBinary;
    // for Add and Sub
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncAdd;
    // for Expand, permute, flip
    std::function<aclnnStatus(aclTensor *, aclIntArray *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncExpand;
    // for bmm and matmul
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, int8_t, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncMatmul;
    // for conv
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclIntArray *, int64_t, aclTensor *, int8_t, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncConv;
    // for reducesum, mean
    std::function<aclnnStatus(aclTensor *, aclIntArray *, bool, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncReduceSum;
    // for amax and amin
    std::function<aclnnStatus(aclTensor *, aclIntArray *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncAmax;
    // for conv backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclIntArray *, int, aclBoolArray *, int8_t, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncConvBackward;
    // for proddim
    std::function<aclnnStatus(aclTensor *, float, float, float, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncProdDim;
    // for select, where
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSelect;
    // for random_uniform and random_normal
    std::function<aclnnStatus(aclTensor *, int64_t, int64_t, int64_t, int64_t, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncRandom;
    // for maxpool
    std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncMaxPool;
    // for maxpool backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncMaxPoolBackward;
    // for avgpool
    std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, bool, int64_t, int8_t, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncAvgPool;
    // for avgpool backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, bool, int64_t, int8_t, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncAvgPoolBackward;
    // for concat
    std::function<aclnnStatus(aclTensorList *, uint64_t, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncConcat;
    // for gather
    std::function<aclnnStatus(aclTensor *, uint64_t, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncGather;
    // for cumsum
    std::function<aclnnStatus(aclTensor *, uint64_t, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncCumsum;
    // for scatter
    std::function<aclnnStatus(aclTensor *, uint64_t, aclTensor *, aclTensor *, uint64_t, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncScatter;
    // for index
    std::function<aclnnStatus(aclTensor *, aclTensorList *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncIndex;
    // for stridesliceassignv2
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncStridedSliceAssignV2;
    // for slicev2
    std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSliceV2;
    // for indexputimpl
    std::function<aclnnStatus(aclTensor *, aclTensorList *, aclTensor *, bool, bool, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncIndexPutImpl;
    // for range
    std::function<aclnnStatus(aclScalar *, aclScalar *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncRange;
    // for leaky_relu
    std::function<aclnnStatus(aclTensor *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncLeakyRelu;
    // for leaky_relu backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclScalar *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncLeakyReluBackward;
    // for dropout
    std::function<aclnnStatus(aclTensor *, double, bool, int64_t, int64_t, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncDropout;
    // for dropout backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, double, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncDropoutBackward;
    // for split with size
    std::function<aclnnStatus(aclTensor *, aclIntArray *, int64_t, aclTensorList *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSplitWithSize;

    // for silu
    // std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSilu;

    // for silu backward
    // std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSiluBackward;

    // for sigmoid
    // std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSigmoid;

    // for sigmoid backward
    // std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSigmoidBackward;

    // for embedding
    // std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncEmbedding;

    // for embedding backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t, uint64_t, bool, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncEmbeddingBackward;

    // for InplaceMaskedScatter MaskedSelect
    // std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncInplaceMaskedScatter;
    std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, aclrtStream)> executeFunc;

    // for flashattention
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *,
                              aclIntArray *, aclIntArray *, aclIntArray *, double, double, int64_t, int64_t, int64_t, char *, int64_t, int64_t, int64_t,
                              aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
        getWorkspaceSizeFuncFalshAttention;

    // for flashattention backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *,
                              aclIntArray *, aclIntArray *, aclIntArray *, double, double, int64_t, int64_t, int64_t, char *, int64_t, int64_t, int64_t,
                              aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
        getWorkspaceSizeFuncFalshAttentionBackward;

    // for batchnorm
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, bool, double, double, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncBatchNorm;

    // for batchnorm backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, bool, double, aclBoolArray *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncBatchNormBackward;

    // for layernorm
    std::function<aclnnStatus(aclTensor *, aclIntArray *, aclTensor *, aclTensor *, double, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncLayerNorm;

    // for ROPE
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, int64_t, uint64_t *, aclOpExecutor **)>
        getWorkspaceSizeFuncRotaryPosEmb;

    // default constructor
    AclOpFunctions() = default;

    // for Unary and Nonzero
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, aclrtStream)> execf)
        : getWorkspaceSizeFuncUnaryNonzero(gwsf), executeFunc(execf) {}

    // for Cast
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, aclrtStream)> execf)
        : getWorkspaceSizeFuncCast(gwsf), executeFunc(execf) {}

    // for Binary
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncBinary(gwsf), executeFunc(execf) {}

    // for Add and Sub
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncAdd(gwsf), executeFunc(execf) {}

    // for Expand, flip
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncExpand(gwsf), executeFunc(execf) {}

    // for Matmul
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, int8_t, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncMatmul(gwsf), executeFunc(execf) {}

    // for conv
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclIntArray *, int64_t, aclTensor *, int8_t, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncConv(gwsf), executeFunc(execf) {}

    // for reducesum, mean
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, bool, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncReduceSum(gwsf), executeFunc(execf) {}

    // for amax amin
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncAmax(gwsf), executeFunc(execf) {}

    // for conv backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclIntArray *, int, aclBoolArray *, int8_t, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncConvBackward(gwsf), executeFunc(execf) {}

    // for proddim
    AclOpFunctions(std::function<aclnnStatus(const aclTensor *, float, float, float, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncProdDim(gwsf), executeFunc(execf) {}

    // for select, where
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncSelect(gwsf), executeFunc(execf) {}

    // for random_normal
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, int64_t, int64_t, int64_t, int64_t, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncRandom(gwsf), executeFunc(execf) {}

    // for maxpool
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncMaxPool(gwsf), executeFunc(execf) {}

    // for maxpool backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncMaxPoolBackward(gwsf), executeFunc(execf) {}

    // for avgpool
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, bool, int64_t, int8_t, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncAvgPool(gwsf), executeFunc(execf) {}

    // for avgpool backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, bool, int64_t, int8_t, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncAvgPoolBackward(gwsf), executeFunc(execf) {}

    // for concat
    AclOpFunctions(std::function<aclnnStatus(aclTensorList *, int64_t, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncConcat(gwsf), executeFunc(execf) {}

    // for gather
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, int64_t, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncGather(gwsf), executeFunc(execf) {}

    // for cumsum
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, int64_t, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncCumsum(gwsf), executeFunc(execf) {}

    // for scatter
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, uint64_t, aclTensor *, aclTensor *, uint64_t, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncScatter(gwsf), executeFunc(execf) {}

    // for index
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensorList *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncIndex(gwsf), executeFunc(execf) {}

    // for stridesliceassignv2
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncStridedSliceAssignV2(gwsf), executeFunc(execf) {}

    // for slicev2
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncSliceV2(gwsf), executeFunc(execf) {}

    // for indexputimpl
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensorList *, aclTensor *, bool, bool, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncIndexPutImpl(gwsf), executeFunc(execf) {}

    // for range
    AclOpFunctions(std::function<aclnnStatus(aclScalar *, aclScalar *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncRange(gwsf), executeFunc(execf) {}

    // for leaky_relu
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncLeakyRelu(gwsf), executeFunc(execf) {}

    // for leaky_relu backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclScalar *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncLeakyReluBackward(gwsf), executeFunc(execf) {}

    // for dropout
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, double, bool, int64_t, int64_t, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncDropout(gwsf), executeFunc(execf) {}

    // for dropout backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, double, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncDropoutBackward(gwsf), executeFunc(execf) {}

    // for embedding backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t, uint64_t, bool, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncEmbeddingBackward(gwsf), executeFunc(execf) {}

    // for split with size
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, int64_t, aclTensorList *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncSplitWithSize(gwsf), executeFunc(execf) {}

    // for flash attention
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *,
                                             aclIntArray *, aclIntArray *, aclIntArray *, double, double, int64_t, int64_t, int64_t, char *, int64_t, int64_t, int64_t,
                                             aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
                       gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncFalshAttention(gwsf), executeFunc(execf) {}

    // for flash attention backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *,
                                             aclIntArray *, aclIntArray *, aclIntArray *, double, double, int64_t, int64_t, int64_t, char *, int64_t, int64_t, int64_t,
                                             aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
                       gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncFalshAttentionBackward(gwsf), executeFunc(execf) {}

    // for batchnorm
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, bool, double, double, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
                       gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncBatchNorm(gwsf), executeFunc(execf) {}

    // for batchnorm backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, bool, double, aclBoolArray *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
                       gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncBatchNormBackward(gwsf), executeFunc(execf) {}

    // for layernorm
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, aclTensor *, aclTensor *, double, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
                       gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncLayerNorm(gwsf), executeFunc(execf) {}

    // for ROPE
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, const aclTensor *, const aclTensor *, int64_t, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncRotaryPosEmb(gwsf), executeFunc(execf) {}
};
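The many AclOpFunctions constructors above differ only in the std::function type of their first argument, so brace-initializing an entry from a raw aclnn function pointer selects the matching slot via overload resolution. A minimal standalone sketch of that pattern (toy types, not ACL's):

#include <functional>
#include <iostream>

struct Fns {
    std::function<int(int)> unary;
    std::function<int(int, int)> binary;
    // std::function's converting constructor is SFINAE-constrained, so only
    // the constructor whose signature matches the pointer is viable.
    Fns(std::function<int(int)> f) : unary(f) {}
    Fns(std::function<int(int, int)> f) : binary(f) {}
};

int neg(int a) { return -a; }
int add(int a, int b) { return a + b; }

int main() {
    Fns u(neg);  // picks the unary constructor
    Fns b(add);  // picks the binary constructor
    std::cout << u.unary(3) << " " << b.binary(1, 2) << "\n";  // prints: -3 3
    return 0;
}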
static std::unordered_map<std::string, AclOpFunctions> aclOpFuncMap = {
    {"Abs", AclOpFunctions(aclnnAbsGetWorkspaceSize, aclnnAbs)},
    {"Exp", AclOpFunctions(aclnnExpGetWorkspaceSize, aclnnExp)},
    {"Log", AclOpFunctions(aclnnLogGetWorkspaceSize, aclnnLog)},
    {"Sqrt", AclOpFunctions(aclnnSqrtGetWorkspaceSize, aclnnSqrt)},
    {"Ceil", AclOpFunctions(aclnnCeilGetWorkspaceSize, aclnnCeil)},
    {"Floor", AclOpFunctions(aclnnFloorGetWorkspaceSize, aclnnFloor)},
    {"Round", AclOpFunctions(aclnnRoundGetWorkspaceSize, aclnnRound)},
    {"Sin", AclOpFunctions(aclnnSinGetWorkspaceSize, aclnnSin)},
    {"Cos", AclOpFunctions(aclnnCosGetWorkspaceSize, aclnnCos)},
    {"Tan", AclOpFunctions(aclnnTanGetWorkspaceSize, aclnnTan)},
    {"Asin", AclOpFunctions(aclnnAsinGetWorkspaceSize, aclnnAsin)},
    {"Acos", AclOpFunctions(aclnnAcosGetWorkspaceSize, aclnnAcos)},
    {"Atan", AclOpFunctions(aclnnAtanGetWorkspaceSize, aclnnAtan)},
    {"Sinh", AclOpFunctions(aclnnSinhGetWorkspaceSize, aclnnSinh)},
    {"Cosh", AclOpFunctions(aclnnCoshGetWorkspaceSize, aclnnCosh)},
    {"Tanh", AclOpFunctions(aclnnTanhGetWorkspaceSize, aclnnTanh)},
    {"Asinh", AclOpFunctions(aclnnAsinhGetWorkspaceSize, aclnnAsinh)},
    {"Acosh", AclOpFunctions(aclnnAcoshGetWorkspaceSize, aclnnAcosh)},
    {"Atanh", AclOpFunctions(aclnnAtanhGetWorkspaceSize, aclnnAtanh)},
    {"Sigmoid", AclOpFunctions(aclnnSigmoidGetWorkspaceSize, aclnnSigmoid)},
    {"Erf", AclOpFunctions(aclnnErfGetWorkspaceSize, aclnnErf)},
    {"Erfinv", AclOpFunctions(aclnnErfinvGetWorkspaceSize, aclnnErfinv)},
    {"LogicalNot", AclOpFunctions(aclnnLogicalNotGetWorkspaceSize, aclnnLogicalNot)},
    {"BitwiseNot", AclOpFunctions(aclnnBitwiseNotGetWorkspaceSize, aclnnBitwiseNot)},
    {"Neg", AclOpFunctions(aclnnNegGetWorkspaceSize, aclnnNeg)},
    {"Cast", AclOpFunctions(aclnnCastGetWorkspaceSize, aclnnCast)},
    {"Maximum", AclOpFunctions(aclnnMaximumGetWorkspaceSize, aclnnMaximum)},
    {"Minimum", AclOpFunctions(aclnnMinimumGetWorkspaceSize, aclnnMinimum)},
    {"Add", AclOpFunctions(aclnnAddGetWorkspaceSize, aclnnAdd)},
    {"Sub", AclOpFunctions(aclnnSubGetWorkspaceSize, aclnnSub)},
    {"Mul", AclOpFunctions(aclnnMulGetWorkspaceSize, aclnnMul)},
    {"RealDiv", AclOpFunctions(aclnnDivGetWorkspaceSize, aclnnDiv)},
    {"FloorDiv", AclOpFunctions(aclnnFloorDivideGetWorkspaceSize, aclnnFloorDivide)},
    {"LessEqual", AclOpFunctions(aclnnLeTensorGetWorkspaceSize, aclnnLeTensor)},
    {"Less", AclOpFunctions(aclnnLtTensorGetWorkspaceSize, aclnnLtTensor)},
    {"GreaterEqual", AclOpFunctions(aclnnGeTensorGetWorkspaceSize, aclnnGeTensor)},
    {"Greater", AclOpFunctions(aclnnGtTensorGetWorkspaceSize, aclnnGtTensor)},
    {"Equal", AclOpFunctions(aclnnEqTensorGetWorkspaceSize, aclnnEqTensor)},
    {"NotEqual", AclOpFunctions(aclnnNeTensorGetWorkspaceSize, aclnnNeTensor)},
    {"LogicalAnd", AclOpFunctions(aclnnLogicalAndGetWorkspaceSize, aclnnLogicalAnd)},
    {"LogicalOr", AclOpFunctions(aclnnLogicalOrGetWorkspaceSize, aclnnLogicalOr)},
    {"LogicalXor", AclOpFunctions(aclnnLogicalXorGetWorkspaceSize, aclnnLogicalXor)},
    {"BitwiseAnd", AclOpFunctions(aclnnBitwiseAndTensorGetWorkspaceSize, aclnnBitwiseAndTensor)},
    {"BitwiseOr", AclOpFunctions(aclnnBitwiseOrTensorGetWorkspaceSize, aclnnBitwiseOrTensor)},
    {"BitwiseXor", AclOpFunctions(aclnnBitwiseXorTensorGetWorkspaceSize, aclnnBitwiseXorTensor)},
    {"Pow", AclOpFunctions(aclnnPowTensorTensorGetWorkspaceSize, aclnnPowTensorTensor)},
    {"Expand", AclOpFunctions(aclnnExpandGetWorkspaceSize, aclnnExpand)},
    {"MatMul", AclOpFunctions(aclnnMatmulGetWorkspaceSize, aclnnMatmul)},
    {"BatchMatMul", AclOpFunctions(aclnnBatchMatMulGetWorkspaceSize, aclnnBatchMatMul)},
    {"ReduceMax", AclOpFunctions(aclnnAmaxGetWorkspaceSize, aclnnAmax)},
    {"ReduceMin", AclOpFunctions(aclnnAminGetWorkspaceSize, aclnnAmin)},
    {"ReduceSum", AclOpFunctions(aclnnReduceSumGetWorkspaceSize, aclnnReduceSum)},
    {"Triu", AclOpFunctions(aclnnTriuGetWorkspaceSize, aclnnTriu)},
    {"Conv2d", AclOpFunctions(aclnnConvolutionGetWorkspaceSize, aclnnConvolution)},
    {"Conv2dBackward", AclOpFunctions(aclnnConvolutionBackwardGetWorkspaceSize, aclnnConvolutionBackward)},
    {"ReduceMean", AclOpFunctions(aclnnMeanGetWorkspaceSize, aclnnMean)},
    // {"ReduceProd", AclOpFunctions(aclnnProdDimGetWorkspaceSize, aclnnProdDim)},
    {"Select", AclOpFunctions(aclnnSWhereGetWorkspaceSize, aclnnSWhere)},
    {"RandomUniform", AclOpFunctions(aclnnInplaceUniformGetWorkspaceSize, aclnnInplaceUniform)},
    {"RandomNormal", AclOpFunctions(aclnnInplaceNormalGetWorkspaceSize, aclnnInplaceNormal)},
    {"Transpose", AclOpFunctions(aclnnPermuteGetWorkspaceSize, aclnnPermute)},
    {"Maxpool", AclOpFunctions(aclnnMaxPool2dWithIndicesGetWorkspaceSize, aclnnMaxPool2dWithIndices)},
    {"MaxpoolBackward", AclOpFunctions(aclnnMaxPool2dWithIndicesBackwardGetWorkspaceSize, aclnnMaxPool2dWithIndicesBackward)},
    {"Avgpool", AclOpFunctions(aclnnAvgPool2dGetWorkspaceSize, aclnnAvgPool2d)},
    {"AvgpoolBackward", AclOpFunctions(aclnnAvgPool2dBackwardGetWorkspaceSize, aclnnAvgPool2dBackward)},
    {"Flip", AclOpFunctions(aclnnFlipGetWorkspaceSize, aclnnFlip)},
    {"Concat", AclOpFunctions(aclnnCatGetWorkspaceSize, aclnnCat)},
    {"Gather", AclOpFunctions(aclnnGatherGetWorkspaceSize, aclnnGather)},
    {"Cumsum", AclOpFunctions(aclnnCumsumGetWorkspaceSize, aclnnCumsum)},
    {"Index", AclOpFunctions(aclnnIndexGetWorkspaceSize, aclnnIndex)},
    {"Scatter", AclOpFunctions(aclnnScatterGetWorkspaceSize, aclnnScatter)},
    {"Nonzero", AclOpFunctions(aclnnNonzeroGetWorkspaceSize, aclnnNonzero)},
    {"Where", AclOpFunctions(aclnnSWhereGetWorkspaceSize, aclnnSWhere)},
    {"StridedSliceAssignV2", AclOpFunctions(aclnnStridedSliceAssignV2GetWorkspaceSize, aclnnStridedSliceAssignV2)},
    {"SliceV2", AclOpFunctions(aclnnSliceV2GetWorkspaceSize, aclnnSliceV2)},
    {"IndexPutImpl", AclOpFunctions(aclnnIndexPutImplGetWorkspaceSize, aclnnIndexPutImpl)},
    {"IndexPutImplAccumulate", AclOpFunctions(aclnnIndexPutImplGetWorkspaceSize, aclnnIndexPutImpl)},
    {"Range", AclOpFunctions(aclnnRangeGetWorkspaceSize, aclnnRange)},
    {"ReLU", AclOpFunctions(aclnnReluGetWorkspaceSize, aclnnRelu)},
    {"LeakyReLU", AclOpFunctions(aclnnLeakyReluGetWorkspaceSize, aclnnLeakyRelu)},
    {"LeakyReLUBackward", AclOpFunctions(aclnnLeakyReluBackwardGetWorkspaceSize, aclnnLeakyReluBackward)},
    {"Dropout", AclOpFunctions(aclnnDropoutGetWorkspaceSize, aclnnDropout)},
    {"DropoutBackward", AclOpFunctions(aclnnDropoutBackwardGetWorkspaceSize, aclnnDropoutBackward)},
    {"SiLU", AclOpFunctions(aclnnSiluGetWorkspaceSize, aclnnSilu)},
    {"SiLUBackward", AclOpFunctions(aclnnSiluBackwardGetWorkspaceSize, aclnnSiluBackward)},
    {"SigmoidBackward", AclOpFunctions(aclnnSigmoidBackwardGetWorkspaceSize, aclnnSigmoidBackward)},
    {"Embedding", AclOpFunctions(aclnnEmbeddingGetWorkspaceSize, aclnnEmbedding)},
    {"EmbeddingBackward", AclOpFunctions(aclnnEmbeddingDenseBackwardGetWorkspaceSize, aclnnEmbeddingDenseBackward)},
    {"InplaceMaskedScatter", AclOpFunctions(aclnnInplaceMaskedScatterGetWorkspaceSize, aclnnInplaceMaskedScatter)},
    {"MaskedSelect", AclOpFunctions(aclnnMaskedSelectGetWorkspaceSize, aclnnMaskedSelect)},
    {"SplitWithSize", AclOpFunctions(aclnnSplitWithSizeGetWorkspaceSize, aclnnSplitWithSize)},
    {"Softmax", AclOpFunctions(aclnnSoftmaxGetWorkspaceSize, aclnnSoftmax)},
    {"SoftmaxBackward", AclOpFunctions(aclnnSoftmaxBackwardGetWorkspaceSize, aclnnSoftmaxBackward)},
    {"FlashAttention", AclOpFunctions(aclnnFlashAttentionScoreV2GetWorkspaceSize, aclnnFlashAttentionScoreV2)},
    {"FlashAttentionBackward", AclOpFunctions(aclnnFlashAttentionScoreGradV2GetWorkspaceSize, aclnnFlashAttentionScoreGradV2)},
    {"BatchNorm", AclOpFunctions(aclnnBatchNormGetWorkspaceSize, aclnnBatchNorm)},
    {"BatchNormBackward", AclOpFunctions(aclnnBatchNormBackwardGetWorkspaceSize, aclnnBatchNormBackward)},
    {"LayerNorm", AclOpFunctions(aclnnLayerNormGetWorkspaceSize, aclnnLayerNorm)},
    {"RotaryPosEmb", AclOpFunctions(aclnnApplyRotaryPosEmbGetWorkspaceSize, aclnnApplyRotaryPosEmb)},
|
||||
{"Stack", AclOpFunctions(aclnnStackGetWorkspaceSize, aclnnStack)},
|
||||
{"NanToNum", AclOpFunctions(aclnnNanToNumGetWorkspaceSize, aclnnNanToNum)},
|
||||
};
|
||||
|
||||
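// Each AclOpFunctions entry pairs an op's aclnnXxxGetWorkspaceSize query with
// its matching aclnnXxx execute call; BaseOpRunner::run() (see base_op.cc
// below) resolves a runner's `name` against this map and hands the iterator
// to executeOp().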
struct AclOpAttr
{
    virtual ~AclOpAttr() {}
};

struct ConvAttr : AclOpAttr
{
    vector<int64_t> convStrides;
    vector<int64_t> convPads;
    vector<int64_t> convOutPads;
    vector<int64_t> convDilations;
    bool convWithBias;
    bool is_transposed;
    int64_t group;

    // Destructor
    ~ConvAttr()
    {
        convStrides.clear();
        convPads.clear();
        convOutPads.clear();
        convDilations.clear();
    }
};

struct ReduceAttr : AclOpAttr
{
    vector<int64_t> axes;
    // for ProdDim
    int64_t prod_dim;
    bool keepdims;

    ~ReduceAttr()
    {
        axes.clear();
    }
};

struct RandomAttr : AclOpAttr
{
    int64_t seed, offset;

    ~RandomAttr() {}
};

struct TriuAttr : AclOpAttr
{
    int64_t diagonal;

    ~TriuAttr() {}
};

struct PoolAttr : AclOpAttr
{
    vector<int64_t> kernel_size;
    vector<int64_t> poolStrides;
    vector<int64_t> poolPads;
    vector<int64_t> poolDilations;
    bool poolCeil;
    bool countIncludePad;

    // divisorOverride (const int64_t, compute input): the divisor used when
    // averaging. Supported data type: INT64. The default value 0 disables the
    // feature.
    // https://www.hiascend.com/document/detail/zh/canncommercial/80RC2/apiref/appdevgapi/context/aclnnAvgPool2d.md
    int64_t divisorOverride = 0;

    // cubeMathType (int8_t, compute input): host-side integer selecting the
    // compute mode of the Cube unit (INT8). Unless noted otherwise,
    // computation keeps the original input dtype. Supported values:
    // 0: KEEP_DTYPE, compute in the input dtype. For FLOAT input this is not
    //    yet supported on Atlas training series and Atlas inference series
    //    (Ascend 310P) products and raises an error.
    // 1: ALLOW_FP32_DOWN_PRECISION, allow reduced-precision computation. For
    //    FLOAT input, Atlas training/inference series (Ascend 310P) products
    //    may compute in FLOAT16.
    // 2: USE_FP16, allow casting to FLOAT16; FLOAT input is computed as
    //    FLOAT16.
    // 3: USE_HF32, allow casting to HFLOAT32. For FLOAT input this is not yet
    //    supported on Atlas training series, Atlas inference series
    //    (Ascend 310P), or Atlas A2 training / Atlas 800I A2 inference
    //    products, and raises an error.
    // https://www.hiascend.com/document/detail/zh/canncommercial/80RC2/apiref/appdevgapi/context/aclnnAvgPool2d.md
    int8_t cubeMathType = 0;

    // Destructor
    ~PoolAttr()
    {
        kernel_size.clear();
        poolStrides.clear();
        poolPads.clear();
        poolDilations.clear();
    }
};

struct ConcatAttr : AclOpAttr
{
    int64_t tensorNum;
    int64_t dim;

    ~ConcatAttr() {}
};

struct GatherAttr : AclOpAttr
{
    int64_t dim;

    ~GatherAttr() {}
};

struct ScatterAttr : AclOpAttr
{
    int64_t axis;
    int64_t reduction;

    ~ScatterAttr() {}
};

struct StrideAttr : AclOpAttr
{
    vector<int64_t> begins;
    vector<int64_t> ends;
    vector<int64_t> steps;
    vector<int64_t> axes;

    ~StrideAttr()
    {
        begins.clear();
        ends.clear();
        steps.clear();
        axes.clear();
    }
};

struct RangeAttr : AclOpAttr
{
    int64_t start;
    int64_t end;
    int64_t step;

    ~RangeAttr() {}
};

struct LeakyReluAttr : AclOpAttr
{
    float negativeSlope;
    bool selfIsResult;

    ~LeakyReluAttr() {}
};

struct DropoutAttr : AclOpAttr
{
    float p;
    bool train;
    int64_t seed;
    int64_t offset;
    float scale;

    ~DropoutAttr() {}
};

struct EmbeddingAttr : AclOpAttr
{
    int64_t numEmbeddings;
    // int64_t embeddingDim;
    int64_t paddingIdx;
    bool scaleGradByFreq;
    // bool sparse;
    // bool isSparse;
    // bool isDense;

    ~EmbeddingAttr() {}
};

struct SplitWithSizeAttr : AclOpAttr
{
    vector<int64_t> splitSize;
    int64_t dim;

    ~SplitWithSizeAttr()
    {
        splitSize.clear();
    }
};

struct SoftmaxAttr : AclOpAttr
{
    int64_t dim;

    ~SoftmaxAttr() {}
};

struct BatchNormAttr : AclOpAttr
{
    bool is_train;
    float momentum;
    float eps;

    ~BatchNormAttr() {}
};

struct LayerNormAttr : AclOpAttr
{
    float eps;
    vector<int64_t> normalizedShape;
    int64_t size;

    ~LayerNormAttr()
    {
        normalizedShape.clear();
    }
};

struct FlashAttentionAttr : AclOpAttr
{
    vector<int64_t> prefix;
    vector<int64_t> qStartIdx;
    vector<int64_t> kvStartIdx;
    float scale;
    float keepProb;
    int64_t preToken;
    int64_t nextToken;
    int64_t headNum;
    string inputLayout;
    int64_t innerPrecise;
    int64_t sparseMode;
    int64_t psetype;
    bool hasRealshift;
    bool hasDropmask;
    bool hasPaddingmask;
    bool hasAttentmask;

    ~FlashAttentionAttr()
    {
        prefix.clear();
        qStartIdx.clear();
        kvStartIdx.clear();
    }
};

struct NanToNumAttr : AclOpAttr
{
    float nan;
    float posinf;
    float neginf;

    ~NanToNumAttr() {}
};
}
File diff suppressed because it is too large
@ -0,0 +1,58 @@
#include <iostream>
#include <vector>
#include "aclnn.h"

int64_t GetShapeSize(const std::vector<int64_t>& shape) {
    int64_t shapeSize = 1;
    for (auto i : shape) {
        shapeSize *= i;
    }
    return shapeSize;
}

// Copies a device buffer back to the host and prints it; assumes the buffer
// holds `int` elements.
void PrintOutResult(std::vector<int64_t> &shape, void** deviceAddr) {
    auto size = GetShapeSize(shape);
    std::vector<int> resultData(size, 0);
    auto ret = aclrtMemcpy(resultData.data(), resultData.size() * sizeof(resultData[0]),
                           *deviceAddr, size * sizeof(resultData[0]), ACL_MEMCPY_DEVICE_TO_HOST);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("copy result from device to host failed. ERROR: %d\n", ret); return);
    for (int64_t i = 0; i < size; i++) {
        LOG_PRINT("mean result[%ld] is: %d\n", i, resultData[i]);
    }
}

/*int Init(int32_t deviceId) {
    // Boilerplate AscendCL initialization
    auto ret = aclInit(nullptr);
    CHECK_RET(ret == ACL_SUCCESS or ret == 100002, LOG_PRINT("aclInit failed. ERROR: %d\n", ret); return ret);
    ret = aclrtSetDevice(deviceId);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret); return ret);
    //ret = aclrtCreateStream(stream);
    //CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtCreateStream failed. ERROR: %d\n", ret); return ret);
    return 0;
}*/

/*
template <typename T>
int CreateAclTensor(const std::vector<T>& hostData, const std::vector<int64_t>& shape, void** deviceAddr,
                    aclDataType dataType, aclTensor** tensor) {
    auto size = GetShapeSize(shape) * sizeof(T);
    // Allocate device-side memory with aclrtMalloc
    auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret); return ret);
    // Copy host-side data to device memory with aclrtMemcpy
    ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size, ACL_MEMCPY_HOST_TO_DEVICE);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMemcpy failed. ERROR: %d\n", ret); return ret);

    // Compute strides for a contiguous tensor
    std::vector<int64_t> strides(shape.size(), 1);
    for (int64_t i = shape.size() - 2; i >= 0; i--) {
        strides[i] = shape[i + 1] * strides[i + 1];
    }

    // Create the aclTensor via aclCreateTensor
    *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
                              shape.data(), shape.size(), *deviceAddr);
    return 0;
}*/
@ -0,0 +1,134 @@
#include <iostream>
#include <vector>
#include "acl.h"
// unary
#include "aclnnop/aclnn_abs.h"
#include "aclnnop/aclnn_neg.h"
#include "aclnnop/aclnn_exp.h"
#include "aclnnop/aclnn_log.h"
#include "aclnnop/aclnn_sqrt.h"
#include "aclnnop/aclnn_ceil.h"
#include "aclnnop/aclnn_floor.h"
#include "aclnnop/aclnn_round.h"
#include "aclnnop/aclnn_sin.h"
#include "aclnnop/aclnn_cos.h"
#include "aclnnop/aclnn_tan.h"
#include "aclnnop/aclnn_asin.h"
#include "aclnnop/aclnn_acos.h"
#include "aclnnop/aclnn_atan.h"
#include "aclnnop/aclnn_sinh.h"
#include "aclnnop/aclnn_cosh.h"
#include "aclnnop/aclnn_tanh.h"
#include "aclnnop/aclnn_asinh.h"
#include "aclnnop/aclnn_acosh.h"
#include "aclnnop/aclnn_atanh.h"
#include "aclnnop/aclnn_sigmoid.h"
#include "aclnnop/aclnn_erf.h"
#include "aclnnop/aclnn_erfinv.h"
#include "aclnnop/aclnn_logical_not.h"
#include "aclnnop/aclnn_bitwise_not.h"
#include "aclnnop/aclnn_cast.h"
#include "aclnnop/aclnn_nonzero.h"
// binary
#include "aclnnop/aclnn_maximum.h"
#include "aclnnop/aclnn_minimum.h"
#include "aclnnop/aclnn_add.h"
#include "aclnnop/aclnn_sub.h"
#include "aclnnop/aclnn_mul.h"
#include "aclnnop/aclnn_div.h"
#include "aclnnop/aclnn_floor_divide.h"
#include "aclnnop/aclnn_le_tensor.h"
#include "aclnnop/aclnn_lt_tensor.h"
#include "aclnnop/aclnn_ge_tensor.h"
#include "aclnnop/aclnn_gt_tensor.h"
#include "aclnnop/aclnn_eq_tensor.h"
#include "aclnnop/aclnn_ne_tensor.h"
#include "aclnnop/aclnn_logical_and.h"
#include "aclnnop/aclnn_logical_or.h"
#include "aclnnop/aclnn_logical_xor.h"
#include "aclnnop/aclnn_bitwise_and_tensor.h"
#include "aclnnop/aclnn_bitwise_or_tensor.h"
#include "aclnnop/aclnn_bitwise_xor_tensor.h"
#include "aclnnop/aclnn_pow_tensor_tensor.h"
#include "aclnnop/aclnn_expand.h"
#include "aclnnop/aclnn_matmul.h"
#include "aclnnop/aclnn_batch_matmul.h"
#include "aclnnop/aclnn_convolution.h"
#include "aclnnop/aclnn_convolution_backward.h"
#include "aclnnop/aclnn_reduce_sum.h"
#include "aclnnop/aclnn_amax.h"
#include "aclnnop/aclnn_amin.h"
#include "aclnnop/aclnn_mean.h"
#include "aclnnop/aclnn_prod.h"
#include "aclnnop/aclnn_triu.h"
#include "aclnnop/aclnn_s_where.h"
#include "aclnnop/aclnn_random.h"
#include "aclnnop/aclnn_normal.h"
#include "aclnnop/aclnn_permute.h"
#include "aclnnop/aclnn_max_pool2d_with_indices.h"
#include "aclnnop/aclnn_max_pool2d_with_indices_backward.h"
#include "aclnnop/aclnn_avgpool2d.h"
#include "aclnnop/aclnn_avgpool2d_backward.h"
#include "aclnnop/aclnn_flip.h"
#include "aclnnop/aclnn_cat.h"
#include "aclnnop/aclnn_gather.h"
#include "aclnnop/aclnn_cumsum.h"
#include "aclnnop/aclnn_index.h"
#include "aclnnop/aclnn_scatter.h"
#include "aclnnop/aclnn_strided_slice_assign_v2.h"
#include "aclnnop/aclnn_slice_v2.h"
#include "aclnnop/aclnn_index_put_impl.h"
#include "aclnnop/aclnn_range.h"
#include "aclnnop/aclnn_relu.h"
#include "aclnnop/aclnn_dropout.h"
#include "aclnnop/aclnn_dropout_backward.h"
#include "aclnnop/aclnn_leaky_relu.h"
#include "aclnnop/aclnn_leaky_relu_backward.h"
#include "aclnnop/aclnn_uniform.h"
#include "aclnnop/aclnn_silu.h"
#include "aclnnop/aclnn_silu_backward.h"
#include "aclnnop/aclnn_sigmoid_backward.h"
#include "aclnnop/aclnn_embedding.h"
#include "aclnnop/aclnn_embedding_dense_backward.h"
#include "aclnnop/aclnn_masked_scatter.h"
#include "aclnnop/aclnn_masked_select.h"
#include "aclnnop/aclnn_split_with_size.h"
#include "aclnnop/aclnn_flash_attention_score.h"
#include "aclnnop/aclnn_flash_attention_score_grad.h"
#include "aclnnop/aclnn_softmax.h"
#include "aclnnop/aclnn_softmax_backward.h"
#include "aclnnop/aclnn_batch_norm.h"
#include "aclnnop/aclnn_batch_norm_backward.h"
#include "aclnnop/aclnn_layer_norm.h"
#include "aclnnop/aclnn_apply_rotary_pos_emb.h"
#include "aclnnop/aclnn_stack.h"
#include "aclnnop/aclnn_nan_to_num.h"

#define CHECK_RET(cond, return_expr) \
    do                               \
    {                                \
        if (!(cond))                 \
        {                            \
            return_expr;             \
        }                            \
    } while (0)

#define LOG_PRINT(message, ...)         \
    do                                  \
    {                                   \
        printf(message, ##__VA_ARGS__); \
    } while (0)
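// Illustrative usage (not part of the source): the second argument of
// CHECK_RET may be any statement sequence, typically a log plus an early
// return, e.g.
//   ret = aclrtMalloc(&addr, size, ACL_MEM_MALLOC_HUGE_FIRST);
//   CHECK_RET(ret == ACL_SUCCESS,
//             LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret); return ret);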
int64_t GetShapeSize(const std::vector<int64_t> &shape);

void PrintOutResult(std::vector<int64_t> &shape, void **deviceAddr);

//int Init(int32_t deviceId);

/*
template <typename T>
int CreateAclTensor(const std::vector<T>& hostData, const std::vector<int64_t>& shape, void** deviceAddr,
                    aclDataType dataType, aclTensor** tensor);
*/
@ -0,0 +1,33 @@
#pragma once
#include <acl/aclops/binary_op_acl.h>
#include <acl/aclops/unary_op_acl.h>
#include <acl/aclops/conv_op_acl.h>
#include <acl/aclops/ternary_op_acl.h>
#include <acl/aclops/reduce_op_acl.h>
#include <acl/aclops/expand_op_acl.h>
#include <acl/aclops/getitem_op_acl.h>
#include <acl/aclops/setitem_op_acl.h>
#include <acl/aclops/matmul_op_acl.h>
#include <acl/aclops/random_op_acl.h>
#include <acl/aclops/bmm_op_acl.h>
#include <acl/aclops/pool_op_acl.h>
#include <acl/aclops/flip_op_acl.h>
#include <acl/aclops/concat_op_acl.h>
#include <acl/aclops/gather_scatter_op_acl.h>
#include <acl/aclops/cumsum_op_acl.h>
#include <acl/aclops/index_op_acl.h>
#include <acl/aclops/where_op_acl.h>
#include <acl/aclops/floor_op_acl.h>
#include <acl/aclops/transpose_op_acl.h>
#include <acl/aclops/flashattention_op_acl.h>
#include <acl/aclops/relu_op_acl.h>
#include <acl/aclops/dropout_op_acl.h>
#include <acl/aclops/silu_op_acl.h>
#include <acl/aclops/sigmoid_op_acl.h>
#include <acl/aclops/softmax_op_acl.h>
#include <acl/aclops/stack_op_acl.h>
#include <acl/aclops/nantonum_op_acl.h>
#include <acl/aclops/rope_op_acl.h>
#include <acl/aclops/triu_op_acl.h>
#include <acl/aclops/embedding_op_acl.h>
#include <acl/aclops/norms_op_acl.h>
@ -0,0 +1,56 @@
#pragma once
#include "utils.h"
#include "acl_jittor.h"

namespace jittor
{
    extern int sync_run;
    class BaseOpRunner
    {
    protected:
        vector<Var *> in_;
        vector<Var *> out_;

        int ret = -1;
        uint64_t workspaceSize = 0;
        aclOpExecutor *executor;
        bool is_group_op = false;

        std::vector<std::vector<int64_t>> inputShapes;
        std::vector<std::vector<int64_t>> outputShapes;

        std::vector<aclTensor *> inputTensors;
        std::vector<aclTensor *> outputTensors;

    public:
        string name;
        string jt_name;
        std::unique_ptr<AclOpAttr> op_attr;
        bool use_nchw = false;

        BaseOpRunner(const string &name = "") : name(name) {}
        virtual ~BaseOpRunner() = default;

        // Common functionality for adding input/output variables
        void add(Var *v, bool is_input);

        virtual void setupInputDesc();

        void cleanupDesc();

        virtual void setupOutputDesc();

        virtual void syncRun();

        void checkRet(aclnnStatus ret);

        // Base run method with common operator lookup logic
        void run();

    protected:
        // Virtual method for specific operator execution
        virtual void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) = 0;
        void cleanupAttr();
    };

}
@ -0,0 +1,152 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "binary_op_acl.h"
#include "base_op.h"

namespace jittor
{
    extern int sync_run;
    // Common functionality for adding input/output variables
    void BaseOpRunner::add(Var *v, bool is_input)
    {
        if (is_input)
        {
            in_.push_back(v);
        }
        else
        {
            out_.push_back(v);
        }
        return;
    }

    void BaseOpRunner::setupInputDesc()
    {
        auto input_num = in_.size();
        for (int input_idx = 0; input_idx < input_num; input_idx++)
        {
            std::vector<int64_t> shape;
            for (int j = 0; j < in_[input_idx]->shape.size(); j++)
            {
                shape.push_back(in_[input_idx]->shape[j]);
            }
            inputShapes.push_back(shape);
        }

        for (int idx = 0; idx < input_num; idx++)
        {
            inputTensors.push_back(nullptr);
            auto ret = CreateAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
            CHECK_RET(ret == ACL_SUCCESS, return);
        }
    }

    void BaseOpRunner::cleanupDesc()
    {
        auto input_num = in_.size();
        auto output_num = out_.size();
        for (int idx = 0; idx < input_num; idx++)
        {
            aclDestroyTensor(inputTensors[idx]);
        }
        for (int idx = 0; idx < output_num; idx++)
        {
            aclDestroyTensor(outputTensors[idx]);
        }
    }

    void BaseOpRunner::setupOutputDesc()
    {
        auto output_num = out_.size();

        for (int output_idx = 0; output_idx < output_num; output_idx++)
        {
            std::vector<int64_t> shape;
            for (int j = 0; j < out_[output_idx]->shape.size(); j++)
            {
                shape.push_back(out_[output_idx]->shape[j]);
            }
            outputShapes.push_back(shape);
        }

        for (int idx = 0; idx < output_num; idx++)
        {
            outputTensors.push_back(nullptr);
            auto ret = CreateAclTensor(outputShapes[idx], out_[idx]->mem_ptr, out_[idx]->size, get_dtype(out_[idx]->dtype()), &outputTensors[idx], use_nchw);
            CHECK_RET(ret == ACL_SUCCESS, return);
        }
    }

    void BaseOpRunner::syncRun()
    {
        if (sync_run)
        {
            // ret = aclrtSynchronizeStream(aclstream);
            // CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclrtSynchronizeStream failed. ERROR: %d\n", name.c_str(), ret); return);
        }
    }

    void BaseOpRunner::checkRet(aclnnStatus ret)
    {
        if (ret != ACL_SUCCESS)
        {
            auto tmp_err_msg = aclGetRecentErrMsg();
            LOGir << name << ", " << tmp_err_msg;
        }

        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnxxxGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
    }
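    // Every aclnn op follows the same two-phase protocol: aclnnXxxGetWorkspaceSize()
    // validates the arguments and reports the scratch memory needed, then aclnnXxx()
    // launches the kernel on the stream with that workspace. run() wraps a
    // subclass's executeOp(), which performs both phases, with the shared tensor
    // setup and teardown.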
    // Base run method with common operator lookup logic
    void BaseOpRunner::run()
    {
        auto it = aclOpFuncMap.find(name);
        if (is_group_op && it == aclOpFuncMap.end())
        {
            LOGir << "aclOpFuncMap Not supported op: " << name;
            throw std::runtime_error("Unsupported operation type.");
        }
        // Non-group runners (e.g. BatchMatMulOpRunner) call the aclnn API
        // directly in executeOp and may ignore `it`, so the lookup is not
        // checked for them.
        setupInputDesc();
        setupOutputDesc();
        executeOp(it);
        cleanupDesc();
    }

}
@ -0,0 +1,124 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "binary_op_acl.h"

namespace jittor
{
    BinaryOpRunner::BinaryOpRunner() : BaseOpRunner("binary")
    {
        use_nchw = false;
        is_group_op = true;
    }

    void BinaryOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        aclScalar *alpha = nullptr;

        // aclnnAdd computes self + alpha * other and aclnnSub computes
        // self - alpha * other; alpha is fixed to 1 and must be created in
        // the input dtype.
        if (name == string("Add") || name == string("Sub"))
        {
            if (get_dtype(in_[0]->dtype()) == ACL_FLOAT)
            {
                float alphaValue = 1.0;
                alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
            }
            else if (get_dtype(in_[0]->dtype()) == ACL_FLOAT16)
            {
                __fp16 alphaValue = 1.0;
                alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
            }
            else if (get_dtype(in_[0]->dtype()) == ACL_INT64)
            {
                int64_t alphaValue = 1;
                alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
            }
            else if (get_dtype(in_[0]->dtype()) == ACL_INT32)
            {
                int alphaValue = 1;
                alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
            }
            else if (get_dtype(in_[0]->dtype()) == ACL_INT8)
            {
                int8_t alphaValue = 1;
                alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
            }
            else if (get_dtype(in_[0]->dtype()) == ACL_INT16)
            {
                int16_t alphaValue = 1;
                alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
            }
            else if (get_dtype(in_[0]->dtype()) == ACL_UINT8)
            {
                uint8_t alphaValue = 1;
                alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
            }
            else if (get_dtype(in_[0]->dtype()) == ACL_UINT16)
            {
                uint16_t alphaValue = 1;
                alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
            }
            else if (get_dtype(in_[0]->dtype()) == ACL_UINT32)
            {
                uint32_t alphaValue = 1;
                alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
            }
            else if (get_dtype(in_[0]->dtype()) == ACL_BOOL)
            {
                bool alphaValue = true;
                alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
            }
            else
            {
                LOGf << "Not supported dtype: " << in_[0]->dtype();
            }

            CHECK_RET(alpha != nullptr, return);
            ret = it->second.getWorkspaceSizeFuncAdd(inputTensors[0], inputTensors[1], alpha, outputTensors[0], &workspaceSize, &executor);
        }
        else
        {
            ret = it->second.getWorkspaceSizeFuncBinary(inputTensors[0], inputTensors[1], outputTensors[0], &workspaceSize, &executor);
        }

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = it->second.executeFunc(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnxxx failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyScalar(alpha);
        return;
    }
}
@ -0,0 +1,14 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    struct BinaryOpRunner : public BaseOpRunner
    {
        BinaryOpRunner();

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
    };
}
@ -0,0 +1,128 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def acl_cmd(name: str,
            inputs: list,
            output_dtypes: list = None,
            output_shapes: list = None,
            attr_code: str = "",
            attr_header: str = "",
            outputs: list = None,
            extra_data: dict = {}):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    # NOTE: the runner below is hard-coded to BatchMatMulOpRunner, so despite
    # its generic signature this helper only serves the bmm ops in this file.
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    BatchMatMulOpRunner op;
    {input_code}
    op.add(out0, false);
    {attr_code}
    op.run();""",
                   data=extra_data)
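# Illustrative sketch (not part of the source): a call such as
#   acl_cmd("BatchMatMul", [x1, x2], output_dtypes=[x1.dtype],
#           output_shapes=[out_shape], attr_code='op.jt_name="bmm";')
# emits a jt.code op whose cuda_src instantiates BatchMatMulOpRunner,
# registers in0/in1 as inputs and out0 as the output, applies attr_code,
# then calls op.run().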
class BmmACL(jt.Function):

    def __init__(self, trans_x2=False):
        super(BmmACL, self).__init__()
        self.trans_x2 = trans_x2

    def execute(self, x1, x2):
        self.input = [x1, x2]
        result = acl_cmd(
            "BatchMatMul", [x1, x2],
            output_dtypes=[x1.dtype],
            output_shapes=[
                x1.shape[:-1] + x2.shape[-2:-1]
                if self.trans_x2 else x1.shape[:-1] + x2.shape[-1:]
            ],
            attr_code="op.jt_name=\"bmm_trans_1\";"
            if self.trans_x2 else "op.jt_name=\"bmm\";")[0]
        return result

    def grad(self, grad_output):
        x1, x2 = self.input
        # Flatten the batch dimensions of grad_x2 when the operands differ in
        # rank (e.g. a batched input against an unbatched weight).
        reshape_grad_x2 = len(x1.shape) != len(x2.shape)
        grad_x1 = acl_cmd(
            "BatchMatMul", [grad_output, x2],
            output_dtypes=[x1.dtype],
            output_shapes=[
                grad_output.shape[:-1] + x2.shape[-2:-1]
                if not self.trans_x2 else grad_output.shape[:-1] + x1.shape[-1:]
            ],
            attr_code="op.jt_name=\"bmm_trans_1\";"
            if not self.trans_x2 else "op.jt_name=\"bmm\";")[0]
        if self.trans_x2:
            if reshape_grad_x2:
                output_shape = grad_output.shape[1:-2] + grad_output.shape[
                    -1:] + x1.shape[-1:]
                grad_x2 = acl_cmd("BatchMatMul", [
                    grad_output.reshape(-1, grad_output.shape[-1]),
                    x1.reshape(-1, x1.shape[-1])
                ],
                                  output_dtypes=[x2.dtype],
                                  output_shapes=[output_shape],
                                  attr_code="op.jt_name=\"bmm_trans_0\";")[0]
            else:
                output_shape = grad_output.shape[:-2] + grad_output.shape[
                    -1:] + x1.shape[-1:]
                grad_x2 = acl_cmd("BatchMatMul", [grad_output, x1],
                                  output_dtypes=[x2.dtype],
                                  output_shapes=[output_shape],
                                  attr_code="op.jt_name=\"bmm_trans_0\";")[0]
        else:
            if reshape_grad_x2:
                output_shape = x1.shape[1:-2] + x1.shape[
                    -1:] + grad_output.shape[-1:]
                grad_x2 = acl_cmd("BatchMatMul", [
                    x1.reshape(-1, x1.shape[-1]),
                    grad_output.reshape(-1, grad_output.shape[-1])
                ],
                                  output_dtypes=[x2.dtype],
                                  output_shapes=[output_shape],
                                  attr_code="op.jt_name=\"bmm_trans_0\";")[0]
            else:
                output_shape = x1.shape[:-2] + x1.shape[
                    -1:] + grad_output.shape[-1:]
                grad_x2 = acl_cmd("BatchMatMul", [x1, grad_output],
                                  output_dtypes=[x2.dtype],
                                  output_shapes=[output_shape],
                                  attr_code="op.jt_name=\"bmm_trans_0\";")[0]
        if len(grad_x1.shape) > len(x1.shape):
            grad_x1 = grad_x1.sum(0)
        if len(grad_x2.shape) > len(x2.shape):
            grad_x2 = grad_x2.sum(0)
        return grad_x1, grad_x2
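# Hedged usage sketch (illustrative shapes only):
#   import jittor as jt
#   x1 = jt.rand(8, 4, 5)
#   x2 = jt.rand(8, 5, 6)
#   y = BmmACL()(x1, x2)                               # y.shape == [8, 4, 6]
#   y_t = BmmACL(trans_x2=True)(x1, jt.rand(8, 6, 5))  # multiplies by x2^T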
@ -0,0 +1,77 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "bmm_op_acl.h"

namespace jittor
{
    BatchMatMulOpRunner::BatchMatMulOpRunner() : BaseOpRunner("BatchMatMul")
    {
    }
    void BatchMatMulOpRunner::setupInputDesc()
    {
        auto input_num = in_.size();
        for (int input_idx = 0; input_idx < input_num; input_idx++)
        {
            std::vector<int64_t> shape;
            for (int j = 0; j < in_[input_idx]->shape.size(); j++)
            {
                shape.push_back(in_[input_idx]->shape[j]);
            }
            inputShapes.push_back(shape);
        }
        for (int idx = 0; idx < input_num; idx++)
        {
            inputTensors.push_back(nullptr);
            // For the "transposed" variants, build a tensor whose strides present
            // a transposed view of the buffer instead of materializing a copy.
            if ((jt_name == "bmm_trans_1" && idx == 1) || (jt_name == "bmm_trans_0" && idx == 0))
            {
                auto ret = CreateFakeTransAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
                CHECK_RET(ret == ACL_SUCCESS, return);
            }
            else
            {
                auto ret = CreateAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
                CHECK_RET(ret == ACL_SUCCESS, return);
            }
        }
    }
    void BatchMatMulOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        // cubeMathType = 1 (ALLOW_FP32_DOWN_PRECISION)
        ret = aclnnBatchMatMulGetWorkspaceSize(inputTensors[0], inputTensors[1], outputTensors[0], 1, &workspaceSize, &executor);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnBatchMatMulGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }
        ret = aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnBatchMatMul failed. ERROR: %d\n", name.c_str(), ret); return);
        syncRun();
    }
}
@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class BatchMatMulOpRunner : public BaseOpRunner
    {

    protected:
        void setupInputDesc() override;
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        BatchMatMulOpRunner();
    };
}
@ -0,0 +1,186 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def concat_cmd(name: str,
               inputs: list,
               output_dtypes: list = None,
               output_shapes: list = None,
               attr_code: str = "",
               attr_header: str = "",
               outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")
class ConcatACL(jt.Function):

    def __init__(self):
        super(ConcatACL, self).__init__()

    def __call__(self, *args):
        assert isinstance(args[0], (list, tuple))
        assert isinstance(args[1], int)
        if jt.flags.no_grad:
            return self.execute(*args)
        backup = args
        args = list(args)
        taped_inputs = []
        taped_outputs = []
        input_mask = [-1] * (len(args[0]) + 1)
        newargs = [list(), args[1]]
        for i, v in enumerate(args[0]):
            if isinstance(v, jt.Var):
                if v.is_stop_grad():
                    # -2 in input_mask represents it is stop_grad
                    input_mask[i] = -2
                    newargs[0].append(v)
                    continue
                v = v.tape()
                newargs[0].append(v)
                input_mask[i] = len(taped_inputs)
                taped_inputs.append(v)

        ori_res = self.execute(*newargs)
        if not isinstance(ori_res, Sequence):
            res = [ori_res]
        else:
            res = list(ori_res)
        output_mask = [-1] * len(res)
        for i, v in enumerate(res):
            if isinstance(v, jt.Var):
                v = v.tape()
                output_mask[i] = len(taped_outputs)
                res[i] = v
                taped_outputs.append(v)
        self.input_mask = input_mask
        self.output_mask = output_mask
        # tape output and input together so
        # backward treat them as one operator
        jt.tape_together(taped_inputs, taped_outputs, self._grad)
        if isinstance(ori_res, Sequence):
            return res
        else:
            return res[0]

    def execute(self, input_tensors, dim=0):
        for t in input_tensors:
            if not (-t.ndim <= dim < t.ndim):
                print(t.shape, dim)
                raise ValueError("dim out of range")

        if dim < 0:
            dim += input_tensors[0].ndim

        self.input = input_tensors
        self.dim = dim
        for i in range(len(input_tensors)):
            if input_tensors[i].dtype != input_tensors[0].dtype:
                raise ValueError("All input tensors must have the same dtype")
            if input_tensors[i].shape[:dim] != input_tensors[0].shape[:dim] \
                    or input_tensors[i].shape[dim + 1:] != input_tensors[0].shape[dim + 1:]:
                raise ValueError("All input tensors must have the same shape")
        attr_code = f"""
        op.jt_name = "concat";
        ConcatAttr *attr = new ConcatAttr();
        attr->tensorNum = {len(input_tensors)};
        attr->dim = {dim};
        op.op_attr.reset(attr);
        """
        result = concat_cmd(
            "Concat",
            input_tensors,
            output_dtypes=[input_tensors[0].dtype],
            output_shapes=[
                jt.empty(self.calculate_output_shape(input_tensors, dim)).shape
            ],
            attr_code=attr_code)[0]
        return result

    def _grad(self, *args):
        new_args = ((args[i] if i >= 0 else None) for i in self.output_mask)
        ret = self.grad(*new_args)
        new_ret = []
        for i, r in enumerate(ret):
            j = self.input_mask[i]
            if j < 0:
                # -2 in input_mask represents it is stop_grad
                assert r is None or j==-2, f"{type(self)}'s {i}-th returned grad should be None, "\
                    "because the input value is not jittor variable."
            else:
                new_ret.append(r)
        return new_ret

    def grad(self, grad_output):
        grad_inputs = self.split_grad(grad_output, self.input, self.dim)
        return grad_inputs

    def calculate_output_shape(self, input_tensors, axis):
        shape = list(input_tensors[0].shape)
        for tensor in input_tensors[1:]:
            shape[axis] += tensor.shape[axis]
        return tuple(shape)

    def split_grad(self, grad_output, input_tensors, axis):
        offset = []
        shapeVec = []
        dtypeVec = []
        for tensor in input_tensors:
            offset.append(tensor.shape[axis])
            dtypeVec.append(tensor.dtype)
            shapeVec.append(tensor.shape)

        attr_code = f"""
        op.jt_name = "splitwithsize";
        auto *attr = new SplitWithSizeAttr();
        attr->splitSize = {{ {", ".join(map(str, offset))} }};
        attr->dim = {axis};
        op.op_attr.reset(attr);
        """

        result = concat_cmd("SplitWithSize", [grad_output],
                            output_dtypes=dtypeVec,
                            output_shapes=shapeVec,
                            attr_code=attr_code)
        return result
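# Hedged usage sketch (illustrative shapes only):
#   import jittor as jt
#   a = jt.rand(2, 3)
#   b = jt.rand(4, 3)
#   out = ConcatACL()([a, b], 0)   # out.shape == [6, 3]
# The backward pass splits grad_output back into per-input slices with
# SplitWithSize, mirroring the sizes concatenated in the forward pass.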
@ -0,0 +1,89 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "concat_op_acl.h"

namespace jittor
{
    ConcatOpRunner::ConcatOpRunner() : BaseOpRunner("Concat")
    {
    }

    void ConcatOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto input_num = in_.size();
        std::vector<aclTensor *> concatTensorList = {};
        for (int i = 0; i < input_num; i++)
        {
            concatTensorList.push_back(inputTensors[i]);
        }
        auto concatTensorListInput = aclCreateTensorList(&concatTensorList[0], input_num);
        auto attr = dynamic_cast<ConcatAttr *>(op_attr.get());
        ret = aclnnCatGetWorkspaceSize(concatTensorListInput, attr->dim, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnCat(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnCat failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

    SplitWithSizeOpRunner::SplitWithSizeOpRunner() : BaseOpRunner("SplitWithSize")
    {
    }

    void SplitWithSizeOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto output_num = out_.size();
        auto attr = dynamic_cast<SplitWithSizeAttr *>(op_attr.get());
        auto splitSize = aclCreateIntArray(attr->splitSize.data(), attr->splitSize.size());
        auto tensorList = aclCreateTensorList(&outputTensors[0], output_num);
        ret = aclnnSplitWithSizeGetWorkspaceSize(inputTensors[0], splitSize, attr->dim, tensorList, &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnSplitWithSize(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSplitWithSize failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

}
@ -0,0 +1,26 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class ConcatOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        ConcatOpRunner();
    };

    class SplitWithSizeOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        SplitWithSizeOpRunner();
    };
}
@ -0,0 +1,160 @@
import os
import jittor_utils
from jittor_utils import env_or_try_find
import ctypes
import glob
import jittor as jt
import jittor.compiler as compiler
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def _ntuple(n):

    def parse(x):
        if isinstance(x, Iterable):
            return x
        return tuple([x] * n)

    return parse


_pair = _ntuple(2)


def conv_cmd(name: str,
             inputs: list,
             output_dtypes: list = None,
             output_shapes: list = None,
             attr_code: str = "",
             attr_header: str = "",
             outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")
class ConvACL(jt.Function):

    def execute(self,
                x,
                weight,
                bias=None,
                stride=1,
                padding=0,
                dilation=1,
                groups=1):
        self.input = x
        self.weight = weight
        self.bias = bias
        padding = _pair(padding)
        stride = _pair(stride)
        dilation = _pair(dilation)
        out_channels = weight.shape[0]
        if groups <= 0:
            raise ValueError("groups must be a positive integer")
        self.padding = padding
        self.stride = stride
        self.dilation = dilation
        self.groups = groups
        attr_code = f"""
        op.jt_name = "conv2d";
        ConvAttr *attr = new ConvAttr();
        attr->convStrides = {{ {stride[0]}, {stride[1]} }};
        attr->convPads = {{ {padding[0]}, {padding[1]} }};
        attr->convDilations = {{ {dilation[0]}, {dilation[1]} }};
        attr->group = {groups};
        attr->convOutPads = {{1, 1}};
        op.op_attr.reset(attr);
        """
        input_height, input_width = x.shape[-2:]
        kernel_height, kernel_width = weight.shape[-2:]

        output_height = (input_height + 2 * padding[0] - dilation[0] *
                         (kernel_height - 1) - 1) // stride[0] + 1
        output_width = (input_width + 2 * padding[1] - dilation[1] *
                        (kernel_width - 1) - 1) // stride[1] + 1

        output_shape = (x.shape[0], out_channels, output_height, output_width)

        inputs = [x, weight]
        if bias is not None:
            inputs.append(bias)
        result = conv_cmd(
            "Conv2d",
            inputs,
            output_dtypes=[x.dtype],
            output_shapes=[output_shape],
            attr_code=attr_code,
        )[0]
        return result

    def grad(self, grad_output):
        x = self.input
        weight = self.weight
        bias = self.bias
        inputs = [grad_output, x, weight]
        if bias is not None:
            inputs.append(bias)
        output_shapes = [x.shape, weight.shape]
        output_dtypes = [x.dtype, weight.dtype]
        if bias is not None:
            output_shapes.append(bias.shape)
            output_dtypes.append(bias.dtype)
        else:
            # A bias-grad slot is still allocated; the C++ side masks it off
            # via outputMask when no bias input is present.
            output_shapes.append([weight.shape[0]])
            output_dtypes.append(x.dtype)
        padding = self.padding
        stride = self.stride
        dilation = self.dilation
        groups = self.groups
        attr_code = f"""
        op.jt_name = "conv2dbackward";
        ConvAttr *attr = new ConvAttr();
        attr->convStrides = {{ {stride[0]}, {stride[1]} }};
        attr->convPads = {{ {padding[0]}, {padding[1]} }};
        attr->convDilations = {{ {dilation[0]}, {dilation[1]} }};
        attr->group = {groups};
        attr->convOutPads = {{1, 1}};
        op.op_attr.reset(attr);
        """
        results = conv_cmd("Conv2dBackward",
                           inputs,
                           output_dtypes=output_dtypes,
                           output_shapes=output_shapes,
                           attr_code=attr_code)
        if self.bias is None:
            return results[0], results[1]

        return results
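# Hedged usage sketch with the output-shape arithmetic worked out. For
# H = W = 32, kernel 3x3, stride 1, padding 1, dilation 1:
#   out = (32 + 2*1 - 1*(3 - 1) - 1) // 1 + 1 = 32, so spatial size is kept.
#   import jittor as jt
#   x = jt.rand(2, 3, 32, 32)
#   w = jt.rand(16, 3, 3, 3)
#   y = ConvACL()(x, w, None, 1, 1)   # y.shape == [2, 16, 32, 32]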
@ -0,0 +1,152 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "conv_op_acl.h"

namespace jittor
{
    Conv2dOpRunner::Conv2dOpRunner() : BaseOpRunner("Conv2d")
    {
        use_nchw = true;
    }

    void Conv2dOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        // for conv
        aclIntArray *strides = nullptr;
        aclIntArray *pads = nullptr;
        aclIntArray *outPads = nullptr;
        aclIntArray *dilations = nullptr;
        auto attr = dynamic_cast<ConvAttr *>(op_attr.get());
        strides = aclCreateIntArray(attr->convStrides.data(), 2);
        pads = aclCreateIntArray(attr->convPads.data(), 2);
        outPads = aclCreateIntArray(attr->convOutPads.data(), 2);
        dilations = aclCreateIntArray(attr->convDilations.data(), 2);

        aclTensor *bias = nullptr;

        auto input_num = in_.size();
        if (input_num == 3)
            bias = inputTensors[2];

        // transposed = false, cubeMathType = 0 (KEEP_DTYPE)
        ret = aclnnConvolutionGetWorkspaceSize(inputTensors[0], inputTensors[1], bias, strides, pads, dilations, false, outPads, attr->group, outputTensors[0], 0, &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnConvolution(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnConvolution failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyIntArray(strides);
        aclDestroyIntArray(pads);
        aclDestroyIntArray(outPads);
        aclDestroyIntArray(dilations);
        return;
    }

    Conv2dBackwardOpRunner::Conv2dBackwardOpRunner() : BaseOpRunner("Conv2dBackward")
    {
        use_nchw = true;
    }

    void Conv2dBackwardOpRunner::setupOutputDesc()
    {
        auto output_num = out_.size();

        for (int output_idx = 0; output_idx < output_num; output_idx++)
        {
            std::vector<int64_t> shape;
            for (int j = 0; j < out_[output_idx]->shape.size(); j++)
            {
                shape.push_back(out_[output_idx]->shape[j]);
            }
            outputShapes.push_back(shape);
        }

        for (int idx = 0; idx < 2; idx++)
        {
            outputTensors.push_back(nullptr);
            auto ret = CreateAclTensor(outputShapes[idx], out_[idx]->mem_ptr, out_[idx]->size, get_dtype(out_[idx]->dtype()), &outputTensors[idx], use_nchw);
            CHECK_RET(ret == ACL_SUCCESS, return);
        }
        // bias grad uses ND format
        {
            outputTensors.push_back(nullptr);
            auto ret = CreateAclTensor(outputShapes[2], out_[2]->mem_ptr, out_[2]->size, get_dtype(out_[2]->dtype()), &outputTensors[2], false);
            CHECK_RET(ret == ACL_SUCCESS, return);
        }
    }

    void Conv2dBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        // for conv
        aclIntArray *strides = nullptr;
        aclIntArray *pads = nullptr;
        aclIntArray *outPads = nullptr;
        aclIntArray *dilations = nullptr;
        auto attr = dynamic_cast<ConvAttr *>(op_attr.get());
        strides = aclCreateIntArray(attr->convStrides.data(), 2);
        pads = aclCreateIntArray(attr->convPads.data(), 2);
        outPads = aclCreateIntArray(attr->convOutPads.data(), 2);
        dilations = aclCreateIntArray(attr->convDilations.data(), 2);
        bool outputMask[3] = {true, true, true};
        auto input_num = in_.size();
        // Inputs are [grad_output, x, weight] plus an optional bias; with only
        // 3 inputs there is no bias, so its gradient is masked off.
        if (input_num == 3)
        {
            outputMask[2] = false;
        }
        aclBoolArray *outMask = aclCreateBoolArray(outputMask, 3);
        auto biasSizes = aclCreateIntArray(&outputShapes[2][0], outputShapes[2].size());
        // transposed = false, cubeMathType = 0 (KEEP_DTYPE)
        ret = aclnnConvolutionBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], biasSizes, strides, pads, dilations, false, outPads, attr->group, outMask, 0, outputTensors[0], outputTensors[1], outputTensors[2], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnConvolutionBackward(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnConvolutionBackward failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyIntArray(strides);
        aclDestroyIntArray(pads);
        aclDestroyIntArray(outPads);
        aclDestroyIntArray(dilations);
        return;
    }
}
@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class Conv2dOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        Conv2dOpRunner();
    };

    class Conv2dBackwardOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
        void setupOutputDesc() override;

    public:
        Conv2dBackwardOpRunner();
    };
}
@ -0,0 +1,101 @@
|
|||
import os
|
||||
from jittor_utils import env_or_try_find
|
||||
import jittor_utils
|
||||
import ctypes
|
||||
import glob
|
||||
import jittor.compiler as compiler
|
||||
import jittor as jt
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
from typing import Union
|
||||
from collections.abc import Sequence, Iterable
|
||||
|
||||
|
||||
def cumsum_cmd(name: str,
|
||||
inputs: list,
|
||||
output_dtypes: list = None,
|
||||
output_shapes: list = None,
|
||||
attr_code: str = "",
|
||||
attr_header: str = "",
|
||||
outputs: list = None):
|
||||
attr_header = "\nnamespace jittor{" + attr_header + "}\n"
|
||||
|
||||
cuda_header = '''
|
||||
#include "acl/aclops/aclops.h"
|
||||
'''
|
||||
outputs_ = []
|
||||
if outputs is not None:
|
||||
outputs_ = outputs
|
||||
else:
|
||||
assert output_dtypes is not None
|
||||
assert output_shapes is not None
|
||||
assert len(output_dtypes) == len(output_shapes)
|
||||
for i in range(len(output_shapes)):
|
||||
outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
|
||||
input_code = ''
|
||||
for i in range(len(inputs)):
|
||||
input_code += f"op.add(in{i}, true);\n"
|
||||
|
||||
output_code = ''
|
||||
for i in range(len(outputs_)):
|
||||
output_code += f"op.add(out{i}, false);\n"
|
||||
return jt.code(outputs=outputs_,
|
||||
inputs=inputs,
|
||||
cuda_header=attr_header + cuda_header,
|
||||
cuda_src=f"""
|
||||
|
||||
// aclop
|
||||
{name}OpRunner op;
|
||||
{input_code}
|
||||
{output_code}
|
||||
{attr_code}
|
||||
op.run();""")
|
||||
|
||||
|
||||
class CumsumACL(jt.Function):
|
||||
|
||||
def __init__(self):
|
||||
super(CumsumACL, self).__init__()
|
||||
|
||||
def execute(self, input, dim=-1):
|
||||
self.dim = dim
|
||||
attr_code = f"""
|
||||
op.jt_name = "cumsum";
|
||||
GatherAttr *attr = new GatherAttr();
|
||||
attr->dim = {dim};
|
||||
op.op_attr.reset(attr);
|
||||
"""
|
||||
result = cumsum_cmd("Cumsum", [input],
|
||||
output_dtypes=[input.dtype],
|
||||
output_shapes=[input.shape],
|
||||
attr_code=attr_code)[0]
|
||||
return result
|
||||
|
||||
def grad(self, grad_output):
|
||||
cumsum_attr_code = f"""
|
||||
op.jt_name = "cumsum";
|
||||
GatherAttr *attr = new GatherAttr();
|
||||
attr->dim = {self.dim};
|
||||
op.op_attr.reset(attr);
|
||||
"""
|
||||
flip_attr_code = f"""
|
||||
op.jt_name = "flip";
|
||||
ReduceAttr *attr = new ReduceAttr();
|
||||
attr->axes = {{{self.dim}}};
|
||||
attr->prod_dim = {{{1}}};
|
||||
op.op_attr.reset(attr);
|
||||
"""
|
||||
flipped_grad_output = cumsum_cmd("Flip", [grad_output],
|
||||
output_dtypes=[grad_output.dtype],
|
||||
output_shapes=[grad_output.shape],
|
||||
attr_code=flip_attr_code)[0]
|
||||
cumulative_grad = cumsum_cmd("Cumsum", [flipped_grad_output],
|
||||
output_dtypes=[grad_output.dtype],
|
||||
output_shapes=[grad_output.shape],
|
||||
attr_code=cumsum_attr_code)[0]
|
||||
grad_input = cumsum_cmd("Flip", [cumulative_grad],
|
||||
output_dtypes=[grad_output.dtype],
|
||||
output_shapes=[grad_output.shape],
|
||||
attr_code=flip_attr_code)[0]
|
||||
return grad_input
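# The flip -> cumsum -> flip sequence in grad() realizes the reverse cumulative
# sum: each grad_x[i] collects grad_y[j] for all j >= i along self.dim.
# Hypothetical usage sketch (assumes the ACL backend is active):
#   y = CumsumACL()(x, 0)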

@@ -0,0 +1,57 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "cumsum_op_acl.h"

namespace jittor
{
    CumsumOpRunner::CumsumOpRunner() : BaseOpRunner("Cumsum")
    {
    }

    void CumsumOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<GatherAttr *>(op_attr.get());
        ret = aclnnCumsumGetWorkspaceSize(inputTensors[0], attr->dim, get_dtype(out_[0]->dtype()), outputTensors[0], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnCumsum(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnCumsum failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

}

@@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class CumsumOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        CumsumOpRunner();
    };

}

@@ -0,0 +1,94 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def dropout_cmd(name: str,
                inputs: list,
                output_dtypes: list = None,
                output_shapes: list = None,
                attr_code: str = "",
                attr_header: str = "",
                outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class DropoutACL(jt.Function):

    def __init__(self):
        super(DropoutACL, self).__init__()

    def execute(self, x, p=0.5, is_train=False):
        self.input = x
        num_elements = x.numel()
        aligned_elements = (num_elements + 127) // 128 * 128
        mask_shape = (aligned_elements // 8, )
        attr_code = f"""
        op.jt_name = "dropout";
        DropoutAttr *attr = new DropoutAttr();
        attr->p = {p};
        attr->train = {"true" if is_train else "false"};
        attr->seed = 0;
        attr->offset = 0;
        op.op_attr.reset(attr);
        """
        result = dropout_cmd("Dropout", [x],
                             output_dtypes=[x.dtype, "uint8"],
                             output_shapes=[x.shape, mask_shape],
                             attr_code=attr_code)
        self.maskout = result[1]
        return result[0]

    def grad(self, grad_output):
        attr_code = f"""
        op.jt_name = "dropoutbackward";
        DropoutAttr *attr = new DropoutAttr();
        attr->scale = 1.0;
        op.op_attr.reset(attr);
        """
        grad_input = dropout_cmd("DropoutBackward",
                                 [grad_output, self.maskout],
                                 output_dtypes=[grad_output.dtype],
                                 output_shapes=[grad_output.shape],
                                 attr_code=attr_code)[0]
        return grad_input
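# The uint8 mask returned by the Dropout runner is bit-packed: one bit per
# element, with the element count rounded up to a multiple of 128, which is
# exactly the (aligned_elements // 8,) byte shape computed in execute().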

@@ -0,0 +1,82 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "dropout_op_acl.h"

namespace jittor
{
    DropoutOpRunner::DropoutOpRunner() : BaseOpRunner("Dropout")
    {
    }

    void DropoutOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<DropoutAttr *>(op_attr.get());
        ret = aclnnDropoutGetWorkspaceSize(inputTensors[0], attr->p, attr->train, attr->seed, attr->offset, outputTensors[0], outputTensors[1], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnDropout(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnDropout failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

    DropoutBackwardOpRunner::DropoutBackwardOpRunner() : BaseOpRunner("DropoutBackward")
    {
    }

    void DropoutBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<DropoutAttr *>(op_attr.get());
        ret = aclnnDropoutBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], attr->scale, outputTensors[0], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnDropoutBackward(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnDropoutBackward failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

}

@@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class DropoutOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        DropoutOpRunner();
    };

    class DropoutBackwardOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        DropoutBackwardOpRunner();
    };

}

@@ -0,0 +1,91 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def embedding_cmd(name: str,
                  inputs: list,
                  output_dtypes: list = None,
                  output_shapes: list = None,
                  attr_code: str = "",
                  attr_header: str = "",
                  outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class EmbeddingACL(jt.Function):

    def __init__(self):
        super(EmbeddingACL, self).__init__()

    def execute(
        self,
        indices,
        weight,
    ):
        inputs = [weight, indices]
        self.indices = indices
        self.weight_shape = weight.shape
        output_shape = list(indices.shape) + list(weight.shape[1:])
        outputs = [jt.empty(output_shape, weight.dtype)]
        attr_code = f"""
        op.jt_name = "embedding";
        """
        result = embedding_cmd("Embedding",
                               inputs=inputs,
                               outputs=outputs,
                               attr_code=attr_code)[0]
        return result

    def grad(self, grad_output):
        inputs = [grad_output, self.indices]
        outputs = [jt.empty(self.weight_shape, grad_output.dtype)]
        attr_code = f"""
        op.jt_name = "embeddingbackward";
        EmbeddingAttr *attr = new EmbeddingAttr();
        attr->numEmbeddings = {self.weight_shape[0]};
        op.op_attr.reset(attr);
        """
        grad_weight = embedding_cmd("EmbeddingBackward",
                                    inputs=inputs,
                                    outputs=outputs,
                                    attr_code=attr_code)[0]
        return None, grad_weight
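# grad() returns None for the indices input (an integer lookup has no
# gradient) and a dense weight gradient, accumulated on the C++ side via
# aclnnEmbeddingDenseBackward with numEmbeddings rows.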

@@ -0,0 +1,82 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "embedding_op_acl.h"

namespace jittor
{
    EmbeddingOpRunner::EmbeddingOpRunner() : BaseOpRunner("Embedding")
    {
    }

    void EmbeddingOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        ret = aclnnEmbeddingGetWorkspaceSize(inputTensors[0], inputTensors[1], outputTensors[0], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnEmbedding(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnEmbedding failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

    EmbeddingBackwardOpRunner::EmbeddingBackwardOpRunner() : BaseOpRunner("EmbeddingBackward")
    {
    }

    void EmbeddingBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<EmbeddingAttr *>(op_attr.get());
        auto numEmbeddings = attr->numEmbeddings;
        ret = aclnnEmbeddingDenseBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], numEmbeddings, 0, false, outputTensors[0], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnEmbeddingDenseBackward(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnEmbeddingDenseBackward failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

}

@@ -0,0 +1,25 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class EmbeddingOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
    public:
        EmbeddingOpRunner();
    };

    class EmbeddingBackwardOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
    public:
        EmbeddingBackwardOpRunner();
    };

}

@@ -0,0 +1,58 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "expand_op_acl.h"

namespace jittor
{
    ExpandOpRunner::ExpandOpRunner() : BaseOpRunner("ternary")
    {
        use_nchw = false;
    }

    void ExpandOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        aclIntArray *size = nullptr;
        size = aclCreateIntArray(&outputShapes[0][0], outputShapes[0].size());
        ret = aclnnExpandGetWorkspaceSize(inputTensors[0], size, outputTensors[0], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnExpand(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnExpand failed. ERROR: %d\n", name.c_str(), ret); return);

        aclDestroyIntArray(size);

        return;
    }
}
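// The size array handed to aclnnExpand is taken from outputShapes[0], i.e. the
// broadcast target shape recorded earlier when the output descriptor was built.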

@@ -0,0 +1,14 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    struct ExpandOpRunner : public BaseOpRunner
    {
        ExpandOpRunner();

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
    };
}

@@ -0,0 +1,209 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def flashattention_cmd(name: str,
                       inputs: list,
                       output_dtypes: list = None,
                       output_shapes: list = None,
                       attr_code: str = "",
                       attr_header: str = "",
                       outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class FlashAttentionACL(jt.Function):

    def __init__(self,
                 headnum,
                 layout="BNSD",
                 prefix=None,
                 qstart=None,
                 kvstart=None,
                 scale=1.0,
                 prob=1.0,
                 pretokens=2147483647,
                 nexttokens=2147483647,
                 innerprecise=0,
                 sparsemode=0,
                 psetype=1):
        self.headnum = headnum
        self.layout = layout
        self.scale = scale
        self.prob = prob
        self.pretokens = pretokens
        self.nexttokens = nexttokens
        self.innerprecise = innerprecise
        self.sparsemode = sparsemode
        self.psetype = psetype
        self.prefix = prefix
        self.qstart = qstart
        self.kvstart = kvstart

    def execute(
        self,
        q,
        k,
        v,
        realshift=None,
        dropMask=None,
        paddingMask=None,
        attenMask=None,
    ):
        if self.layout == 'BSH':
            B, SQ, H = q.shape
            SKV = k.shape[1]
            N = self.headnum
            D = H // N
        elif self.layout == 'SBH':
            SQ, B, H = q.shape
            SKV = k.shape[0]
            N = self.headnum
            D = H // N
        elif self.layout == 'BSND':
            B, SQ, N, D = q.shape
            SKV = k.shape[1]
        elif self.layout == 'BNSD':
            B, N, SQ, D = q.shape
            SKV = k.shape[2]
        else:
            raise ValueError(f"got invalid input layout {self.layout}")

        output_shape = (B, N, SQ, 8)

        self.q = q
        self.k = k
        self.v = v

        self.prefix = self.prefix if self.prefix else [0 for _ in range(B)]
        self.qstart = self.qstart if self.qstart else [0 for _ in range(B)]
        self.kvstart = self.kvstart if self.kvstart else [0 for _ in range(B)]

        self.hasRealshift = realshift is not None
        self.hasDropmask = dropMask is not None
        self.hasPaddingmask = paddingMask is not None
        self.hasAttenmask = attenMask is not None

        # TBD: the unused optional masks are currently passed as nullptr on the C++ side
        self.realshift = realshift if realshift else jt.zeros(B, N, SQ, SKV)
        self.dropMask = dropMask if dropMask else jt.ones(B, N, SQ, SKV)
        self.paddingMask = paddingMask if paddingMask else jt.zeros(
            B, N, SQ, SKV)
        self.attenMask = attenMask if attenMask else jt.zeros(SQ, SKV)

        attr_code = f"""
        op.jt_name = "flashattention";
        FlashAttentionAttr *attr = new FlashAttentionAttr();
        attr->scale = {self.scale};
        attr->keepProb = {self.prob};
        attr->preToken = {self.pretokens};
        attr->nextToken = {self.nexttokens};
        attr->headNum = {self.headnum};
        attr->inputLayout = "{self.layout}";
        attr->innerPrecise = {self.innerprecise};
        attr->sparseMode = {self.sparsemode};
        attr->psetype = {self.psetype};
        attr->prefix = {{ {", ".join(map(str, self.prefix))} }};
        attr->qStartIdx = {{ {", ".join(map(str, self.qstart))} }};
        attr->kvStartIdx = {{ {", ".join(map(str, self.kvstart))} }};
        attr->hasRealshift = {"true" if self.hasRealshift else "false"};
        attr->hasDropmask = {"true" if self.hasDropmask else "false"};
        attr->hasPaddingmask = {"true" if self.hasPaddingmask else "false"};
        attr->hasAttentmask = {"true" if self.hasAttenmask else "false"};
        op.op_attr.reset(attr);
        """

        inputs = [
            q, k, v, self.realshift, self.dropMask, self.paddingMask,
            self.attenMask
        ]

        result = flashattention_cmd(
            "FlashAttention",
            inputs,
            output_dtypes=["float", "float", q.dtype],
            output_shapes=[output_shape, output_shape, q.shape],
            attr_code=attr_code)

        self.maxout = result[0]
        self.sumout = result[1]
        self.attenout = result[2]

        return self.attenout

    def grad(self, dy):
        attr_code = f"""
        op.jt_name = "flashattentionbackward";
        FlashAttentionAttr *attr = new FlashAttentionAttr();
        attr->scale = {self.scale};
        attr->keepProb = {self.prob};
        attr->preToken = {self.pretokens};
        attr->nextToken = {self.nexttokens};
        attr->headNum = {self.headnum};
        attr->inputLayout = "{self.layout}";
        attr->innerPrecise = {self.innerprecise};
        attr->sparseMode = {self.sparsemode};
        attr->psetype = {self.psetype};
        attr->prefix = {{ {", ".join(map(str, self.prefix))} }};
        attr->qStartIdx = {{ {", ".join(map(str, self.qstart))} }};
        attr->kvStartIdx = {{ {", ".join(map(str, self.kvstart))} }};
        attr->hasRealshift = {"true" if self.hasRealshift else "false"};
        attr->hasDropmask = {"true" if self.hasDropmask else "false"};
        attr->hasPaddingmask = {"true" if self.hasPaddingmask else "false"};
        attr->hasAttentmask = {"true" if self.hasAttenmask else "false"};
        op.op_attr.reset(attr);
        """
        inputs = [
            self.q, self.k, self.v, dy, self.realshift, self.dropMask,
            self.paddingMask, self.attenMask, self.maxout, self.sumout,
            self.attenout
        ]

        result = flashattention_cmd(
            "FlashAttentionBackward",
            inputs,
            output_dtypes=[self.q.dtype, self.k.dtype, self.v.dtype],
            output_shapes=[self.q.shape, self.k.shape, self.v.shape],
            attr_code=attr_code)
        return result
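# The forward runner produces three outputs: softmax max and softmax sum
# statistics (each (B, N, SQ, 8)) plus the attention result. They are cached
# on self so grad() can feed them back into the FlashAttentionBackward runner,
# which returns the (dq, dk, dv) triple.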

@@ -0,0 +1,88 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "flashattention_op_acl.h"

namespace jittor
{
    FlashAttentionOpRunner::FlashAttentionOpRunner() : BaseOpRunner("FlashAttention")
    {
    }

    void FlashAttentionOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<FlashAttentionAttr *>(op_attr.get());
        auto prefix = aclCreateIntArray(attr->prefix.data(), attr->prefix.size());
        auto qstart = aclCreateIntArray(attr->qStartIdx.data(), attr->qStartIdx.size());
        auto kvstart = aclCreateIntArray(attr->kvStartIdx.data(), attr->kvStartIdx.size());
        char *layout = const_cast<char *>(attr->inputLayout.data());
        ret = aclnnFlashAttentionScoreV2GetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], attr->hasRealshift ? inputTensors[3] : nullptr, attr->hasDropmask ? inputTensors[4] : nullptr, nullptr, attr->hasAttentmask ? inputTensors[6] : nullptr, prefix, qstart, kvstart, attr->scale, attr->keepProb, attr->preToken, attr->nextToken, attr->headNum, layout, attr->innerPrecise, attr->sparseMode, attr->psetype, outputTensors[0], outputTensors[1], nullptr, outputTensors[2], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnFlashAttentionScoreV2(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnFlashAttentionScoreV2 failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

    FlashAttentionBackwardOpRunner::FlashAttentionBackwardOpRunner() : BaseOpRunner("FlashAttentionBackward")
    {
    }

    void FlashAttentionBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<FlashAttentionAttr *>(op_attr.get());
        auto prefix = aclCreateIntArray(attr->prefix.data(), attr->prefix.size());
        auto qstart = aclCreateIntArray(attr->qStartIdx.data(), attr->qStartIdx.size());
        auto kvstart = aclCreateIntArray(attr->kvStartIdx.data(), attr->kvStartIdx.size());
        char *layout = const_cast<char *>(attr->inputLayout.data());
        ret = aclnnFlashAttentionScoreGradV2GetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], inputTensors[3], attr->hasRealshift ? inputTensors[4] : nullptr, attr->hasDropmask ? inputTensors[5] : nullptr, nullptr, attr->hasAttentmask ? inputTensors[7] : nullptr, inputTensors[8], inputTensors[9], nullptr, inputTensors[10], prefix, qstart, kvstart, attr->scale, attr->keepProb, attr->preToken, attr->nextToken, attr->headNum, layout, attr->innerPrecise, attr->sparseMode, attr->psetype, outputTensors[0], outputTensors[1], outputTensors[2], nullptr, &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnFlashAttentionScoreGradV2(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnFlashAttentionScoreGradV2 failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

}

@@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class FlashAttentionOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        FlashAttentionOpRunner();
    };

    class FlashAttentionBackwardOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        FlashAttentionBackwardOpRunner();
    };

}

@@ -0,0 +1,85 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def flip_cmd(name: str,
             inputs: list,
             output_dtypes: list = None,
             output_shapes: list = None,
             attr_code: str = "",
             attr_header: str = "",
             outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class FlipACL(jt.Function):

    def __init__(self):
        super(FlipACL, self).__init__()

    def execute(self, input, dim):
        if type(dim) is tuple:
            dim = list(dim)
        if type(dim) is not list:
            dim = [dim]
        attr_code = f"""
        op.jt_name = "flip";
        ReduceAttr *attr = new ReduceAttr();
        attr->axes = {{{', '.join(map(str, (list(dim))))}}};
        attr->prod_dim = {len(dim)};
        op.op_attr.reset(attr);
        """
        self.attr_code = attr_code
        result = flip_cmd("Flip", [input],
                          output_dtypes=[input.dtype],
                          output_shapes=[input.shape],
                          attr_code=self.attr_code)[0]
        return result

    def grad(self, grad_output):
        grad_input = flip_cmd("Flip", [grad_output],
                              output_dtypes=[grad_output.dtype],
                              output_shapes=[grad_output.shape],
                              attr_code=self.attr_code)[0]
        return grad_input
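# Flip is an involution, so the backward pass simply re-applies the flip with
# the same axes (the attr_code saved during execute()) to the gradient.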

@@ -0,0 +1,58 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "flip_op_acl.h"

namespace jittor
{
    FlipOpRunner::FlipOpRunner() : BaseOpRunner("Flip")
    {
    }

    void FlipOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<ReduceAttr *>(op_attr.get());
        auto dim = aclCreateIntArray(attr->axes.data(), attr->axes.size());
        ret = aclnnFlipGetWorkspaceSize(inputTensors[0], dim, outputTensors[0], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnFlip(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnFlip failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

}

@@ -0,0 +1,16 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class FlipOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        FlipOpRunner();
    };
}

@@ -0,0 +1,70 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def floor_cmd(name: str,
              inputs: list,
              output_dtypes: list = None,
              output_shapes: list = None,
              attr_code: str = "",
              attr_header: str = "",
              outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class FloorIntACL(jt.Function):

    def __init__(self):
        super(FloorIntACL, self).__init__()

    def execute(self, input):
        self.shape = input.shape
        result = floor_cmd("Floor", [input],
                           output_dtypes=[input.dtype],
                           output_shapes=[input.shape],
                           attr_code="op.jt_name=\"floor\";")[0]
        return result

    def grad(self, grad_output):
        return jt.zeros(self.shape, dtype=grad_output.dtype)
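# floor() is piecewise constant, so its derivative is zero almost everywhere;
# the backward pass accordingly returns an all-zero tensor of the input shape.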

@@ -0,0 +1,56 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "floor_op_acl.h"

namespace jittor
{
    FloorOpRunner::FloorOpRunner() : BaseOpRunner("Floor")
    {
    }

    void FloorOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        ret = aclnnFloorGetWorkspaceSize(inputTensors[0], outputTensors[0], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnFloor(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnFloor failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

}

@@ -0,0 +1,16 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class FloorOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        FloorOpRunner();
    };
}

@@ -0,0 +1,126 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def gather_scatter_cmd(name: str,
                       inputs: list,
                       output_dtypes: list = None,
                       output_shapes: list = None,
                       attr_code: str = "",
                       attr_header: str = "",
                       outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class GatherACL(jt.Function):

    def __init__(self):
        super(GatherACL, self).__init__()

    def execute(self, input, dim, index):
        self.dim = dim
        self.index = index
        self.input_shape = input.shape
        attr_code = f"""
        op.jt_name = "gather";
        GatherAttr *attr = new GatherAttr();
        attr->dim = {dim};
        op.op_attr.reset(attr);
        """
        result = gather_scatter_cmd("Gather", [input, index],
                                    output_dtypes=[input.dtype],
                                    output_shapes=[index.shape],
                                    attr_code=attr_code)[0]
        return result

    def grad(self, grad_output):
        tmp = jt.zeros(self.input_shape, dtype=grad_output.dtype)
        attr_code = f"""
        op.jt_name = "scatter";
        ScatterAttr *attr = new ScatterAttr();
        attr->axis = {self.dim};
        attr->reduction = {1};
        op.op_attr.reset(attr);
        """
        grad_input = gather_scatter_cmd("Scatter",
                                        [tmp, self.index, grad_output],
                                        output_dtypes=[grad_output.dtype],
                                        output_shapes=[tmp.shape],
                                        attr_code=attr_code)[0]
        return grad_input


class ScatterACL(jt.Function):

    def __init__(self):
        super(ScatterACL, self).__init__()

    def execute(self, input, dim, index, src, reduce='void'):
        self.dim = dim
        self.index = index
        self.reduce = reduce
        attr_code = f"""
        op.jt_name = "scatter";
        ScatterAttr *attr = new ScatterAttr();
        attr->axis = {dim};
        attr->reduction = {1 if reduce == 'add' else 2 if reduce == 'mul' else 0};
        op.op_attr.reset(attr);
        """
        result = gather_scatter_cmd("Scatter", [input, self.index, src],
                                    output_dtypes=[input.dtype],
                                    output_shapes=[input.shape],
                                    attr_code=attr_code)[0]
        return result

    def grad(self, grad_output):
        attr_code = f"""
        op.jt_name = "gather";
        GatherAttr *attr = new GatherAttr();
        attr->dim = {self.dim};
        op.op_attr.reset(attr);
        """
        grad_input = gather_scatter_cmd("Gather", [grad_output, self.index],
                                        output_dtypes=[grad_output.dtype],
                                        output_shapes=[self.index.shape],
                                        attr_code=attr_code)[0]
        return grad_output, None, None, grad_input
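# Gather and scatter are adjoint ops: GatherACL.grad scatter-adds the output
# gradient back along the same dim (reduction code 1), while ScatterACL maps
# its reduce argument as 'add' -> 1, 'mul' -> 2, anything else -> 0.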

@@ -0,0 +1,80 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "gather_scatter_op_acl.h"

namespace jittor
{
    GatherOpRunner::GatherOpRunner() : BaseOpRunner("Gather")
    {
    }

    void GatherOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<GatherAttr *>(op_attr.get());
        ret = aclnnGatherGetWorkspaceSize(inputTensors[0], attr->dim, inputTensors[1], outputTensors[0], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnGather(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnGather failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

    ScatterOpRunner::ScatterOpRunner() : BaseOpRunner("Scatter")
    {
    }

    void ScatterOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<ScatterAttr *>(op_attr.get());
        ret = aclnnScatterGetWorkspaceSize(inputTensors[0], attr->axis, inputTensors[1], inputTensors[2], attr->reduction, outputTensors[0], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnScatter(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnScatter failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

}

@@ -0,0 +1,26 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class GatherOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        GatherOpRunner();
    };

    class ScatterOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        ScatterOpRunner();
    };
}

@@ -0,0 +1,419 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def getitem_cmd(name: str,
                inputs: list,
                output_dtypes: list = None,
                output_shapes: list = None,
                attr_code: str = "",
                attr_header: str = "",
                outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


def getitem_forward(name: str,
                    inputs: list,
                    output_dtypes: list = None,
                    output_shapes: list = None,
                    attr_code: str = "",
                    attr_header: str = "",
                    outputs: list = None,
                    extra_data: dict = {}):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    op.add(out0, false);
    {attr_code}
    op.run();""",
                   data=extra_data)


def caculate_shape(tensors):
    if isinstance(tensors, jt.Var):
        return tensors.shape
    elif isinstance(tensors, (int, float)):
        return []
    elif isinstance(tensors, (list, tuple)):
        sub_shape = caculate_shape(tensors[0])
        return [len(tensors)] + sub_shape
    else:
        assert False, f"not implemented for {type(tensors)}"


def can_broadcast_and_shape(shape1, shape2):
    """
    Check whether two tensors can be broadcast together, and return the
    broadcast shape.

    Args:
    - shape1: shape of the first tensor (tuple or list)
    - shape2: shape of the second tensor (tuple or list)

    Returns:
    - can_broadcast: bool, whether the shapes are broadcastable
    - broadcast_shape: the broadcast shape if they are; otherwise None
    """
    # Convert the shapes to tuples in case the inputs are lists
    shape1 = tuple(shape1)
    shape2 = tuple(shape2)

    # Bring both shapes to the same length by padding with leading 1s
    len1, len2 = len(shape1), len(shape2)
    if len1 < len2:
        shape1 = (1, ) * (len2 - len1) + shape1
    elif len2 < len1:
        shape2 = (1, ) * (len1 - len2) + shape2

    broadcast_shape = []

    # Check compatibility dimension by dimension
    for dim1, dim2 in zip(shape1, shape2):
        if dim1 == dim2:
            broadcast_shape.append(dim1)
        elif dim1 == 1:
            broadcast_shape.append(dim2)
        elif dim2 == 1:
            broadcast_shape.append(dim1)
        else:
            # Incompatible along this dimension: cannot broadcast
            return False, None

    return True, tuple(broadcast_shape)


class GetItemACL(jt.Function):

    def __init__(self):
        self.type_ = 'notype'

    def stride(self, x, dim):
        stride = 1
        for i in range(dim + 1, len(x.shape)):
            stride *= x.shape[i]
        return stride

    def execute(self, x, slices, return_x=None):
        if isinstance(slices, jt.Var) and slices.dtype == 'bool':
            # TODO: optimize
            assert x.shape == slices.shape, "shape not match"
            output_len = slices.sum().item()
            x_len = x.numel()
            output = jt.empty((x_len), dtype=x.dtype)
            outputs = [output]
            inputs = [x, slices]
            self.mask = slices
            self.type_ = 'mask'
            attr_code = f"""
            op.jt_name = "maskedselect";
            """
            result = getitem_cmd("MaskedSelect",
                                 inputs=inputs,
                                 outputs=outputs,
                                 attr_code=attr_code)[0]
            result = result[:output_len]
            result.sync()
            return result
        self.x_shape = x.shape
        if not isinstance(slices, tuple):
            slices = (slices, )
        slices = list(slices)
        for i, s in enumerate(slices):
            if isinstance(s, int) and s < 0:
                slices[i] = s + x.shape[i]
        slices = tuple(slices)
        slices_list = list(slices)
        # check whether slices contains a slice (or Ellipsis)
        contains_slice = False
        for s in slices:
            if not isinstance(s, jt.Var) and (isinstance(s, slice)
                                              or s == Ellipsis):
                contains_slice = True
                break
        if not contains_slice:
            indices = []
            output_shape = []
            slices_len = len(slices)
            boardcast_shape = caculate_shape(slices_list[0])
            for ii in range(1, len(slices)):
                dd, boardcast_shape = can_broadcast_and_shape(
                    boardcast_shape, caculate_shape(slices_list[ii]))
                assert dd is True, "can not broadcast"
            output_shape = boardcast_shape
            output_shape += x.shape[slices_len:]
            if output_shape == []:
                output_shape = [1]
            for ii in slices:
                indices.append(jt.Var(ii).int32())
            if isinstance(slices[0],
                          jt.Var) or isinstance(slices[0], int) or isinstance(
                              slices[0], list) or isinstance(slices[0], tuple):
                self.indices = indices
                inputs = [x] + indices
                attr_code = f"""
                op.jt_name = "index";
                """
                self.type_ = 'index'
                result = getitem_cmd("Index",
                                     inputs=inputs,
                                     output_dtypes=[x.dtype],
                                     output_shapes=[output_shape],
                                     attr_code=attr_code)[0]
                result.sync()
                return result
        assert contains_slice, "slice type error"
        x_dim = len(x.shape)
        slices = list(slices)
        for s in slices:
            if not isinstance(s, jt.Var) and s == Ellipsis:
                slices = slices[:slices.index(s)] + [
                    slice(None, None, None)
                ] * (x_dim - len(slices) + 1) + slices[slices.index(s) + 1:]
                break
        slices = tuple(slices)

        if len(slices) < x_dim:
            slices += (slice(None, None, None), ) * (x_dim - len(slices))
        inputs = [x]
        sizes = []
        begins = []
        ends = []
        steps = []
        dims = []
        squeeze_dims = []

        extra_data = {}
        if len(slices):
            extra_data["a"] = len(slices)
            for dim, s in enumerate(slices):
                if isinstance(s, int):
                    s = slice(s, s + 1, 1)
                    squeeze_dims.append(dim)
                if isinstance(s, jt.Var):
                    assert False, "jt.Var not supported"
                start, stop, step = s.indices(x.size(dim))
                size = (stop - start - 1) // step + 1
                sizes.append(size)
                extra_data[str(dim * 3)] = start
                extra_data[str(dim * 3 + 1)] = stop
                extra_data[str(dim * 3 + 2)] = step

                steps.append(step)
                begins.append(start)
                ends.append(stop)
                dims.append(dim)
        else:
            extra_data["a"] = -1
            sizes = [1]
            steps = [1]
        self.type_ = 'slicev2'
        # for backward
        self.begins = begins
        self.ends = ends
        self.steps = steps
        self.dims = dims

        self.slices = slices
        attr_code = """
        op.jt_name = "slicev2";
        StrideAttr *attr = new StrideAttr();

        int slice_dim = data["a"];

        if(slice_dim == -1) {
            attr->begins = {};
            attr->ends = {};
            attr->steps = {1};
            attr->axes = {};
        } else {
            vector<long int> begins;
            vector<long int> ends;
            vector<long int> steps;
            vector<long int> dims;
            for(int dim = 0; dim < slice_dim; dim++) {
                dims.push_back(dim);
                begins.push_back(data[std::to_string(dim*3)]);
                ends.push_back(data[std::to_string(dim*3+1)]);
                steps.push_back(data[std::to_string(dim*3+2)]);
            }
            attr->begins = begins;
            attr->ends = ends;
            attr->steps = steps;
            attr->axes = dims;
        }
        op.op_attr.reset(attr);
        """
        result = getitem_forward("SliceV2",
                                 inputs,
                                 output_dtypes=[x.dtype],
                                 output_shapes=[jt.empty(sizes).shape],
                                 attr_code=attr_code,
                                 extra_data=extra_data)[0]
        self.squeeze_dims = squeeze_dims
        for dim in squeeze_dims[::-1]:
            result = jt.squeeze(result, dim)
        result.sync()
        return result

    def grad(self, grad_output):
        if self.type_ == 'index':
            indices = self.indices
            inputs = [grad_output] + indices
            attr_code = f"""
            op.jt_name = "indexputimplaccumulate";
            """
            outputs = [jt.zeros(self.x_shape, dtype=grad_output.dtype)]
            result = getitem_cmd("IndexPutImplAccumulate",
                                 inputs=inputs,
                                 outputs=outputs,
                                 attr_code=attr_code)[0]
            result.sync()
            return result, None
        elif self.type_ == 'slicev2':
            begins = self.begins
            ends = self.ends
            steps = self.steps
            dims = self.dims
            slices = self.slices
            # The forward pass may have squeezed dimensions; restore them here
            for dim in self.squeeze_dims:
                grad_output = jt.unsqueeze(grad_output, dim)
            # Work around an Ascend constraint: the step of the last dimension
            # must be 1, so append a full slice (and a trailing axis) if needed
            expand_dim = False
            if isinstance(slices[-1], slice):
                if slices[-1].step is not None and slices[-1].step != 1:
                    slices = slices + (slice(None, None, None), )
                    expand_dim = True
            elif isinstance(slices[-1], int):
                # Note: the last dimension is an integer index
                slices = list(slices)
                slices[-1] = slice(slices[-1], slices[-1] + 1, 1)
                slices = tuple(slices)
                slices = slices + (slice(None, None, None), )
                expand_dim = True
            else:
                assert False, "not supported"
            if expand_dim:
                grad_output = grad_output.unsqueeze(-1)
                self.x_shape = self.x_shape + (1, )
            sizes = []
            begins = []
            ends = []
            steps = []
            dims = []
            for dim, s in enumerate(slices):
                if isinstance(s, int):
                    s = slice(s, s + 1, 1)
                if isinstance(s, jt.Var):
                    assert False, "jt.Var not supported"
                start, stop, step = s.indices(self.x_shape[dim])
                size = (stop - start - 1) // step + 1
                sizes.append(size)
                steps.append(step)
                begins.append(start)
                ends.append(stop)
                dims.append(dim)
            if not sizes:
                sizes = [1]
                steps = [1]
            attr_code = f"""
            op.jt_name = "stridedsliceassignv2";
            StrideAttr *attr = new StrideAttr();
            attr->begins = {{ {", ".join(map(str, begins))} }};
            attr->ends = {{ {", ".join(map(str, ends))} }};
            attr->steps = {{ {", ".join(map(str, steps))} }};
            attr->axes = {{ {", ".join(map(str, dims))} }};
            op.op_attr.reset(attr);
            """
            inputs = [grad_output]
            outputs = [jt.zeros(self.x_shape, dtype=grad_output.dtype)]
            result = getitem_cmd("StridedSliceAssignV2",
                                 inputs=inputs,
                                 outputs=outputs,
                                 attr_code=attr_code)[0]
            result.sync()
            if expand_dim:
                result = result.squeeze(-1)
            return result, None
        elif self.type_ == 'mask':
            return self.mask.float()
        else:
            assert False, f"grad not implemented for {self.type_}"
|
|
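For orientation, a minimal usage sketch of the Function whose execute/grad bodies appear above. Its class name sits outside this hunk; GetItemACL below is a hypothetical stand-in, and the slices shown are illustrative only.

import jittor as jt

x = jt.rand(4, 5)
getitem = GetItemACL()                  # hypothetical name for the class above
y = getitem(x, (slice(0, 3, 1), 2))     # takes the slicev2 path
g = jt.grad(y.sum(), x)                 # backward scatters via StridedSliceAssignV2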
@ -0,0 +1,165 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "getitem_op_acl.h"

namespace jittor
{
    MaskedSelectOpRunner::MaskedSelectOpRunner() : BaseOpRunner("MaskedSelect")
    {
    }

    void MaskedSelectOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        ret = aclnnMaskedSelectGetWorkspaceSize(inputTensors[0], inputTensors[1], outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnMaskedSelect(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMaskedSelect failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

    IndexOpRunner::IndexOpRunner() : BaseOpRunner("Index")
    {
    }

    void IndexOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto input_num = in_.size();
        auto indexTensorList = aclCreateTensorList(&inputTensors[1], input_num - 1);
        ret = aclnnIndexGetWorkspaceSize(inputTensors[0], indexTensorList, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnIndex(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnIndex failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

    SliceV2OpRunner::SliceV2OpRunner() : BaseOpRunner("SliceV2")
    {
    }

    void SliceV2OpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<StrideAttr *>(op_attr.get());
        auto begins = aclCreateIntArray(attr->begins.data(), attr->begins.size());
        auto ends = aclCreateIntArray(attr->ends.data(), attr->ends.size());
        auto steps = aclCreateIntArray(attr->steps.data(), attr->steps.size());
        auto axes = aclCreateIntArray(attr->axes.data(), attr->axes.size());
        ret = aclnnSliceV2GetWorkspaceSize(inputTensors[0], begins, ends, axes, steps, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnSliceV2(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSliceV2 failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

    IndexPutImplAccumulateOpRunner::IndexPutImplAccumulateOpRunner() : BaseOpRunner("IndexPutImplAccumulate")
    {
    }

    void IndexPutImplAccumulateOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto input_num = in_.size();
        std::vector<aclTensor *> indexTensorList = {};
        for (int i = 1; i < input_num; i++)
        {
            indexTensorList.push_back(inputTensors[i]);
        }
        auto indexTensorListInput = aclCreateTensorList(&indexTensorList[0], input_num - 1);
        ret = aclnnIndexPutImplGetWorkspaceSize(outputTensors[0], indexTensorListInput, inputTensors[0], true, true, &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnIndexPutImpl(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnIndexPutImpl failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

    StridedSliceAssignV2OpRunner::StridedSliceAssignV2OpRunner() : BaseOpRunner("StridedSliceAssignV2")
    {
    }

    void StridedSliceAssignV2OpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<StrideAttr *>(op_attr.get());
        auto begins = aclCreateIntArray(attr->begins.data(), attr->begins.size());
        auto ends = aclCreateIntArray(attr->ends.data(), attr->ends.size());
        auto steps = aclCreateIntArray(attr->steps.data(), attr->steps.size());
        auto axes = aclCreateIntArray(attr->axes.data(), attr->axes.size());
        ret = aclnnStridedSliceAssignV2GetWorkspaceSize(outputTensors[0], inputTensors[0], begins, ends, steps, axes, &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnStridedSliceAssignV2(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnStridedSliceAssignV2 failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }
}
@ -0,0 +1,57 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class MaskedSelectOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        MaskedSelectOpRunner();
    };

    class IndexOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        IndexOpRunner();
    };

    class SliceV2OpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        SliceV2OpRunner();
    };

    class IndexPutImplAccumulateOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        IndexPutImplAccumulateOpRunner();
    };

    class StridedSliceAssignV2OpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        StridedSliceAssignV2OpRunner();
    };

}
@ -0,0 +1,107 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def range_forward(name: str,
                  inputs: list,
                  output_dtypes: list = None,
                  output_shapes: list = None,
                  attr_code: str = "",
                  attr_header: str = "",
                  outputs: list = None,
                  extra_data: dict = {}):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""",
                   data=extra_data)


class IndexACL(jt.Function):

    def __init__(self):
        super(IndexACL, self).__init__()

    def execute(self, inshape: list, dim=None, dtype="int32"):
        # Build one coordinate grid of shape `inshape` per requested dimension.
        dim_input = dim
        if dim is None:
            dim = [i for i in range(len(inshape))]
        elif type(dim) == int:
            dim = [dim]
        results = []
        extra_data = {}
        extra_data["dim_count"] = len(dim)

        for i, d in enumerate(dim):
            max_len = inshape[d]

            extra_data[f"dim_{i}_start"] = 0
            extra_data[f"dim_{i}_end"] = max_len
            extra_data[f"dim_{i}_step"] = 1

            tmp = jt.zeros(max_len, dtype=dtype)
            range_attr_code = f"""
            op.jt_name = "range";
            RangeAttr *attr = new RangeAttr();
            attr->start = data["dim_{i}_start"];
            attr->end = data["dim_{i}_end"];
            attr->step = data["dim_{i}_step"];
            op.op_attr.reset(attr);
            """
            result = range_forward("Range", [],
                                   output_dtypes=[tmp.dtype],
                                   output_shapes=[tmp.shape],
                                   attr_code=range_attr_code,
                                   extra_data=extra_data)[0]
            broadcast_dims = list(range(len(inshape)))
            broadcast_dims.remove(d)
            result = jt.broadcast(result, shape=inshape, dims=broadcast_dims)
            results.append(result)

        if len(results) != 1 or dim_input is None:
            return tuple(results)
        else:
            return results[0]

    def grad(self, grad_output):
        return grad_output
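A small usage sketch (assuming IndexACL is exported from this module as-is): it mirrors jt.index, materializing one coordinate grid per dimension via the Range runner plus a broadcast.

import jittor as jt

rows = IndexACL()([2, 3], dim=0)   # [[0, 0, 0], [1, 1, 1]]
grids = IndexACL()([2, 3])         # dim=None returns a tuple, one grid per dim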
@ -0,0 +1,72 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "index_op_acl.h"

namespace jittor
{
    RangeOpRunner::RangeOpRunner() : BaseOpRunner("Range")
    {
    }

    void RangeOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        aclScalar *start = nullptr;
        aclScalar *end = nullptr;
        aclScalar *step = nullptr;

        auto attr = dynamic_cast<RangeAttr *>(op_attr.get());
        int64_t startValue = attr->start;
        int64_t endValue = attr->end;
        int64_t stepValue = attr->step;
        start = aclCreateScalar(&startValue, aclDataType::ACL_INT64);
        end = aclCreateScalar(&endValue, aclDataType::ACL_INT64);
        step = aclCreateScalar(&stepValue, aclDataType::ACL_INT64);

        ret = aclnnRangeGetWorkspaceSize(start, end, step, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnRange(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnRange failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyScalar(start);
        aclDestroyScalar(end);
        aclDestroyScalar(step);
        return;
    }

}
@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class RangeOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        RangeOpRunner();
    };

}
@ -0,0 +1,130 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def matmul_forward(name: str,
                   inputs: list,
                   output_dtypes: list = None,
                   output_shapes: list = None,
                   attr_code: str = "",
                   attr_header: str = "",
                   outputs: list = None,
                   extra_data: dict = {}):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    MatMulOpRunner op;
    {input_code}
    op.add(out0, false);
    {attr_code}
    op.run();""",
                   data=extra_data)


class MatmulACL(jt.Function):

    def __init__(self, trans_x2=False):
        super(MatmulACL, self).__init__()
        self.trans_x2 = trans_x2

    def execute(self, x1, x2):
        self.input = [x1, x2]
        if self.trans_x2:
            output_shape = x1.shape[:-1] + x2.shape[-2:-1]
        else:
            output_shape = x1.shape[:-1] + x2.shape[-1:]
        result = matmul_forward(
            "MatMul", [x1, x2],
            output_dtypes=[x1.dtype],
            output_shapes=[output_shape],
            attr_code="op.jt_name=\"matmul_trans_1\";"
            if self.trans_x2 else "op.jt_name=\"matmul\";")[0]
        return result

    def grad(self, grad_output):
        x1, x2 = self.input
        # When x2 has fewer dims than x1 (no batch dims), its gradient must be
        # accumulated over the batch, so the batch dims are folded into the
        # row dimension first.
        if len(x1.shape) != len(x2.shape):
            reshape_grad_x2 = True
        else:
            reshape_grad_x2 = False
        grad_x1 = matmul_forward(
            "MatMul", [grad_output, x2],
            output_dtypes=[x1.dtype],
            output_shapes=[
                grad_output.shape[:-1] + x2.shape[-2:-1] if not self.trans_x2
                else grad_output.shape[:-1] + x2.shape[-1:]
            ],
            attr_code="op.jt_name=\"matmul_trans_1\";"
            if not self.trans_x2 else "op.jt_name=\"matmul\";")[0]

        if self.trans_x2:
            if reshape_grad_x2:
                output_shape = grad_output.shape[1:-2] + grad_output.shape[
                    -1:] + x1.shape[-1:]
                grad_x2 = matmul_forward(
                    "MatMul", [
                        grad_output.reshape(-1, grad_output.shape[-1]),
                        x1.reshape(-1, x1.shape[-1])
                    ],
                    output_dtypes=[x2.dtype],
                    output_shapes=[output_shape],
                    attr_code="op.jt_name=\"matmul_trans_0\";")[0]
            else:
                output_shape = grad_output.shape[:-2] + grad_output.shape[
                    -1:] + x1.shape[-1:]
                grad_x2 = matmul_forward(
                    "MatMul", [grad_output, x1],
                    output_dtypes=[x2.dtype],
                    output_shapes=[output_shape],
                    attr_code="op.jt_name=\"matmul_trans_0\";")[0]
        else:
            if reshape_grad_x2:
                output_shape = x1.shape[1:-2] + x1.shape[
                    -1:] + grad_output.shape[-1:]
                grad_x2 = matmul_forward(
                    "MatMul", [
                        x1.reshape(-1, x1.shape[-1]),
                        grad_output.reshape(-1, grad_output.shape[-1])
                    ],
                    output_dtypes=[x2.dtype],
                    output_shapes=[output_shape],
                    attr_code="op.jt_name=\"matmul_trans_0\";")[0]
            else:
                output_shape = x1.shape[:-2] + x1.shape[
                    -1:] + grad_output.shape[-1:]
                grad_x2 = matmul_forward(
                    "MatMul", [x1, grad_output],
                    output_dtypes=[x2.dtype],
                    output_shapes=[output_shape],
                    attr_code="op.jt_name=\"matmul_trans_0\";")[0]
        return grad_x1, grad_x2
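A usage sketch for the class above (assuming it is exported as-is). With trans_x2=True the second operand is treated as transposed without materializing the transpose; setupInputDesc in the C++ runner below builds a fake-transposed ACL tensor view instead.

import jittor as jt

a = jt.rand(8, 16)
w = jt.rand(32, 16)
y = MatmulACL(trans_x2=True)(a, w)   # [8, 16] x [32, 16]^T -> [8, 32]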
@ -0,0 +1,77 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "matmul_op_acl.h"

namespace jittor
{
    MatMulOpRunner::MatMulOpRunner() : BaseOpRunner("MatMul")
    {
    }

    void MatMulOpRunner::setupInputDesc()
    {
        auto input_num = in_.size();
        for (int input_idx = 0; input_idx < input_num; input_idx++)
        {
            std::vector<int64_t> shape;
            for (int j = 0; j < in_[input_idx]->shape.size(); j++)
            {
                shape.push_back(in_[input_idx]->shape[j]);
            }
            inputShapes.push_back(shape);
        }
        for (int idx = 0; idx < input_num; idx++)
        {
            inputTensors.push_back(nullptr);
            if ((jt_name == "matmul_trans_1" && idx == 1) || (jt_name == "matmul_trans_0" && idx == 0))
            {
                auto ret = CreateFakeTransAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
                CHECK_RET(ret == ACL_SUCCESS, return);
            }
            else
            {
                auto ret = CreateAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
                CHECK_RET(ret == ACL_SUCCESS, return);
            }
        }
    }

    void MatMulOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        ret = aclnnMatmulGetWorkspaceSize(inputTensors[0], inputTensors[1], outputTensors[0], 1, &workspaceSize, &executor);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMatmulGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }
        ret = aclnnMatmul(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMatmul failed. ERROR: %d\n", name.c_str(), ret); return);
        syncRun();
    }
}
@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class MatMulOpRunner : public BaseOpRunner
    {
    protected:
        void setupInputDesc() override;
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        MatMulOpRunner();
    };
}
@ -0,0 +1,75 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def nantonum_cmd(name: str,
                 inputs: list,
                 output_dtypes: list = None,
                 output_shapes: list = None,
                 attr_code: str = "",
                 attr_header: str = "",
                 outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class NanToNumACL(jt.Function):

    def __init__(self):
        super(NanToNumACL, self).__init__()

    def execute(self, input, nan_or_inf):
        # NaN is replaced by nan_or_inf, and +/-inf by its negation.
        attr_code = f"""
        op.jt_name = "NanToNum";
        NanToNumAttr *attr = new NanToNumAttr();
        attr->nan = {nan_or_inf};
        attr->posinf = {-nan_or_inf};
        attr->neginf = {-nan_or_inf};
        op.op_attr.reset(attr);
        """
        self.attr_code = attr_code
        result = nantonum_cmd("NanToNum", [input],
                              output_dtypes=[input.dtype],
                              output_shapes=[input.shape],
                              attr_code=self.attr_code)[0]
        return result
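A usage sketch (assuming NanToNumACL is exported as-is): per the attr_code above, NaN is mapped to nan_or_inf and both infinities to its negation.

import jittor as jt

x = jt.array([1.0, float("nan"), float("inf"), -float("inf")])
y = NanToNumACL()(x, 0.0)   # -> [1.0, 0.0, -0.0, -0.0]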
@ -0,0 +1,58 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "nantonum_op_acl.h"

namespace jittor
{
    NanToNumOpRunner::NanToNumOpRunner() : BaseOpRunner("NanToNum")
    {
    }

    void NanToNumOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<NanToNumAttr *>(op_attr.get());
        ret = aclnnNanToNumGetWorkspaceSize(inputTensors[0], attr->nan, attr->posinf, attr->neginf, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnNanToNum(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnNanToNum failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

}
@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class NanToNumOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        NanToNumOpRunner();
    };

}
@ -0,0 +1,184 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def norms_cmd(name: str,
              inputs: list,
              output_dtypes: list = None,
              output_shapes: list = None,
              attr_code: str = "",
              attr_header: str = "",
              outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class BatchNormACL(jt.Function):

    def __init__(self,
                 num_features,
                 eps=1e-05,
                 momentum=0.1,
                 affine=True,
                 is_train=True,
                 sync=True):
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum
        self.affine = affine
        self.is_train = is_train
        self.sync = sync
        self.weight = jt.init.constant(
            (num_features, ), "float32", 1.0) if affine else 1.0
        self.bias = jt.init.constant(
            (num_features, ), "float32", 0.0) if affine else 0.0
        self.running_mean = jt.init.constant((num_features, ), "float32",
                                             0.0).stop_grad()
        self.running_var = jt.init.constant((num_features, ), "float32",
                                            1.0).stop_grad()

    def execute(self, x):
        self.input = x.float32()
        inputs = [
            self.input, self.weight, self.bias, self.running_mean,
            self.running_var
        ]
        # aclnnBatchNorm also emits the saved mean/invstd needed for backward.
        outputs = [
            jt.empty(x.shape),
            jt.empty(self.num_features),
            jt.empty(self.num_features)
        ]
        attr_code = f"""
        op.jt_name = "batchnorm";
        BatchNormAttr *attr = new BatchNormAttr();
        attr->is_train = {"true" if self.is_train else "false"};
        attr->momentum = {self.momentum};
        attr->eps = {self.eps};
        op.op_attr.reset(attr);
        """
        result = norms_cmd("BatchNorm",
                           inputs=inputs,
                           outputs=outputs,
                           attr_code=attr_code)
        self.output = result[0]
        self.saveMean = result[1]
        self.saveInvstd = result[2]
        return self.output

    def grad(self, grad_output):
        attr_code = f"""
        op.jt_name = "batchnorm";
        BatchNormAttr *attr = new BatchNormAttr();
        attr->is_train = {"true" if self.is_train else "false"};
        attr->momentum = {self.momentum};
        attr->eps = {self.eps};
        op.op_attr.reset(attr);
        """
        inputs = [
            grad_output, self.input, self.weight, self.running_mean,
            self.running_var, self.saveMean, self.saveInvstd
        ]
        outputs = [
            jt.empty(self.input.shape),
            jt.empty(self.num_features),
            jt.empty(self.num_features)
        ]
        grad_input = norms_cmd("BatchNormBackward",
                               inputs=inputs,
                               outputs=outputs,
                               attr_code=attr_code)[0]
        return grad_input


class LayerNormACL(jt.Function):

    def __init__(self,
                 normalized_shape,
                 eps: float = 1e-5,
                 elementwise_affine: bool = True):
        if isinstance(normalized_shape, int):
            normalized_shape = (normalized_shape, )
        self.normalized_shape = tuple(normalized_shape)
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        self.weight = jt.init.constant(normalized_shape, "float32",
                                       1.0) if elementwise_affine else 1.0
        self.bias = jt.init.constant(normalized_shape, "float32",
                                     0.0) if elementwise_affine else 0.0

    def execute(self, x):
        self.input = x.float32()
        inputs = [self.input, self.weight, self.bias]
        outputs = [jt.empty(x.shape), jt.empty(x.shape), jt.empty(x.shape)]
        attr_code = f"""
        op.jt_name = "layernorm";
        LayerNormAttr *attr = new LayerNormAttr();
        attr->eps = {self.eps};
        attr->normalizedShape = {{{', '.join(map(str, (list(self.normalized_shape))))}}};
        attr->size = {x.shape[-1]};
        op.op_attr.reset(attr);
        """
        result = norms_cmd("LayerNorm",
                           inputs=inputs,
                           outputs=outputs,
                           attr_code=attr_code)
        self.output = result[0]
        self.meanout = result[1]
        self.rstdout = result[2]
        return self.output

    def grad(self, grad_output):
        # NOTE: this backward is copied from the batchnorm path: it builds a
        # BatchNormAttr, reads attributes (is_train, momentum, running stats,
        # num_features) that LayerNormACL never defines, and dispatches to
        # "SoftmaxBackward"; a dedicated layernorm backward runner is still
        # missing.
        attr_code = f"""
        op.jt_name = "batchnorm";
        BatchNormAttr *attr = new BatchNormAttr();
        attr->is_train = {"true" if self.is_train else "false"};
        attr->momentum = {self.momentum};
        attr->eps = {self.eps};
        op.op_attr.reset(attr);
        """
        inputs = [
            grad_output, self.input, self.weight, self.running_mean,
            self.running_var, self.saveMean, self.saveInvstd
        ]
        outputs = [
            jt.empty(self.input.shape),
            jt.empty(self.num_features),
            jt.empty(self.num_features)
        ]
        grad_input = norms_cmd("SoftmaxBackward",
                               inputs=inputs,
                               outputs=outputs,
                               attr_code=attr_code)[0]
        return grad_input
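A usage sketch for the two Functions above (assuming they are exported as-is); note that only BatchNormACL currently has a matching backward runner.

import jittor as jt

bn = BatchNormACL(num_features=16)
y = bn(jt.rand(8, 16, 4, 4))           # also stores saveMean/saveInvstd for grad

ln = LayerNormACL(normalized_shape=16)
z = ln(jt.rand(8, 10, 16))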
@ -0,0 +1,111 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "norms_op_acl.h"

namespace jittor
{
    BatchNormOpRunner::BatchNormOpRunner() : BaseOpRunner("BatchNorm")
    {
    }

    void BatchNormOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<BatchNormAttr *>(op_attr.get());
        ret = aclnnBatchNormGetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], inputTensors[3], inputTensors[4], attr->is_train, attr->momentum, attr->eps, outputTensors[0], outputTensors[1], outputTensors[2], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnBatchNorm(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnBatchNorm failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

    BatchNormBackwardOpRunner::BatchNormBackwardOpRunner() : BaseOpRunner("BatchNormBackward")
    {
    }

    void BatchNormBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<BatchNormAttr *>(op_attr.get());
        bool outputMask[3] = {true, true, true};
        aclBoolArray *outMask = aclCreateBoolArray(outputMask, 3);
        ret = aclnnBatchNormBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], inputTensors[3], inputTensors[4], inputTensors[5], inputTensors[6], attr->is_train, attr->eps, outMask, outputTensors[0], outputTensors[1], outputTensors[2], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnBatchNormBackward(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnBatchNormBackward failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

    LayerNormOpRunner::LayerNormOpRunner() : BaseOpRunner("LayerNorm")
    {
    }

    void LayerNormOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<LayerNormAttr *>(op_attr.get());
        aclIntArray *normalizedShape = nullptr;
        normalizedShape = aclCreateIntArray(attr->normalizedShape.data(), attr->size);
        ret = aclnnLayerNormGetWorkspaceSize(inputTensors[0], normalizedShape, inputTensors[1], inputTensors[2], attr->eps, outputTensors[0], outputTensors[1], outputTensors[2], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnLayerNorm(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnLayerNorm failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        aclDestroyIntArray(normalizedShape);

        return;
    }

}
@ -0,0 +1,34 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class BatchNormOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        BatchNormOpRunner();
    };

    class BatchNormBackwardOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        BatchNormBackwardOpRunner();
    };

    class LayerNormOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        LayerNormOpRunner();
    };

}
@ -0,0 +1,176 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def pool_cmd(name: str,
             inputs: list,
             output_dtypes: list = None,
             output_shapes: list = None,
             attr_code: str = "",
             attr_header: str = "",
             outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class PoolACL(jt.Function):

    def __init__(self,
                 kernel_size,
                 stride=None,
                 padding=0,
                 dilation=None,
                 return_indices=None,
                 ceil_mode=False,
                 count_include_pad=True,
                 op='maximum'):
        self.kernel_size = kernel_size if isinstance(
            kernel_size, tuple) else (kernel_size, kernel_size)
        stride = stride if stride else kernel_size
        self.stride = stride if isinstance(stride, tuple) else (stride, stride)
        self.padding = padding if isinstance(padding, tuple) else (padding,
                                                                   padding)
        dilation = dilation if dilation else 1
        assert dilation == 1
        self.dilation = dilation if isinstance(dilation, tuple) else (dilation,
                                                                      dilation)
        for item in self.kernel_size:
            if item <= 0:
                raise RuntimeError(
                    f"kernel_size must be greater than zero, but got {item}")
        for item in self.stride:
            if item <= 0:
                raise RuntimeError(
                    f"stride must be greater than zero, but got {item}")
        for item in self.padding:
            if item < 0:
                raise RuntimeError(
                    f"padding must be non-negative, but got {item}")
        self.op = op
        self.return_indices = return_indices
        self.ceil_mode = ceil_mode
        self.count_include_pad = count_include_pad

    def execute(self, input):
        self.input = input
        attr_code = f"""
        op.jt_name = "{"avgpool" if self.op == 'mean' else "maxpool"}";
        PoolAttr *attr = new PoolAttr();
        attr->kernel_size = {{ {self.kernel_size[0]}, {self.kernel_size[1]} }};
        attr->poolStrides = {{ {self.stride[0]}, {self.stride[1]} }};
        attr->poolPads = {{ {self.padding[0]}, {self.padding[1]} }};
        attr->poolDilations = {{ {self.dilation[0]}, {self.dilation[1]} }};
        attr->poolCeil = {"true" if self.ceil_mode else "false"};
        attr->countIncludePad = {"true" if self.count_include_pad else "false"};
        op.op_attr.reset(attr);
        """
        input_height, input_width = input.shape[-2:]
        kernel_height, kernel_width = self.kernel_size[-2:]

        output_height = (input_height + 2 * self.padding[0] -
                         (kernel_height - 1) - 1) // self.stride[0] + 1
        output_width = (input_width + 2 * self.padding[1] -
                        (kernel_width - 1) - 1) // self.stride[1] + 1

        output_shape = (input.shape[0], input.shape[1], output_height,
                        output_width)

        inputs = [input]

        if self.op == 'maximum':
            # Maxpool also produces the argmax indices, kept for backward.
            result = pool_cmd(
                "Maxpool",
                inputs,
                output_dtypes=[input.dtype, 'int32'],
                output_shapes=[output_shape, output_shape],
                attr_code=attr_code,
            )
        elif self.op == 'mean':
            result = pool_cmd(
                "Avgpool",
                inputs,
                output_dtypes=[input.dtype],
                output_shapes=[output_shape],
                attr_code=attr_code,
            )
        else:
            raise ValueError(f"unsupported pool op: {self.op}")

        if self.op == 'maximum':
            self.index = result[1]

        if self.return_indices:
            return result[0], result[1]
        else:
            return result[0]

    def grad(self, grad_output):
        input = self.input
        attr_code = f"""
        op.jt_name = "{"avgpoolbackward" if self.op == 'mean' else "maxpoolbackward"}";
        PoolAttr *attr = new PoolAttr();
        attr->kernel_size = {{ {self.kernel_size[0]}, {self.kernel_size[1]} }};
        attr->poolStrides = {{ {self.stride[0]}, {self.stride[1]} }};
        attr->poolPads = {{ {self.padding[0]}, {self.padding[1]} }};
        attr->poolDilations = {{ {self.dilation[0]}, {self.dilation[1]} }};
        attr->poolCeil = {"true" if self.ceil_mode else "false"};
        attr->countIncludePad = {"true" if self.count_include_pad else "false"};
        op.op_attr.reset(attr);
        """
        output_shapes = [input.shape]
        output_dtypes = [input.dtype]
        if self.op == 'maximum':
            result = pool_cmd("MaxpoolBackward",
                              inputs=[grad_output, input, self.index],
                              output_dtypes=output_dtypes,
                              output_shapes=output_shapes,
                              attr_code=attr_code)[0]
        elif self.op == 'mean':
            result = pool_cmd("AvgpoolBackward",
                              inputs=[grad_output, input],
                              output_dtypes=output_dtypes,
                              output_shapes=output_shapes,
                              attr_code=attr_code)[0]
        else:
            raise ValueError(f"unsupported pool op: {self.op}")
        return result
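A usage sketch (assuming PoolACL is exported as-is); the max-pool path keeps the argmax indices so the backward can route gradients.

import jittor as jt

x = jt.rand(2, 8, 32, 32)
y, idx = PoolACL(3, stride=2, padding=1, op='maximum', return_indices=True)(x)
avg = PoolACL(2, op='mean')(x)   # stride defaults to kernel_size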
@ -0,0 +1,187 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "pool_op_acl.h"

namespace jittor
{
    MaxpoolOpRunner::MaxpoolOpRunner() : BaseOpRunner("Maxpool")
    {
        use_nchw = true;
    }

    void MaxpoolOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        aclIntArray *strides = nullptr;
        aclIntArray *pads = nullptr;
        aclIntArray *dilations = nullptr;
        aclIntArray *kernel_size = nullptr;

        auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
        kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
        strides = aclCreateIntArray(attr->poolStrides.data(), 2);
        pads = aclCreateIntArray(attr->poolPads.data(), 2);
        dilations = aclCreateIntArray(attr->poolDilations.data(), 2);
        ret = aclnnMaxPool2dWithIndicesGetWorkspaceSize(inputTensors[0], kernel_size, strides, pads, dilations, attr->poolCeil, outputTensors[0], outputTensors[1], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnMaxPool2dWithIndices(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMaxPool2dWithIndices failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyIntArray(strides);
        aclDestroyIntArray(pads);
        aclDestroyIntArray(dilations);
        aclDestroyIntArray(kernel_size);

        return;
    }

    AvgpoolOpRunner::AvgpoolOpRunner() : BaseOpRunner("Avgpool")
    {
        use_nchw = true;
    }

    void AvgpoolOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        aclIntArray *strides = nullptr;
        aclIntArray *pads = nullptr;
        aclIntArray *kernel_size = nullptr;

        auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
        kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
        strides = aclCreateIntArray(attr->poolStrides.data(), 2);
        pads = aclCreateIntArray(attr->poolPads.data(), 2);
        ret = aclnnAvgPool2dGetWorkspaceSize(inputTensors[0], kernel_size, strides, pads, attr->poolCeil, attr->countIncludePad, attr->divisorOverride, attr->divisorOverride, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnAvgPool2d(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnAvgPool2d failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyIntArray(strides);
        aclDestroyIntArray(pads);
        aclDestroyIntArray(kernel_size);

        return;
    }

    MaxpoolBackwardOpRunner::MaxpoolBackwardOpRunner() : BaseOpRunner("MaxpoolBackward")
    {
        use_nchw = true;
    }

    void MaxpoolBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        aclIntArray *strides = nullptr;
        aclIntArray *pads = nullptr;
        aclIntArray *dilations = nullptr;
        aclIntArray *kernel_size = nullptr;

        auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
        kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
        strides = aclCreateIntArray(attr->poolStrides.data(), 2);
        pads = aclCreateIntArray(attr->poolPads.data(), 2);
        dilations = aclCreateIntArray(attr->poolDilations.data(), 2);
        ret = aclnnMaxPool2dWithIndicesBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], kernel_size, strides, pads, dilations, attr->poolCeil, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnMaxPool2dWithIndicesBackward(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMaxPool2dWithIndicesBackward failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyIntArray(strides);
        aclDestroyIntArray(pads);
        aclDestroyIntArray(dilations);
        aclDestroyIntArray(kernel_size);

        return;
    }

    AvgpoolBackwardOpRunner::AvgpoolBackwardOpRunner() : BaseOpRunner("AvgpoolBackward")
    {
        use_nchw = true;
    }

    void AvgpoolBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        aclIntArray *strides = nullptr;
        aclIntArray *pads = nullptr;
        aclIntArray *kernel_size = nullptr;

        auto attr = dynamic_cast<PoolAttr *>(op_attr.get());
        kernel_size = aclCreateIntArray(attr->kernel_size.data(), 2);
        strides = aclCreateIntArray(attr->poolStrides.data(), 2);
        pads = aclCreateIntArray(attr->poolPads.data(), 2);
        ret = aclnnAvgPool2dBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], kernel_size, strides, pads, attr->countIncludePad, attr->divisorOverride, attr->divisorOverride, attr->poolCeil, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnAvgPool2dBackward(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnAvgPool2dBackward failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyIntArray(strides);
        aclDestroyIntArray(pads);
        aclDestroyIntArray(kernel_size);

        return;
    }

}
@ -0,0 +1,46 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class MaxpoolOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        MaxpoolOpRunner();
    };

    class AvgpoolOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        AvgpoolOpRunner();
    };

    class MaxpoolBackwardOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        MaxpoolBackwardOpRunner();
    };

    class AvgpoolBackwardOpRunner : public BaseOpRunner
    {
    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        AvgpoolBackwardOpRunner();
    };
}
@ -0,0 +1,82 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "random_op_acl.h"

namespace jittor
{
    RandomOpRunner::RandomOpRunner() : BaseOpRunner("RandomUniform")
    {
        name = "RandomUniform";
    }

    RandomOpRunner::RandomOpRunner(const string &_name) : BaseOpRunner(_name)
    {
        name = _name;
    }

    void RandomOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<RandomAttr *>(op_attr.get());
        if (name == "RandomUniform")
        {
            ret = aclnnInplaceUniformGetWorkspaceSize(outputTensors[0], 0.0, 1.0, attr->seed, attr->offset, &workspaceSize, &executor);
            checkRet(ret);

            if (workspaceSize > 0)
            {
                mallocWorkSpace(workspaceSize);
            }

            ret = aclnnInplaceUniform(workspaceAddr, workspaceSize, executor, aclstream);
            CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnInplaceUniform failed. ERROR: %d\n", name.c_str(), ret); return);
        }
        else if (name == "RandomNormal")
        {
            ret = aclnnInplaceNormalGetWorkspaceSize(outputTensors[0], 0.0, 1.0, attr->seed, attr->offset, &workspaceSize, &executor);
            checkRet(ret);

            if (workspaceSize > 0)
            {
                mallocWorkSpace(workspaceSize);
            }

            ret = aclnnInplaceNormal(workspaceAddr, workspaceSize, executor, aclstream);
            CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnInplaceNormal failed. ERROR: %d\n", name.c_str(), ret); return);
        }
        else
        {
            LOGf << "Not supported random type : " << name;
        }
        syncRun();
        return;
    }
}
@ -0,0 +1,18 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class RandomOpRunner : public BaseOpRunner
    {
    protected:
        string name; // special to random op
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        RandomOpRunner();
        RandomOpRunner(const string &name);
    };
}
@ -0,0 +1,127 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "reduce_op_acl.h"

namespace jittor
{
    ReduceOpRunner::ReduceOpRunner() : BaseOpRunner("reduce")
    {
        use_nchw = false;
    }

    void ReduceOpRunner::setupOutputDesc()
    {
        auto output_num = out_.size();

        for (int output_idx = 0; output_idx < output_num; output_idx++)
        {
            std::vector<int64_t> shape;
            for (int j = 0; j < out_[output_idx]->shape.size(); j++)
            {
                shape.push_back(out_[output_idx]->shape[j]);
            }
            outputShapes.push_back(shape);
        }

        attr = dynamic_cast<ReduceAttr *>(op_attr.get());
        dim = aclCreateIntArray(attr->axes.data(), attr->axes.size());
        keepdims = attr->keepdims;

        if (op_idx < 13)
        {
            if (attr->axes.size() == in_[0]->shape.size())
                outputShapes[0] = {};
        }

        for (int idx = 0; idx < output_num; idx++)
        {
            outputTensors.push_back(nullptr);
            auto ret = CreateAclTensor(outputShapes[idx], out_[idx]->mem_ptr, out_[idx]->size, get_dtype(out_[idx]->dtype()), &outputTensors[idx], use_nchw);
            CHECK_RET(ret == ACL_SUCCESS, return);
        }
    }

    void ReduceOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        switch (op_idx)
        {
        case 9:
        {
            ret = aclnnReduceSumGetWorkspaceSize(inputTensors[0], dim, keepdims, get_dtype(out_[0]->dtype()), outputTensors[0], &workspaceSize, &executor);
            CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnReduceSumGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
            if (workspaceSize > 0)
            {
                mallocWorkSpace(workspaceSize);
            }
            ret = aclnnReduceSum(workspaceAddr, workspaceSize, executor, aclstream);
            break;
        }
        case 10:
        {
            ret = aclnnMeanGetWorkspaceSize(inputTensors[0], dim, keepdims, get_dtype(out_[0]->dtype()), outputTensors[0], &workspaceSize, &executor);
            CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnMeanGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
            if (workspaceSize > 0)
            {
                mallocWorkSpace(workspaceSize);
            }
            ret = aclnnMean(workspaceAddr, workspaceSize, executor, aclstream);
            break;
        }
        case 11:
        {
            ret = aclnnAmaxGetWorkspaceSize(inputTensors[0], dim, keepdims, outputTensors[0], &workspaceSize, &executor);
            CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnAmaxGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
            if (workspaceSize > 0)
            {
                mallocWorkSpace(workspaceSize);
            }
            ret = aclnnAmax(workspaceAddr, workspaceSize, executor, aclstream);
            break;
        }
        case 12:
        {
            ret = aclnnAminGetWorkspaceSize(inputTensors[0], dim, keepdims, outputTensors[0], &workspaceSize, &executor);
            CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnAminGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
            if (workspaceSize > 0)
            {
                mallocWorkSpace(workspaceSize);
            }
            ret = aclnnAmin(workspaceAddr, workspaceSize, executor, aclstream);
            break;
        }
        default:
        {
            LOGir << "no such reduce!!";
            exit(-1);
        }
        }
        syncRun();
        return;
    }
}
@ -0,0 +1,21 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    struct ReduceOpRunner : public BaseOpRunner
    {
        int op_idx; // selects the reduction kind (see executeOp: 9 sum, 10 mean, 11 amax, 12 amin)

        ReduceOpRunner();

    protected:
        ReduceAttr *attr;
        aclIntArray *dim;
        bool keepdims;

        void setupOutputDesc() override;
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
    };
}
@ -0,0 +1,115 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def relu_cmd(name: str,
             inputs: list,
             output_dtypes: list = None,
             output_shapes: list = None,
             attr_code: str = "",
             attr_header: str = "",
             outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class ReLUACL(jt.Function):

    def __init__(self):
        super(ReLUACL, self).__init__()

    def execute(self, x):
        x = x.float32()
        self.input = x
        result = relu_cmd("Unary", [x],
                          output_dtypes=[x.dtype],
                          output_shapes=[x.shape],
                          attr_code="op.name=\"ReLU\";")[0]
        return result

    def grad(self, grad_output):
        # relu'(x) = 1 for x > 0 and 0 otherwise: build the mask with a
        # Greater op, then gate the incoming gradient with an elementwise Mul.
        mask = relu_cmd("Binary",
                        [self.input, jt.zeros(self.input.shape)],
                        output_dtypes=[self.input.dtype],
                        output_shapes=[self.input.shape],
                        attr_code="op.name=\"Greater\";")[0]
        grad_input = relu_cmd("Binary", [grad_output, mask],
                              output_dtypes=[grad_output.dtype],
                              output_shapes=[grad_output.shape],
                              attr_code="op.name=\"Mul\";")[0]
        return grad_input


class LeakyReLUACL(jt.Function):

    def __init__(self):
        super(LeakyReLUACL, self).__init__()

    def execute(self, x, negative_slope=0.01):
        x = x.float32()
        self.input = x
        attr_code = f"""
        op.jt_name = "leakyrelu";
        LeakyReluAttr *attr = new LeakyReluAttr();
        attr->negativeSlope = {negative_slope};
        op.op_attr.reset(attr);
        """
        result = relu_cmd("LeakyReLU", [x],
                          output_dtypes=[x.dtype],
                          output_shapes=[x.shape],
                          attr_code=attr_code)[0]
        self.negative_slope = negative_slope
        return result

    def grad(self, grad_output):
        attr_code = f"""
        op.jt_name = "leakyrelubackward";
        LeakyReluAttr *attr = new LeakyReluAttr();
        attr->negativeSlope = {self.negative_slope};
        attr->selfIsResult = false;
        op.op_attr.reset(attr);
        """
        grad_input = relu_cmd("LeakyReLUBackward", [grad_output, self.input],
                              output_dtypes=[grad_output.dtype],
                              output_shapes=[grad_output.shape],
                              attr_code=attr_code)[0]
        return grad_input
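
# A minimal usage sketch (illustration only, not part of the original file):
# both wrappers behave like ordinary jt.Function activations, so forward and
# backward run through the ACL runners defined above. Assumes an Ascend
# device with the ACL extension enabled.
def _relu_acl_demo():
    x = jt.randn(4, 8)
    y = ReLUACL()(x)                 # forward via the "Unary" ReLU runner
    g = jt.grad(y.sum(), x)          # backward via the Greater + Mul runners
    z = LeakyReLUACL()(x, 0.2)       # forward via aclnnLeakyRelu
    return y, g, z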
@ -0,0 +1,90 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "relu_op_acl.h"

namespace jittor
{
    LeakyReLUOpRunner::LeakyReLUOpRunner() : BaseOpRunner("LeakyReLU")
    {
    }

    void LeakyReLUOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        aclScalar *negativeSlope = nullptr;

        auto attr = dynamic_cast<LeakyReluAttr *>(op_attr.get());
        negativeSlope = aclCreateScalar(&attr->negativeSlope, aclDataType::ACL_FLOAT);
        ret = aclnnLeakyReluGetWorkspaceSize(inputTensors[0], negativeSlope, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnLeakyRelu(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnLeakyRelu failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyScalar(negativeSlope);
        return;
    }

    LeakyReLUBackwardOpRunner::LeakyReLUBackwardOpRunner() : BaseOpRunner("LeakyReLUBackward")
    {
    }

    void LeakyReLUBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        aclScalar *negativeSlope = nullptr;

        auto attr = dynamic_cast<LeakyReluAttr *>(op_attr.get());
        negativeSlope = aclCreateScalar(&attr->negativeSlope, aclDataType::ACL_FLOAT);
        ret = aclnnLeakyReluBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], negativeSlope, attr->selfIsResult, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnLeakyReluBackward(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnLeakyReluBackward failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyScalar(negativeSlope);
        return;
    }

}
@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class LeakyReLUOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        LeakyReLUOpRunner();
    };

    class LeakyReLUBackwardOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        LeakyReLUBackwardOpRunner();
    };

}
@ -0,0 +1,84 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def rope_cmd(name: str,
             inputs: list,
             output_dtypes: list = None,
             output_shapes: list = None,
             attr_code: str = "",
             attr_header: str = "",
             outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class RopeACL(jt.Function):

    def __init__(self):
        super(RopeACL, self).__init__()

    def execute(self, xq, xk, freqs_cis, freq_cos, freq_sin):
        attr_code = f"""
        op.jt_name = "RotaryPosEmb";
        """
        # The cos/sin tables may arrive packed in freqs_cis or as two tensors.
        if freqs_cis is not None:
            freq_cos = freqs_cis[..., 0]
            freq_sin = freqs_cis[..., 1]
        else:
            assert freq_cos is not None and freq_sin is not None
        inputs = [xq, xk, freq_cos, freq_sin]
        results = rope_cmd("RotaryPosEmb",
                           inputs,
                           output_dtypes=[
                               xq.dtype,
                           ],
                           output_shapes=[
                               xq.shape,
                           ],
                           attr_code=attr_code)
        results[0].sync()
        # The underlying op rotates query and key in place, so the (mutated)
        # inputs are the real outputs here.
        return inputs[0], inputs[1]

    def grad(self, grad_output):
        # Backward treats the rotation as a pass-through: the incoming
        # gradient is returned unchanged.
        return grad_output
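
# A minimal usage sketch (illustration only; the shapes below are assumptions,
# not taken from the original file, and must match what aclnnApplyRotaryPosEmb
# expects on the target device):
def _rope_acl_demo():
    b, s, h, d = 1, 16, 4, 32
    xq = jt.randn(b, s, h, d)
    xk = jt.randn(b, s, h, d)
    freq_cos = jt.randn(s, d)
    freq_sin = jt.randn(s, d)
    xq2, xk2 = RopeACL()(xq, xk, None, freq_cos, freq_sin)
    return xq2, xk2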
@ -0,0 +1,57 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "rope_op_acl.h"

namespace jittor
{
    RotaryPosEmbOpRunner::RotaryPosEmbOpRunner() : BaseOpRunner("RotaryPosEmb")
    {
    }

    // Rotates query (in0) and key (in1) in place using the cos/sin tables (in2/in3).
    void RotaryPosEmbOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        ret = aclnnApplyRotaryPosEmbGetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], inputTensors[3], (int64_t)1, &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnApplyRotaryPosEmb(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnApplyRotaryPosEmb failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

}
@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class RotaryPosEmbOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        RotaryPosEmbOpRunner();
    };

}
@ -0,0 +1,356 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def setitem_cmd(name: str,
                inputs: list,
                output_dtypes: list = None,
                output_shapes: list = None,
                attr_code: str = "",
                attr_header: str = "",
                outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


def setitem_forward(name: str,
                    inputs: list,
                    output_dtypes: list = None,
                    output_shapes: list = None,
                    attr_code: str = "",
                    attr_header: str = "",
                    outputs: list = None,
                    extra_data: dict = {}):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    op.add(out0, false);
    {attr_code}
    op.run();""",
                   data=extra_data)


def caculate_shape(tensors):
    # Recursively infer the shape of a Var / scalar / nested list structure.
    if isinstance(tensors, jt.Var):
        return tensors.shape
    elif isinstance(tensors, (int, float)):
        return []
    elif isinstance(tensors, (list, tuple)):
        sub_shape = caculate_shape(tensors[0])
        # list(...) so jt shapes and plain lists concatenate uniformly
        return [len(tensors)] + list(sub_shape)
    else:
        assert False, f"not implemented for {type(tensors)}"


def can_broadcast_and_shape(shape1, shape2):
    """
    Check whether two shapes are broadcast-compatible and compute the
    broadcast shape.

    Args:
    - shape1: shape of the first tensor (tuple or list)
    - shape2: shape of the second tensor (tuple or list)

    Returns:
    - can_broadcast: bool, whether the two shapes can be broadcast together
    - broadcast_shape: the broadcast shape if compatible, otherwise None
    """
    # Normalize to tuples in case lists were passed in.
    shape1 = tuple(shape1)
    shape2 = tuple(shape2)

    # Left-pad the shorter shape with 1s so both have the same rank.
    len1, len2 = len(shape1), len(shape2)
    if len1 < len2:
        shape1 = (1, ) * (len2 - len1) + shape1
    elif len2 < len1:
        shape2 = (1, ) * (len1 - len2) + shape2

    broadcast_shape = []

    # Compare the aligned dimensions pairwise.
    for dim1, dim2 in zip(shape1, shape2):
        if dim1 == dim2:
            broadcast_shape.append(dim1)
        elif dim1 == 1:
            broadcast_shape.append(dim2)
        elif dim2 == 1:
            broadcast_shape.append(dim1)
        else:
            # Incompatible on this dimension: the shapes cannot be broadcast.
            return False, None

    return True, tuple(broadcast_shape)
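
# A short worked example (illustration only): NumPy-style broadcasting pads
# the shorter shape with leading 1s, then each axis must match or be 1.
assert can_broadcast_and_shape((4, 1, 3), (5, 3)) == (True, (4, 5, 3))
assert can_broadcast_and_shape((4, 2), (3, )) == (False, None)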


class SetItemACL(jt.Function):

    def __init__(self):
        self.type_ = 'notype'
        self.value_var = True

    def stride(self, x, dim):
        # Row-major stride of `x` along `dim`.
        stride = 1
        for i in range(dim + 1, len(x.shape)):
            stride *= x.shape[i]
        return stride

    def execute(self, x, slices, value):
        self.x_shape = x.shape
        self.input_slice = slices
        if not isinstance(value, jt.Var):
            self.value_var = False
        if isinstance(slices, jt.Var):
            if slices.dtype == "bool":
                # Boolean-mask assignment: scatter `value` into the masked slots.
                slices_len = slices.sum().item()
                if slices_len == 0:
                    return x
                if isinstance(value, int) or isinstance(value, float):
                    value = jt.full((slices_len, ), value, dtype=x.dtype)
                assert slices.shape == x.shape, "setitem shape not match"
                assert len(value.shape) == 1, "value shape must be 1D"
                assert value.shape[
                    0] == slices_len, "value shape length must be equal to slices sum"
                self.type_ = 'mask'
                self.value_shape = value.shape
                inputs = [slices, value]
                outputs = [x.clone()]
                attr_code = f"""
                op.jt_name = "inplacemaskedscatter";
                """
                result = setitem_cmd("InplaceMaskedScatter",
                                     inputs=inputs,
                                     outputs=outputs,
                                     attr_code=attr_code)[0]
                return result

        if not isinstance(slices, tuple):
            slices = (slices, )
        slices = list(slices)
        for i, s in enumerate(slices):
            if isinstance(s, int) and s < 0:
                slices[i] = x.shape[i] + s
        slices = tuple(slices)
        slices_list = list(slices)
        # Check whether the subscript contains a slice / Ellipsis component.
        contains_slice = False
        for s in slices:
            if not isinstance(s, jt.Var) and (isinstance(s, slice)
                                              or s == Ellipsis):
                contains_slice = True
                break
        if not contains_slice:
            # Pure index assignment (ints / Vars / index lists): lower to
            # IndexPutImpl with broadcast index tensors.
            indices = []
            value_shape = []
            slices_len = len(slices)
            broadcast_shape = caculate_shape(slices_list[0])
            for ii in range(1, len(slices)):
                ok, broadcast_shape = can_broadcast_and_shape(
                    broadcast_shape, caculate_shape(slices_list[ii]))
                assert ok is True, "can not broadcast"
            value_shape = list(broadcast_shape) + list(x.shape[slices_len:])
            if value_shape == []:
                value_shape = [1]
            if isinstance(value, int) or isinstance(value, float):
                value = jt.full(value_shape, value)
            self.value_shape = value_shape
            for ii in slices:
                indices.append(jt.Var(ii).int32())
            if isinstance(slices[0],
                          jt.Var) or isinstance(slices[0], int) or isinstance(
                              slices[0], list) or isinstance(slices[0], tuple):
                self.indices = indices
                self.type_ = 'index'
                attr_code = f"""
                op.jt_name = "indexputimpl";
                """
                inputs = [value] + indices
                outputs = [x.clone()]
                result = setitem_cmd("IndexPutImpl",
                                     inputs=inputs,
                                     outputs=outputs,
                                     attr_code=attr_code)[0]
                return result
            assert False, "not supported"
        assert contains_slice, "slice type error"
        x_dim = len(x.shape)
        slices = list(slices)
        # Expand a single Ellipsis into the equivalent run of full slices.
        for s in slices:
            if not isinstance(s, jt.Var) and s == Ellipsis:
                slices = slices[:slices.index(s)] + [
                    slice(None, None, None)
                ] * (x_dim - len(slices) + 1) + slices[slices.index(s) + 1:]
                break
        slices = tuple(slices)
        self.input_slice = slices
        if len(slices) < x_dim:
            slices += (slice(None, None, None), ) * (x_dim - len(slices))
        sizes = []
        # Work around a CANN restriction: the step of the last sliced
        # dimension must be 1, so append a dummy full-slice axis when needed.
        expand_dim = False
        if isinstance(slices[-1], slice):
            if slices[-1].step is not None and slices[-1].step != 1:
                slices = slices + (slice(None, None, None), )
                expand_dim = True

        elif isinstance(slices[-1], int):
            # The last component is an integer index; append a dummy axis too.
            slices = slices + (slice(None, None, None), )
            expand_dim = True
        else:
            assert False, "not supported"
        x_shape = list(x.shape)
        if expand_dim:
            x_shape.append(1)
            x = x.unsqueeze(-1)
            value = value.unsqueeze(-1)

        squeeze_dims = []
        if isinstance(value, jt.Var):
            for dim, s in enumerate(slices):
                if isinstance(s, int):
                    s = slice(s, s + 1, 1)
                    squeeze_dims.append(dim)

            for dim in squeeze_dims:
                value = value.unsqueeze(dim)

        # Pack each dimension's (start, stop, step) into extra_data so the
        # C++ side can rebuild the strided-slice attributes from `data`.
        extra_data = {}
        if len(slices):
            extra_data["a"] = len(slices)
            for dim, s in enumerate(slices):
                if isinstance(s, int):
                    s = slice(s, s + 1, 1)
                if isinstance(s, jt.Var):
                    assert False, "jt.Var not supported"
                start, stop, step = s.indices(x_shape[dim])
                size = (stop - start - 1) // step + 1
                sizes.append(size)
                extra_data[str(dim * 3)] = start
                extra_data[str(dim * 3 + 1)] = stop
                extra_data[str(dim * 3 + 2)] = step
        else:
            extra_data["a"] = -1
            sizes = [1]
            steps = [1]
        if isinstance(value, int) or isinstance(value, float):
            value = jt.full(sizes, value)
        self.type_ = 'slicev2'
        attr_code = """
        op.jt_name = "stridedsliceassignv2";
        StrideAttr *attr = new StrideAttr();
        int slice_dim = data["a"];

        if(slice_dim == -1) {
            attr->begins = {};
            attr->ends = {};
            attr->steps = {1};
            attr->axes = {};
        } else {
            vector<long int> begins;
            vector<long int> ends;
            vector<long int> steps;
            vector<long int> dims;
            for(int dim = 0; dim < slice_dim; dim++) {
                dims.push_back(dim);
                begins.push_back(data[std::to_string(dim*3)]);
                ends.push_back(data[std::to_string(dim*3+1)]);
                steps.push_back(data[std::to_string(dim*3+2)]);
            }
            attr->begins = begins;
            attr->ends = ends;
            attr->steps = steps;
            attr->axes = dims;
        }
        op.op_attr.reset(attr);
        """
        self.value_shape = value.shape
        inputs = [value]
        outputs = [x.clone()]
        result = setitem_forward("StridedSliceAssignV2",
                                 inputs=inputs,
                                 outputs=outputs,
                                 attr_code=attr_code,
                                 extra_data=extra_data)[0]
        if expand_dim:
            result = result.squeeze(-1)
        return result

    def grad(self, grad_output):
        # d(out)/d(value) is the gradient restricted to the assigned region;
        # d(out)/d(x) is the gradient with that region zeroed out.
        value_grad = None
        if self.value_var:
            value_grad = grad_output[self.input_slice]
            grad_output[self.input_slice] = jt.zeros(self.value_shape)
        return grad_output, None, value_grad
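
# A minimal usage sketch (illustration only): the three supported subscript
# forms map to the three runners above (strided slice assign, masked scatter,
# index put). Assumes an Ascend device with the ACL extension enabled.
def _setitem_acl_demo():
    x = jt.zeros((4, 5))
    x = SetItemACL()(x, (slice(0, 4, 2), slice(1, 3, 1)), 1.0)  # slicev2 path
    mask = x > 0.5
    n = int(mask.sum().item())
    x = SetItemACL()(x, mask, jt.ones((n, )))                   # mask path
    x = SetItemACL()(x, ([0, 1], ), jt.ones((2, 5)))            # index path
    return x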
@ -0,0 +1,84 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "setitem_op_acl.h"

namespace jittor
{
    InplaceMaskedScatterOpRunner::InplaceMaskedScatterOpRunner() : BaseOpRunner("InplaceMaskedScatter")
    {
    }

    void InplaceMaskedScatterOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        // out0 is x.clone(); in0 is the bool mask and in1 the packed values.
        ret = aclnnInplaceMaskedScatterGetWorkspaceSize(outputTensors[0], inputTensors[0], inputTensors[1], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnInplaceMaskedScatter(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnInplaceMaskedScatter failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

    IndexPutImplOpRunner::IndexPutImplOpRunner() : BaseOpRunner("IndexPutImpl")
    {
    }

    void IndexPutImplOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        // in0 is the value tensor; in1..inN are the index tensors.
        auto input_num = in_.size();
        std::vector<aclTensor *> indexTensorList = {};
        for (int i = 1; i < input_num; i++)
        {
            indexTensorList.push_back(inputTensors[i]);
        }
        auto indexTensorListInput = aclCreateTensorList(&indexTensorList[0], input_num - 1);
        ret = aclnnIndexPutImplGetWorkspaceSize(outputTensors[0], indexTensorListInput, inputTensors[0], false, true, &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnIndexPutImpl(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnIndexPutImpl failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();
        return;
    }

}
@ -0,0 +1,26 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class InplaceMaskedScatterOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        InplaceMaskedScatterOpRunner();
    };

    class IndexPutImplOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        IndexPutImplOpRunner();
    };
}
@ -0,0 +1,85 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def sigmoid_cmd(name: str,
                inputs: list,
                output_dtypes: list = None,
                output_shapes: list = None,
                attr_code: str = "",
                attr_header: str = "",
                outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class SigmoidACL(jt.Function):

    def __init__(self):
        super(SigmoidACL, self).__init__()

    def execute(self, x):
        x = x.float32()
        inputs = [x]
        outputs = [jt.empty(x.shape, x.dtype)]
        attr_code = f"""
        op.jt_name = "sigmoid";
        """
        result = sigmoid_cmd("Sigmoid",
                             inputs=inputs,
                             outputs=outputs,
                             attr_code=attr_code)[0]
        # Save the forward output: sigmoid'(x) = y * (1 - y) only needs y.
        self.output = result
        return result

    def grad(self, grad_output):
        attr_code = f"""
        op.jt_name = "sigmoidbackward";
        """
        inputs = [grad_output, self.output]
        outputs = [jt.empty(grad_output.shape, grad_output.dtype)]
        grad_input = sigmoid_cmd("SigmoidBackward",
                                 inputs=inputs,
                                 outputs=outputs,
                                 attr_code=attr_code)[0]
        return grad_input
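
# A minimal usage sketch (illustration only), assuming an Ascend device with
# the ACL extension enabled:
def _sigmoid_acl_demo():
    x = jt.randn(2, 3)
    y = SigmoidACL()(x)            # forward via aclnnSigmoid
    g = jt.grad(y.sum(), x)        # backward via aclnnSigmoidBackward
    return y, g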
@ -0,0 +1,80 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "sigmoid_op_acl.h"

namespace jittor
{
    SigmoidOpRunner::SigmoidOpRunner() : BaseOpRunner("Sigmoid")
    {
    }

    void SigmoidOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        ret = aclnnSigmoidGetWorkspaceSize(inputTensors[0], outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnSigmoid(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSigmoid failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

    SigmoidBackwardOpRunner::SigmoidBackwardOpRunner() : BaseOpRunner("SigmoidBackward")
    {
    }

    void SigmoidBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        ret = aclnnSigmoidBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnSigmoidBackward(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSigmoidBackward failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

}
@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class SigmoidOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        SigmoidOpRunner();
    };

    class SigmoidBackwardOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        SigmoidBackwardOpRunner();
    };

}
@ -0,0 +1,85 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def silu_cmd(name: str,
             inputs: list,
             output_dtypes: list = None,
             output_shapes: list = None,
             attr_code: str = "",
             attr_header: str = "",
             outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class SiLUACL(jt.Function):

    def __init__(self):
        super(SiLUACL, self).__init__()

    def execute(self, x):
        x = x.float32()
        inputs = [x]
        # Save the input: the SiLU backward op recomputes from x, not from y.
        self.input = x
        outputs = [jt.empty(x.shape, x.dtype)]
        attr_code = f"""
        op.jt_name = "silu";
        """
        result = silu_cmd("SiLU",
                          inputs=inputs,
                          outputs=outputs,
                          attr_code=attr_code)[0]
        return result

    def grad(self, grad_output):
        attr_code = f"""
        op.jt_name = "silubackward";
        """
        inputs = [grad_output, self.input]
        outputs = [jt.empty(grad_output.shape, grad_output.dtype)]
        grad_input = silu_cmd("SiLUBackward",
                              inputs=inputs,
                              outputs=outputs,
                              attr_code=attr_code)[0]
        return grad_input
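
# A minimal usage sketch (illustration only), assuming an Ascend device with
# the ACL extension enabled:
def _silu_acl_demo():
    x = jt.randn(2, 3)
    y = SiLUACL()(x)               # forward via aclnnSilu
    g = jt.grad(y.sum(), x)        # backward via aclnnSiluBackward
    return y, g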
@ -0,0 +1,80 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "silu_op_acl.h"

namespace jittor
{
    SiLUOpRunner::SiLUOpRunner() : BaseOpRunner("SiLU")
    {
    }

    void SiLUOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        ret = aclnnSiluGetWorkspaceSize(inputTensors[0], outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnSilu(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSilu failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

    SiLUBackwardOpRunner::SiLUBackwardOpRunner() : BaseOpRunner("SiLUBackward")
    {
    }

    void SiLUBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        ret = aclnnSiluBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnSiluBackward(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSiluBackward failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

}
@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class SiLUOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        SiLUOpRunner();
    };

    class SiLUBackwardOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        SiLUBackwardOpRunner();
    };

}
@ -0,0 +1,92 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def softmax_cmd(name: str,
                inputs: list,
                output_dtypes: list = None,
                output_shapes: list = None,
                attr_code: str = "",
                attr_header: str = "",
                outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class SoftmaxACL(jt.Function):

    def __init__(self):
        super(SoftmaxACL, self).__init__()

    def execute(self, x, dim):
        x = x.float32()
        inputs = [x]
        outputs = [jt.empty(x.shape)]
        self.dim = dim
        attr_code = f"""
        op.jt_name = "softmax";
        SoftmaxAttr *attr = new SoftmaxAttr();
        attr->dim = {dim};
        op.op_attr.reset(attr);
        """
        result = softmax_cmd("Softmax",
                             inputs=inputs,
                             outputs=outputs,
                             attr_code=attr_code)[0]
        # Save the forward output: the softmax backward op needs y, not x.
        self.output = result
        return result

    def grad(self, grad_output):
        attr_code = f"""
        op.jt_name = "softmax";
        SoftmaxAttr *attr = new SoftmaxAttr();
        attr->dim = {self.dim};
        op.op_attr.reset(attr);
        """
        inputs = [grad_output, self.output]
        outputs = [jt.empty(grad_output.shape)]
        grad_input = softmax_cmd("SoftmaxBackward",
                                 inputs=inputs,
                                 outputs=outputs,
                                 attr_code=attr_code)[0]
        return grad_input
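
# A minimal usage sketch (illustration only), assuming an Ascend device:
def _softmax_acl_demo():
    x = jt.randn(2, 5)
    y = SoftmaxACL()(x, 1)         # each row of y sums to 1 along axis 1
    g = jt.grad(y[:, 0].sum(), x)  # backward via aclnnSoftmaxBackward
    return y, g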
@ -0,0 +1,82 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "softmax_op_acl.h"

namespace jittor
{
    SoftmaxOpRunner::SoftmaxOpRunner() : BaseOpRunner("Softmax")
    {
    }

    void SoftmaxOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<SoftmaxAttr *>(op_attr.get());
        ret = aclnnSoftmaxGetWorkspaceSize(inputTensors[0], attr->dim, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnSoftmax(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSoftmax failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

    SoftmaxBackwardOpRunner::SoftmaxBackwardOpRunner() : BaseOpRunner("SoftmaxBackward")
    {
    }

    void SoftmaxBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<SoftmaxAttr *>(op_attr.get());
        ret = aclnnSoftmaxBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], attr->dim, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnSoftmaxBackward(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSoftmaxBackward failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

}
@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class SoftmaxOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        SoftmaxOpRunner();
    };

    class SoftmaxBackwardOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        SoftmaxBackwardOpRunner();
    };

}
@ -0,0 +1,115 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def stack_cmd(name: str,
              inputs: list,
              output_dtypes: list = None,
              output_shapes: list = None,
              attr_code: str = "",
              attr_header: str = "",
              outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class StackACL(jt.Function):

    def __init__(self):
        super(StackACL, self).__init__()

    def execute(self, input_tensors, dim):
        if type(input_tensors) is tuple:
            input_tensors = list(input_tensors)
        assert type(input_tensors) is list
        rank = len(input_tensors[0].shape)
        # The stacked result has rank + 1 axes, so dim lies in [-(rank+1), rank].
        assert -(rank + 1) <= dim <= rank
        if dim < 0:
            dim += rank + 1
        for i in range(len(input_tensors)):
            if input_tensors[i].dtype != input_tensors[0].dtype:
                raise ValueError("All input tensors must have the same dtype")
            if input_tensors[i].shape != input_tensors[0].shape:
                raise ValueError("All input tensors must have the same shape")
        self.input = input_tensors
        self.dim = dim  # saved for grad(): the gradient splits along this axis
        input_shape = list(input_tensors[0].shape)
        output_shape = input_shape[:dim] + [len(input_tensors)
                                            ] + input_shape[dim:]
        attr_code = f"""
        op.jt_name = "stack";
        ConcatAttr *attr = new ConcatAttr();
        attr->tensorNum = {len(input_tensors)};
        attr->dim = {dim};
        op.op_attr.reset(attr);
        """
        self.attr_code = attr_code
        result = stack_cmd("Stack",
                           input_tensors,
                           output_dtypes=[input_tensors[0].dtype],
                           output_shapes=[output_shape],
                           attr_code=self.attr_code)[0]
        return result

    def grad(self, grad_output):
        grad_inputs = self.split_grad(grad_output, self.input, self.dim)
        return grad_inputs

    def split_grad(self, grad_output, input_tensors, axis):
        # Stack inserted a new axis of length len(input_tensors) at `axis`,
        # so the gradient is split into chunks of size 1 there and squeezed
        # back to the original input shapes.
        offset = []
        shapeVec = []
        dtypeVec = []
        for tensor in input_tensors:
            offset.append(1)
            dtypeVec.append(tensor.dtype)
            shp = list(tensor.shape)
            shapeVec.append(shp[:axis] + [1] + shp[axis:])

        attr_code = f"""
        op.jt_name = "splitwithsize";
        auto *attr = new SplitWithSizeAttr();
        attr->splitSize = {{ {", ".join(map(str, offset))} }};
        attr->dim = {axis};
        op.op_attr.reset(attr);
        """

        result = stack_cmd("SplitWithSize", [grad_output],
                           output_dtypes=dtypeVec,
                           output_shapes=shapeVec,
                           attr_code=attr_code)
        return [r.squeeze(axis) for r in result]
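
# A minimal usage sketch (illustration only): stacking three equal-shape
# tensors along a new leading axis, assuming an Ascend device.
def _stack_acl_demo():
    ts = [jt.randn(2, 3) for _ in range(3)]
    y = StackACL()(ts, 0)          # y.shape == [3, 2, 3]
    return y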
@ -0,0 +1,65 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "stack_op_acl.h"

namespace jittor
{
    StackOpRunner::StackOpRunner() : BaseOpRunner("Stack")
    {
    }

    void StackOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto input_num = in_.size();
        std::vector<aclTensor *> stackTensorList = {};
        for (int i = 0; i < input_num; i++)
        {
            stackTensorList.push_back(inputTensors[i]);
        }
        auto stackTensorListInput = aclCreateTensorList(&stackTensorList[0], input_num);
        auto attr = dynamic_cast<ConcatAttr *>(op_attr.get());
        ret = aclnnStackGetWorkspaceSize(stackTensorListInput, attr->dim, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnStack(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnStack failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

}
@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class StackOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        StackOpRunner();
    };

}
@ -0,0 +1,54 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "ternary_op_acl.h"

namespace jittor
{
    TernaryOpRunner::TernaryOpRunner() : BaseOpRunner("ternary")
    {
        use_nchw = false;
    }

    void TernaryOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        // where(cond, a, b): in0 is the condition, in1/in2 the two branches.
        ret = aclnnSWhereGetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnSWhere(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSWhere failed. ERROR: %d\n", name.c_str(), ret); return);
        // syncRun();
        return;
    }
}
@ -0,0 +1,14 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    struct TernaryOpRunner : public BaseOpRunner
    {
        TernaryOpRunner();

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
    };
}
@ -0,0 +1,101 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable


def transpose_cmd(name: str,
                  inputs: list,
                  output_dtypes: list = None,
                  output_shapes: list = None,
                  attr_code: str = "",
                  attr_header: str = "",
                  outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
#include "acl/aclops/aclops.h"
'''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")


class TransPoseACL(jt.Function):

    def __init__(self):
        super(TransPoseACL, self).__init__()

    def execute(self, x, *dim):
        if len(dim) == 1 and isinstance(dim[0], Sequence):
            # a full permutation passed as a single sequence
            dim = dim[0]
        elif len(dim) == 2:
            # two axes passed separately: swap them
            axes = list(range(x.ndim))
            a, b = dim
            axes[a], axes[b] = axes[b], axes[a]
            dim = axes

        attr_code = f"""
        op.jt_name = "transpose";
        ReduceAttr *attr = new ReduceAttr();
        attr->axes = {{ {", ".join(map(str, dim))} }};
        op.op_attr.reset(attr);
        """
        # calculate output shape
        output_shape = [x.shape[i] for i in dim]
        output = transpose_cmd("Transpose", [x],
                               output_dtypes=[x.dtype],
                               output_shapes=[output_shape],
                               attr_code=attr_code)[0]
        self.dim = dim
        return output

    def grad(self, grad_output):
        # the backward of a permutation is its inverse permutation
        dim = list(range(grad_output.ndim))
        for i, p in enumerate(self.dim):
            dim[p] = i
        output_shape = [grad_output.shape[i] for i in dim]
        attr_code = f"""
        op.jt_name = "transpose";
        ReduceAttr *attr = new ReduceAttr();
        attr->axes = {{ {", ".join(map(str, dim))} }};
        op.op_attr.reset(attr);
        """
        output = transpose_cmd("Transpose", [grad_output],
                               output_dtypes=[grad_output.dtype],
                               output_shapes=[output_shape],
                               attr_code=attr_code)[0]
        return output
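A short usage sketch for TransPoseACL (assumes the ACL backend is active; shapes are illustrative):

import jittor as jt

x = jt.rand(2, 3, 4)
y = TransPoseACL()(x, 0, 2)        # swap axes 0 and 2  -> shape (4, 3, 2)
z = TransPoseACL()(x, (2, 0, 1))   # full permutation   -> shape (4, 2, 3)

Because grad() writes dim[p] = i for each i, p in enumerate(self.dim), the backward pass permutes with the inverse of the forward permutation, mapping gradients back to the input layout.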
@ -0,0 +1,66 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "transpose_op_acl.h"

namespace jittor
{
    TransposeOpRunner::TransposeOpRunner() : BaseOpRunner("Transpose")
    {
    }

    void TransposeOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        // the permutation axes travel in the ReduceAttr set up by the Python wrapper
        auto attr = dynamic_cast<ReduceAttr *>(op_attr.get());

        aclIntArray *dim = aclCreateIntArray(attr->axes.data(), attr->axes.size());

        ret = aclnnPermuteGetWorkspaceSize(inputTensors[0], dim, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnPermute(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnPermute failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        aclDestroyIntArray(dim);

        return;
    }

}
@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class TransposeOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        TransposeOpRunner();
    };

}
@ -0,0 +1,74 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np

from typing import Union
from collections.abc import Sequence, Iterable

def triu_cmd(name: str,
             inputs: list,
             output_dtypes: list = None,
             output_shapes: list = None,
             attr_code: str = "",
             attr_header: str = "",
             outputs: list = None):
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"

    cuda_header = '''
#include "acl/aclops/aclops.h"
'''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"

    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")

class TriuACL(jt.Function):

    def __init__(self):
        super(TriuACL, self).__init__()

    def execute(self, input, diagonal):
        self.diagonal = diagonal
        attr_code = f"""
        op.jt_name = "triu";
        TriuAttr *attr = new TriuAttr();
        attr->diagonal = {diagonal};
        op.op_attr.reset(attr);
        """

        result = triu_cmd("Triu", [input],
                          output_dtypes=[input.dtype],
                          output_shapes=[input.shape],
                          attr_code=attr_code)[0]
        return result

    def grad(self, grad_output):
        # only the kept (upper-triangular) elements receive gradient,
        # so mask grad_output with the same triu
        attr_code = f"""
        op.jt_name = "triu";
        TriuAttr *attr = new TriuAttr();
        attr->diagonal = {self.diagonal};
        op.op_attr.reset(attr);
        """
        return triu_cmd("Triu", [grad_output],
                        output_dtypes=[grad_output.dtype],
                        output_shapes=[grad_output.shape],
                        attr_code=attr_code)[0]
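A short usage sketch for TriuACL (assumes the ACL backend is active):

import jittor as jt

x = jt.rand(4, 4)
y = TriuACL()(x, 1)   # keep entries strictly above the main diagonal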
@ -0,0 +1,58 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "triu_op_acl.h"

namespace jittor
{
    TriuOpRunner::TriuOpRunner() : BaseOpRunner("Triu")
    {
    }

    void TriuOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        auto attr = dynamic_cast<TriuAttr *>(op_attr.get());
        // aclnnTriu takes the diagonal offset as an int64_t
        ret = aclnnTriuGetWorkspaceSize(inputTensors[0], attr->diagonal, outputTensors[0], &workspaceSize, &executor);
        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = aclnnTriu(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnTriu failed. ERROR: %d\n", name.c_str(), ret); return);

        syncRun();

        return;
    }

}
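The diagonal offset forwarded to aclnnTriu follows the usual triu convention; a quick numpy illustration of what the runner computes for diagonal=1:

import numpy as np

x = np.arange(9).reshape(3, 3)
print(np.triu(x, k=1))
# [[0 1 2]
#  [0 0 5]
#  [0 0 0]]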
@ -0,0 +1,16 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    class TriuOpRunner : public BaseOpRunner
    {

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        TriuOpRunner();
    };

}
@ -0,0 +1,59 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"

#include "unary_op_acl.h"

namespace jittor
{
    UnaryOpRunner::UnaryOpRunner() : BaseOpRunner("unary")
    {
        use_nchw = false;
        is_group_op = true;
    }

    void UnaryOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
    {
        // Cast additionally needs the target dtype; every other unary op is a
        // plain one-input one-output call dispatched through the function table.
        if (name == "Cast")
            ret = it->second.getWorkspaceSizeFuncCast(inputTensors[0], get_dtype(out_[0]->dtype()), outputTensors[0], &workspaceSize, &executor);
        else
            ret = it->second.getWorkspaceSizeFuncUnaryNonzero(inputTensors[0], outputTensors[0], &workspaceSize, &executor);

        checkRet(ret);

        if (workspaceSize > 0)
        {
            mallocWorkSpace(workspaceSize);
        }

        ret = it->second.executeFunc(workspaceAddr, workspaceSize, executor, aclstream);
        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnn unary op failed. ERROR: %d\n", name.c_str(), ret); return);
        // syncRun();
        return;
    }
}
@ -0,0 +1,14 @@
#pragma once
#include "utils.h"
#include "base_op.h"

namespace jittor
{
    struct UnaryOpRunner : public BaseOpRunner
    {
        UnaryOpRunner();

    protected:
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
    };
}