enable cuda and acl

2025-07-19 11:05:30 +08:00 · 2025-07-19 11:05:30 +08:00 · c78db2a794
parent f8e44de79d
commit c78db2a794
7 changed files with 27 additions and 24 deletions
--- a/python/jittor/compile_extern.py
+++ b/python/jittor/compile_extern.py
@@ -457,8 +457,7 @@ def setup_cutt():
 def install_cutlass(root_folder):
    # Modified from: https://github.com/ap-hynninen/cutlass
    # url = "https://cloud.tsinghua.edu.cn/f/171e49e5825549548bc4/?dl=1"
-    # url = "https://cg.cs.tsinghua.edu.cn/jittor/assets/cutlass.zip"
-    url = "https://cloud.tsinghua.edu.cn/f/171e49e5825549548bc4/?dl=1"
+    url = "https://cg.cs.tsinghua.edu.cn/jittor/assets/cutlass.zip"

    filename = "cutlass.zip"
    fullname = os.path.join(root_folder, filename)
--- a/python/jittor/compiler.py
+++ b/python/jittor/compiler.py
@@ -1186,20 +1186,22 @@ make_cache_dir(os.path.join(cache_path, "tmp"))
 ck_path = os.path.join(cache_path, "checkpoints")
 make_cache_dir(ck_path)

-
-ascend_toolkit_home = os.getenv('ASCEND_TOOLKIT_HOME')
-
 # build cache_compile
 cc_flags += f" -I\"{os.path.join(jittor_path, 'src')}\" "
 cc_flags += f" -I\"{os.path.join(jittor_path, 'extern')}\" "
-cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include')}\" "
-cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include/acl')}\" "
-cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include/aclnn')}\" "
-cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include/aclnnop')}\" "
-cc_flags += f" -L\"{os.path.join(ascend_toolkit_home, 'lib64')}\" "
-cc_flags += " -llibascendcl "
-cc_flags += " -llibnnopbase "
-cc_flags += " -llibopapi "
+
+ascend_toolkit_home = os.getenv('ASCEND_TOOLKIT_HOME')
+
+if ascend_toolkit_home:
+    cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include')}\" "
+    cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include/acl')}\" "
+    cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include/aclnn')}\" "
+    cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include/aclnnop')}\" "
+    cc_flags += f" -L\"{os.path.join(ascend_toolkit_home, 'lib64')}\" "
+    cc_flags += " -llibascendcl "
+    cc_flags += " -llibnnopbase "
+    cc_flags += " -llibopapi "
+
 cc_flags += py_include

 check_cache_compile()
--- a/python/jittor/extern/acl/acl_jittor.h
+++ b/python/jittor/extern/acl/acl_jittor.h
@@ -6,6 +6,7 @@
 // ***************************************************************
 #pragma once
 #include "common.h"
+#include "aclnn/aclnn.h"
 #include <acl/acl.h>

 std::string acl_error_to_string(aclError error);
--- a/python/jittor/extern/acl/aclops/utils.cc
+++ b/python/jittor/extern/acl/aclops/utils.cc
@@ -5,6 +5,7 @@
 #include <Python.h>
 #include <pystate.h>
 #include "utils.h"
+#include "aclnn/aclnn.h"

 namespace jittor
 {
--- a/python/jittor/extern/acl/aclops/utils.h
+++ b/python/jittor/extern/acl/aclops/utils.h
@@ -6,6 +6,7 @@
 #include <Python.h>
 #include <pystate.h>
 #include "misc/nano_string.h"
+#include "aclnn/aclnn.h"

 namespace jittor
 {
--- a/python/jittor/src/common.h
+++ b/python/jittor/src/common.h
@@ -8,7 +8,6 @@
 #include <memory>
 #include <functional>
 #include "utils/log.h"
-#include "../extern/acl/aclnn/aclnn.h"

 #define JIT_TEST(name) extern void jit_test_ ## name ()
 void expect_error(std::function<void()> func);
--- a/python/jittor/src/ops/fetch_op.cc
+++ b/python/jittor/src/ops/fetch_op.cc
@@ -47,7 +47,7 @@ Init() {
    if (!get_device_count()) return;
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    checkCudaErrors(cudaEventCreate(&event, cudaEventDisableTiming));
-    stream = aclstream;
+    // stream = aclstream;
 }
 ~Init() {
    if (!get_device_count()) return;
@@ -123,11 +123,11 @@ void FetchOp::run() {
            new (&allocation) Allocation(&cuda_dual_allocator, v->size);
            // mostly device to device
            #if IS_CUDA
-            // checkCudaErrors(cudaMemcpyAsync(
-            //     allocation.ptr, v->mem_ptr, v->size, cudaMemcpyDefault, stream));
            checkCudaErrors(cudaMemcpyAsync(
-                allocation.ptr, v->size, v->mem_ptr, v->size, cudaMemcpyDefault, aclstream));
-            checkCudaErrors(aclrtSynchronizeStream(aclstream));
+                allocation.ptr, v->mem_ptr, v->size, cudaMemcpyDefault, stream));
+            // checkCudaErrors(cudaMemcpyAsync(
+            //     allocation.ptr, v->size, v->mem_ptr, v->size, cudaMemcpyDefault, aclstream));
+            // checkCudaErrors(aclrtSynchronizeStream(aclstream));
            #else
            checkCudaErrors(cudaMemcpyAsync(
                allocation.ptr, v->mem_ptr, v->size, cudaMemcpyDeviceToDevice, stream));
@@ -135,11 +135,11 @@ void FetchOp::run() {
            auto host_ptr = cuda_dual_allocator.get_dual_allocation(
                allocation.allocation).host_ptr;
            // device to host
-            // checkCudaErrors(cudaMemcpyAsync(
-            //     host_ptr, allocation.ptr, v->size, cudaMemcpyDeviceToHost, stream));
-            checkCudaErrors(aclrtMemcpyAsync(
-                host_ptr, v->size, allocation.ptr, v->size, cudaMemcpyDeviceToHost, aclstream));
-            checkCudaErrors(aclrtSynchronizeStream(aclstream));
+            checkCudaErrors(cudaMemcpyAsync(
+                host_ptr, allocation.ptr, v->size, cudaMemcpyDeviceToHost, stream));
+            // checkCudaErrors(aclrtMemcpyAsync(
+            //     host_ptr, v->size, allocation.ptr, v->size, cudaMemcpyDeviceToHost, aclstream));
+            // checkCudaErrors(aclrtSynchronizeStream(aclstream));
            allocation.ptr = host_ptr;
            has_cuda_memcpy = true;
        } else