Merge pull request #646 from Jittor/fixHW

adjust aclnn.h reference
2025-06-17 13:34:28 +08:00 · 2025-06-17 13:34:28 +08:00 · daf04e9fb5
parent 330dec69d2 3cf5d7f2a4
commit daf04e9fb5
9 changed files with 32 additions and 24 deletions
--- a/python/jittor/compiler.py
+++ b/python/jittor/compiler.py
@@ -1191,6 +1191,8 @@ ascend_toolkit_home = os.getenv('ASCEND_TOOLKIT_HOME')

 # build cache_compile
 cc_flags += f" -I\"{os.path.join(jittor_path, 'src')}\" "
+
+if ascend_toolkit_home:
    cc_flags += f" -I\"{os.path.join(jittor_path, 'extern')}\" "
    cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include')}\" "
    cc_flags += f" -I\"{os.path.join(ascend_toolkit_home, 'include/acl')}\" "
--- a/python/jittor/extern/acl/acl_jittor.h
+++ b/python/jittor/extern/acl/acl_jittor.h
@@ -7,6 +7,7 @@
 #pragma once
 #include "common.h"
 #include <acl/acl.h>
+#include "aclnn.h"

 std::string acl_error_to_string(aclError error);

--- a/python/jittor/extern/acl/aclops/utils.cc
+++ b/python/jittor/extern/acl/aclops/utils.cc
@@ -5,6 +5,7 @@
 #include <Python.h>
 #include <pystate.h>
 #include "utils.h"
+#include "aclnn.h"

 namespace jittor
 {
--- a/python/jittor/extern/acl/aclops/utils.h
+++ b/python/jittor/extern/acl/aclops/utils.h
@@ -6,6 +6,7 @@
 #include <Python.h>
 #include <pystate.h>
 #include "misc/nano_string.h"
+#include "aclnn.h"

 namespace jittor
 {
--- a/python/jittor/extern/mpi/ops/mpi_reduce_op.cc
+++ b/python/jittor/extern/mpi/ops/mpi_reduce_op.cc
@@ -49,10 +49,13 @@ MpiReduceOp::MpiReduceOp(Var* x, NanoString op, int root) : x(x), op(op), root(r
            forward(var);
            return;
        } else if (hccl_reduce) {
+            auto var = hccl_reduce(x, "sum", root);
+            //exe.run_sync({var}, true);
            forward(var);
            return;
        }
    }
+    #endif
    y = create_output(nullptr, x->dtype());
 }

--- a/python/jittor/src/common.h
+++ b/python/jittor/src/common.h
@@ -8,7 +8,7 @@
 #include <memory>
 #include <functional>
 #include "utils/log.h"
-#include "../extern/acl/aclnn/aclnn.h"
+// #include "../extern/acl/aclnn/aclnn.h"

 #define JIT_TEST(name) extern void jit_test_ ## name ()
 void expect_error(std::function<void()> func);
--- a/python/jittor/src/ops/array_op.cc
+++ b/python/jittor/src/ops/array_op.cc
@@ -31,9 +31,9 @@ cudaEvent_t event;
 struct Init {
 Init() {
    if (!get_device_count()) return;
-  //checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
-  //checkCudaErrors(cudaEventCreate(&event, cudaEventDisableTiming));
-    stream = aclstream;
+  checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+  checkCudaErrors(cudaEventCreate(&event, cudaEventDisableTiming));
+    // stream = aclstream;
 }
 ~Init() {
    if (!get_device_count()) return;
--- a/python/jittor/src/ops/copy_op.cc
+++ b/python/jittor/src/ops/copy_op.cc
@@ -17,7 +17,7 @@

 namespace jittor {

-EXTERN_LIB aclrtStream aclstream;
+// EXTERN_LIB aclrtStream aclstream;

 CopyOp::CopyOp(Var* x) {
    flags.set(NodeFlags::_cpu);
--- a/python/jittor/src/ops/fetch_op.cc
+++ b/python/jittor/src/ops/fetch_op.cc
@@ -47,7 +47,7 @@ Init() {
    if (!get_device_count()) return;
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    checkCudaErrors(cudaEventCreate(&event, cudaEventDisableTiming));
-    stream = aclstream;
+    // stream = aclstream;
 }
 ~Init() {
    if (!get_device_count()) return;
@@ -123,11 +123,11 @@ void FetchOp::run() {
            new (&allocation) Allocation(&cuda_dual_allocator, v->size);
            // mostly device to device
            #if IS_CUDA
-            // checkCudaErrors(cudaMemcpyAsync(
-            //     allocation.ptr, v->mem_ptr, v->size, cudaMemcpyDefault, stream));
            checkCudaErrors(cudaMemcpyAsync(
-                allocation.ptr, v->size, v->mem_ptr, v->size, cudaMemcpyDefault, aclstream));
-            checkCudaErrors(aclrtSynchronizeStream(aclstream));
+                allocation.ptr, v->mem_ptr, v->size, cudaMemcpyDefault, stream));
+            // checkCudaErrors(cudaMemcpyAsync(
+            //     allocation.ptr, v->size, v->mem_ptr, v->size, cudaMemcpyDefault, aclstream));
+            // checkCudaErrors(aclrtSynchronizeStream(aclstream));
            #else
            checkCudaErrors(cudaMemcpyAsync(
                allocation.ptr, v->mem_ptr, v->size, cudaMemcpyDeviceToDevice, stream));
@@ -135,11 +135,11 @@ void FetchOp::run() {
            auto host_ptr = cuda_dual_allocator.get_dual_allocation(
                allocation.allocation).host_ptr;
            // device to host
-            // checkCudaErrors(cudaMemcpyAsync(
-            //     host_ptr, allocation.ptr, v->size, cudaMemcpyDeviceToHost, stream));
-            checkCudaErrors(aclrtMemcpyAsync(
-                host_ptr, v->size, allocation.ptr, v->size, cudaMemcpyDeviceToHost, aclstream));
-            checkCudaErrors(aclrtSynchronizeStream(aclstream));
+            checkCudaErrors(cudaMemcpyAsync(
+                host_ptr, allocation.ptr, v->size, cudaMemcpyDeviceToHost, stream));
+            // checkCudaErrors(aclrtMemcpyAsync(
+            //     host_ptr, v->size, allocation.ptr, v->size, cudaMemcpyDeviceToHost, aclstream));
+            // checkCudaErrors(aclrtSynchronizeStream(aclstream));
            allocation.ptr = host_ptr;
            has_cuda_memcpy = true;
        } else