Debug nan.

Exusial 2024-12-12 14:49:18 +08:00
parent da6acc6cc3
commit c9c02508d4
8 changed files with 32 additions and 14 deletions

View File

@@ -9,7 +9,7 @@
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
__version__ = '1.3.2412.29'
__version__ = '1.3.2415.29'
from jittor_utils import lock
with lock.lock_scope():
ori_int = int

View File

@@ -328,6 +328,7 @@ namespace jittor
op.add(bop->z, false);
op.run();
bop->x->shape = xshape_bk;
// aclrtSynchronizeStream(aclstream);
}
else if (op->name() == string("fuse_transpose"))
{
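The commented-out line added above is the debugging hook this commit revolves around: un-commenting it forces a stream synchronization after each executed op, so a NaN produced asynchronously surfaces at the op that wrote it instead of at some later read. A minimal sketch of that pattern, assuming the CANN runtime header and a valid aclrtStream (the helper name sync_and_check is made up here):

    // Sketch only: force a synchronization point after each op while hunting a NaN.
    // Assumes <acl/acl.h> is available and `stream` is a valid aclrtStream;
    // aclError and ACL_SUCCESS come from the same header.
    #include <acl/acl.h>
    #include <cstdio>

    static void sync_and_check(aclrtStream stream, const char* where) {
        aclError ret = aclrtSynchronizeStream(stream);  // block until all queued work has finished
        if (ret != ACL_SUCCESS)
            fprintf(stderr, "stream sync failed after %s: %d\n", where, (int)ret);
    }

Calling something like sync_and_check(aclstream, op->name().c_str()) right after op.run() serializes execution, which is slow but localizes the first op that produces a bad value.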

View File

@@ -626,14 +626,14 @@ namespace jittor
// 6. Releasing the aclTensor and aclScalar objects needs to be adjusted according to the interface definition of the specific API
// destroy tensor
for (int idx = 0; idx < input_num; idx++)
{
aclDestroyTensor(inputTensors[idx]);
}
for (int idx = 0; idx < output_num; idx++)
{
aclDestroyTensor(outputTensors[idx]);
}
// for (int idx = 0; idx < input_num; idx++)
// {
// aclDestroyTensor(inputTensors[idx]);
// }
// for (int idx = 0; idx < output_num; idx++)
// {
// aclDestroyTensor(outputTensors[idx]);
// }
// destroy scalar
// aclDestroyScalar(start);
// aclDestroyScalar(end);
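Commenting out the aclDestroyTensor loops keeps the input/output tensor handles alive past the aclnn call, which takes the destroy path out of the picture while chasing the NaN, at the cost of leaking handles. If the destroys are restored later, a scope guard keeps creation and destruction paired automatically; a sketch, assuming only aclDestroyTensor and the aclnn header that declares aclTensor (the header path is an assumption):

    // Sketch: RAII guard so every created aclTensor is destroyed exactly once,
    // even on early return. aclDestroyTensor only releases the handle; the
    // underlying device storage is owned elsewhere.
    #include <vector>
    #include "aclnn/acl_meta.h"  // assumed location of aclTensor / aclDestroyTensor

    struct TensorGuard {
        std::vector<aclTensor*> tensors;
        void track(aclTensor* t) { tensors.push_back(t); }
        ~TensorGuard() {
            for (auto* t : tensors)
                aclDestroyTensor(t);
        }
    };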

View File

@@ -312,6 +312,7 @@ class SGD(Optimizer):
# optimize main body
for p, g, v in zip(pg["params"], pg["grads"], pg["values"]):
# print(p.shape, g.shape, v.shape)
if p.is_stop_grad(): continue
dp = p * weight_decay + g
v.update(momentum * v + dp * (1 - dampening))
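The added is_stop_grad() check simply skips frozen parameters so they neither receive weight decay nor accumulate momentum. For reference, the update the loop performs is dp = weight_decay * p + g followed by v <- momentum * v + (1 - dampening) * dp, after which the parameter is stepped with v as usual. A scalar sketch of that recurrence (in C++ for consistency with the other snippets; the final parameter step is the standard SGD form and is assumed, not shown in the hunk):

    // Scalar sketch of the per-element momentum update done in the Python loop above.
    #include <cstdio>

    int main() {
        double p = 0.5, g = 0.1, v = 0.0;
        double lr = 0.01, momentum = 0.9, weight_decay = 1e-4, dampening = 0.0;
        for (int step = 0; step < 3; step++) {
            double dp = p * weight_decay + g;          // L2 decay folded into the gradient
            v = momentum * v + dp * (1 - dampening);   // matches v.update(...) in the hunk
            p -= lr * v;                               // assumed standard parameter step
            printf("step %d: v=%.6f p=%.6f\n", step, v, p);
        }
        return 0;
    }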

View File

@@ -27,6 +27,7 @@ namespace array_local {
cudaStream_t stream;
cudaEvent_t event;
struct Init {
Init() {
if (!get_device_count()) return;
@@ -103,8 +104,11 @@ void ArrayOp::run() {
auto host_ptr = cuda_dual_allocator.get_dual_allocation(allocation.allocation).host_ptr;
checkCudaErrors(cudaMemcpyAsync(
allocation.ptr, host_ptr, allocation.size, cudaMemcpyHostToDevice, stream));
checkCudaErrors(cudaEventRecord(event, stream));
checkCudaErrors(cudaStreamWaitEvent(0, event, 0));
// checkCudaErrors(aclrtMemcpyAsync(
// allocation.ptr, allocation.size, host_ptr, allocation.size, cudaMemcpyHostToDevice, aclstream));
// checkCudaErrors(cudaEventRecord(event, stream));
// checkCudaErrors(cudaStreamWaitEvent(0, event, 0));
// checkCudaErrors(aclrtSynchronizeStream(aclstream));
// delay free this allocation
allocation.allocator = &delay_free;
}
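This hunk keeps the original CUDA path: the host-to-device upload runs on the dedicated array stream, and the recorded event makes the default stream (0) wait for it, so later kernels never read the array before the copy has landed; the ACL equivalents stay commented out. A minimal sketch of that event-based ordering, assuming a CUDA toolkit (error checks omitted, the helper name is made up):

    // Sketch: order the default stream after an async H2D copy issued on a side stream.
    #include <cuda_runtime.h>

    void upload_then_use(void* dst, const void* src, size_t size, cudaStream_t side) {
        cudaEvent_t done;
        cudaEventCreateWithFlags(&done, cudaEventDisableTiming);
        cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, side); // copy on the side stream
        cudaEventRecord(done, side);                                   // mark completion of that copy
        cudaStreamWaitEvent(0, done, 0);      // default stream waits on the event; host is not blocked
        cudaEventDestroy(done);               // safe: resources released once the event completes
    }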

View File

@@ -17,6 +17,8 @@
namespace jittor {
EXTERN_LIB aclrtStream aclstream;
CopyOp::CopyOp(Var* x) {
flags.set(NodeFlags::_cpu);
flags.set(NodeFlags::_cuda);
@@ -42,6 +44,8 @@ void CopyOp::run() {
#ifdef HAS_CUDA
if (flags.get(NodeFlags::_cuda)) {
checkCudaErrors(cudaMemcpyAsync(y_ptr, x_ptr, size, cudaMemcpyDeviceToDevice, 0));
// checkCudaErrors(aclrtMemcpyAsync(y_ptr, size, x_ptr, size, cudaMemcpyDeviceToDevice, aclstream));
// checkCudaErrors(aclrtSynchronizeStream(aclstream));
} else
#endif
{
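Here the device-to-device copy stays on the plain CUDA call and the ACL variant is left commented. The two runtimes differ mainly in argument order: aclrtMemcpyAsync takes an explicit destination-capacity argument before the source. A side-by-side sketch (the acl_stream name and the ACL memcpy-kind enum are assumptions; the hunk itself passes the CUDA enum, presumably through the project's compatibility layer):

    // Sketch: the same device-to-device copy in both runtimes.
    //   cudaMemcpyAsync(dst,           src, count, kind, stream)
    //   aclrtMemcpyAsync(dst, destMax, src, count, kind, stream)   <- extra capacity argument
    #include <cuda_runtime.h>

    void d2d_copy(void* y_ptr, const void* x_ptr, size_t size, cudaStream_t stream) {
        // Active path in the hunk: asynchronous CUDA copy on the given stream.
        cudaMemcpyAsync(y_ptr, x_ptr, size, cudaMemcpyDeviceToDevice, stream);
        // ACL form mirrored from the commented lines (requires an aclrtStream):
        // aclrtMemcpyAsync(y_ptr, size, x_ptr, size, ACL_MEMCPY_DEVICE_TO_DEVICE, acl_stream);
        // aclrtSynchronizeStream(acl_stream);
    }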

View File

@@ -123,8 +123,11 @@ void FetchOp::run() {
new (&allocation) Allocation(&cuda_dual_allocator, v->size);
// mostly device to device
#if IS_CUDA
// checkCudaErrors(cudaMemcpyAsync(
// allocation.ptr, v->mem_ptr, v->size, cudaMemcpyDefault, stream));
checkCudaErrors(cudaMemcpyAsync(
allocation.ptr, v->mem_ptr, v->size, cudaMemcpyDefault, stream));
checkCudaErrors(aclrtMemcpyAsync(
allocation.ptr, v->size, v->mem_ptr, v->size, cudaMemcpyDefault, aclstream));
checkCudaErrors(aclrtSynchronizeStream(aclstream));
#else
checkCudaErrors(cudaMemcpyAsync(
allocation.ptr, v->mem_ptr, v->size, cudaMemcpyDeviceToDevice, stream));
@@ -132,8 +135,11 @@ void FetchOp::run() {
auto host_ptr = cuda_dual_allocator.get_dual_allocation(
allocation.allocation).host_ptr;
// device to host
checkCudaErrors(cudaMemcpyAsync(
host_ptr, allocation.ptr, v->size, cudaMemcpyDeviceToHost, stream));
// checkCudaErrors(cudaMemcpyAsync(
// host_ptr, allocation.ptr, v->size, cudaMemcpyDeviceToHost, stream));
checkCudaErrors(aclrtMemcpyAsync(
host_ptr, v->size, allocation.ptr, v->size, cudaMemcpyDeviceToHost, aclstream));
checkCudaErrors(aclrtSynchronizeStream(aclstream));
allocation.ptr = host_ptr;
has_cuda_memcpy = true;
} else
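In FetchOp the device-to-host copy is switched to aclrtMemcpyAsync followed by aclrtSynchronizeStream. The synchronize is what makes the result safe to use: the fetch callback reads host_ptr on the CPU immediately afterwards, so the copy must have landed by then. A minimal sketch of that D2H pattern, assuming the CANN runtime and pinned host memory (as provided here by the dual allocator); the helper name is made up:

    // Sketch: async device-to-host copy that the host reads right afterwards.
    #include <acl/acl.h>

    bool fetch_to_host(void* host_ptr, const void* dev_ptr, size_t size, aclrtStream stream) {
        if (aclrtMemcpyAsync(host_ptr, size, dev_ptr, size,
                             ACL_MEMCPY_DEVICE_TO_HOST, stream) != ACL_SUCCESS)
            return false;
        // Without this sync the CPU could read host_ptr before the copy finishes.
        return aclrtSynchronizeStream(stream) == ACL_SUCCESS;
    }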

View File

@@ -330,6 +330,8 @@ void SetitemOp::jit_run() {
#else
if (op != ip)
checkCudaErrors(cudaMemcpyAsync(op, ip, out->size, cudaMemcpyDeviceToDevice, 0));
// checkCudaErrors(aclrtMemcpyAsync(op, out->size, ip, out->size, cudaMemcpyDeviceToDevice, aclstream));
// checkCudaErrors(aclrtSynchronizeStream(aclstream));
#endif
if (ns.get(GetitemOp::_inplace) &&