Debug nan.

Exusial 2024-12-12 14:49:18 +08:00
parent da6acc6cc3
commit c9c02508d4
8 changed files with 32 additions and 14 deletions

View File

@@ -9,7 +9,7 @@
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
__version__ = '1.3.2412.29'
__version__ = '1.3.2415.29'
from jittor_utils import lock
with lock.lock_scope():
ori_int = int

View File

@@ -328,6 +328,7 @@ namespace jittor
op.add(bop->z, false);
op.run();
bop->x->shape = xshape_bk;
// aclrtSynchronizeStream(aclstream);
}
else if (op->name() == string("fuse_transpose"))
{
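The commented-out line added above is the debugging hook this commit revolves around: un-commenting it forces a stream synchronization after each executed op, so a NaN produced asynchronously surfaces at the op that wrote it instead of at some later read. A minimal sketch of that pattern, assuming the CANN runtime header and a valid aclrtStream (the helper name sync_and_check is made up here):

    // Sketch only: force a synchronization point after each op while hunting a NaN.
    // Assumes <acl/acl.h> is available and `stream` is a valid aclrtStream;
    // aclError and ACL_SUCCESS come from the same header.
    #include <acl/acl.h>
    #include <cstdio>

    static void sync_and_check(aclrtStream stream, const char* where) {
        aclError ret = aclrtSynchronizeStream(stream);  // block until all queued work has finished
        if (ret != ACL_SUCCESS)
            fprintf(stderr, "stream sync failed after %s: %d\n", where, (int)ret);
    }

Calling something like sync_and_check(aclstream, op->name().c_str()) right after op.run() serializes execution, which is slow but localizes the first op that produces a bad value.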

View File

@@ -626,14 +626,14 @@ namespace jittor
// 6. Releasing the aclTensor and aclScalar objects needs to be adjusted according to the interface definition of the specific API
// destroy tensor
for (int idx = 0; idx < input_num; idx++)
{
aclDestroyTensor(inputTensors[idx]);
}
for (int idx = 0; idx < output_num; idx++)
{
aclDestroyTensor(outputTensors[idx]);
}
// for (int idx = 0; idx < input_num; idx++)
// {
// aclDestroyTensor(inputTensors[idx]);
// }
// for (int idx = 0; idx < output_num; idx++)
// {
// aclDestroyTensor(outputTensors[idx]);
// }
// destroy scalar
// aclDestroyScalar(start);
// aclDestroyScalar(end);
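Commenting out the aclDestroyTensor loops keeps the input/output tensor handles alive past the aclnn call, which takes the destroy path out of the picture while chasing the NaN, at the cost of leaking handles. If the destroys are restored later, a scope guard keeps creation and destruction paired automatically; a sketch, assuming only aclDestroyTensor and the aclnn header that declares aclTensor (the header path is an assumption):

    // Sketch: RAII guard so every created aclTensor is destroyed exactly once,
    // even on early return. aclDestroyTensor only releases the handle; the
    // underlying device storage is owned elsewhere.
    #include <vector>
    #include "aclnn/acl_meta.h"  // assumed location of aclTensor / aclDestroyTensor

    struct TensorGuard {
        std::vector<aclTensor*> tensors;
        void track(aclTensor* t) { tensors.push_back(t); }
        ~TensorGuard() {
            for (auto* t : tensors)
                aclDestroyTensor(t);
        }
    };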

View File

@@ -312,6 +312,7 @@ class SGD(Optimizer):
# optimize main body
for p, g, v in zip(pg["params"], pg["grads"], pg["values"]):
# print(p.shape, g.shape, v.shape)
if p.is_stop_grad(): continue
dp = p * weight_decay + g
v.update(momentum * v + dp * (1 - dampening))
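The added is_stop_grad() check simply skips frozen parameters so they neither receive weight decay nor accumulate momentum. For reference, the update the loop performs is dp = weight_decay * p + g followed by v <- momentum * v + (1 - dampening) * dp, after which the parameter is stepped with v as usual. A scalar sketch of that recurrence (in C++ for consistency with the other snippets; the final parameter step is the standard SGD form and is assumed, not shown in the hunk):

    // Scalar sketch of the per-element momentum update done in the Python loop above.
    #include <cstdio>

    int main() {
        double p = 0.5, g = 0.1, v = 0.0;
        double lr = 0.01, momentum = 0.9, weight_decay = 1e-4, dampening = 0.0;
        for (int step = 0; step < 3; step++) {
            double dp = p * weight_decay + g;          // L2 decay folded into the gradient
            v = momentum * v + dp * (1 - dampening);   // matches v.update(...) in the hunk
            p -= lr * v;                               // assumed standard parameter step
            printf("step %d: v=%.6f p=%.6f\n", step, v, p);
        }
        return 0;
    }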

View File

@@ -27,6 +27,7 @@ namespace array_local {
cudaStream_t stream;
cudaEvent_t event;
struct Init {
Init() {
if (!get_device_count()) return;
@@ -103,8 +104,11 @@ void ArrayOp::run() {
auto host_ptr = cuda_dual_allocator.get_dual_allocation(allocation.allocation).host_ptr;
checkCudaErrors(cudaMemcpyAsync(
allocation.ptr, host_ptr, allocation.size, cudaMemcpyHostToDevice, stream));
checkCudaErrors(cudaEventRecord(event, stream));
checkCudaErrors(cudaStreamWaitEvent(0, event, 0));
// checkCudaErrors(aclrtMemcpyAsync(
// allocation.ptr, allocation.size, host_ptr, allocation.size, cudaMemcpyHostToDevice, aclstream));
// checkCudaErrors(cudaEventRecord(event, stream));
// checkCudaErrors(cudaStreamWaitEvent(0, event, 0));
// checkCudaErrors(aclrtSynchronizeStream(aclstream));
// delay free this allocation
allocation.allocator = &delay_free;
}
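This hunk keeps the original CUDA path: the host-to-device upload runs on the dedicated array stream, and the recorded event makes the default stream (0) wait for it, so later kernels never read the array before the copy has landed; the ACL equivalents stay commented out. A minimal sketch of that event-based ordering, assuming a CUDA toolkit (error checks omitted, the helper name is made up):

    // Sketch: order the default stream after an async H2D copy issued on a side stream.
    #include <cuda_runtime.h>

    void upload_then_use(void* dst, const void* src, size_t size, cudaStream_t side) {
        cudaEvent_t done;
        cudaEventCreateWithFlags(&done, cudaEventDisableTiming);
        cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, side); // copy on the side stream
        cudaEventRecord(done, side);                                   // mark completion of that copy
        cudaStreamWaitEvent(0, done, 0);      // default stream waits on the event; host is not blocked
        cudaEventDestroy(done);               // safe: resources released once the event completes
    }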

View File

@@ -17,6 +17,8 @@
namespace jittor {
EXTERN_LIB aclrtStream aclstream;
CopyOp::CopyOp(Var* x) {
flags.set(NodeFlags::_cpu);
flags.set(NodeFlags::_cuda);
@@ -42,6 +44,8 @@ void CopyOp::run() {
#ifdef HAS_CUDA
if (flags.get(NodeFlags::_cuda)) {
checkCudaErrors(cudaMemcpyAsync(y_ptr, x_ptr, size, cudaMemcpyDeviceToDevice, 0));
// checkCudaErrors(aclrtMemcpyAsync(y_ptr, size, x_ptr, size, cudaMemcpyDeviceToDevice, aclstream));
// checkCudaErrors(aclrtSynchronizeStream(aclstream));
} else
#endif
{
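Here the device-to-device copy stays on the plain CUDA call and the ACL variant is left commented. The two runtimes differ mainly in argument order: aclrtMemcpyAsync takes an explicit destination-capacity argument before the source. A side-by-side sketch (the acl_stream name and the ACL memcpy-kind enum are assumptions; the hunk itself passes the CUDA enum, presumably through the project's compatibility layer):

    // Sketch: the same device-to-device copy in both runtimes.
    //   cudaMemcpyAsync(dst,           src, count, kind, stream)
    //   aclrtMemcpyAsync(dst, destMax, src, count, kind, stream)   <- extra capacity argument
    #include <cuda_runtime.h>

    void d2d_copy(void* y_ptr, const void* x_ptr, size_t size, cudaStream_t stream) {
        // Active path in the hunk: asynchronous CUDA copy on the given stream.
        cudaMemcpyAsync(y_ptr, x_ptr, size, cudaMemcpyDeviceToDevice, stream);
        // ACL form mirrored from the commented lines (requires an aclrtStream):
        // aclrtMemcpyAsync(y_ptr, size, x_ptr, size, ACL_MEMCPY_DEVICE_TO_DEVICE, acl_stream);
        // aclrtSynchronizeStream(acl_stream);
    }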

View File

@@ -123,8 +123,11 @@ void FetchOp::run() {
new (&allocation) Allocation(&cuda_dual_allocator, v->size);
// mostly device to device
#if IS_CUDA
// checkCudaErrors(cudaMemcpyAsync(
// allocation.ptr, v->mem_ptr, v->size, cudaMemcpyDefault, stream));
checkCudaErrors(cudaMemcpyAsync(
allocation.ptr, v->mem_ptr, v->size, cudaMemcpyDefault, stream));
checkCudaErrors(aclrtMemcpyAsync(
allocation.ptr, v->size, v->mem_ptr, v->size, cudaMemcpyDefault, aclstream));
checkCudaErrors(aclrtSynchronizeStream(aclstream));
#else
checkCudaErrors(cudaMemcpyAsync(
allocation.ptr, v->mem_ptr, v->size, cudaMemcpyDeviceToDevice, stream));
@@ -132,8 +135,11 @@ void FetchOp::run() {
auto host_ptr = cuda_dual_allocator.get_dual_allocation(
allocation.allocation).host_ptr;
// device to host
checkCudaErrors(cudaMemcpyAsync(
host_ptr, allocation.ptr, v->size, cudaMemcpyDeviceToHost, stream));
// checkCudaErrors(cudaMemcpyAsync(
// host_ptr, allocation.ptr, v->size, cudaMemcpyDeviceToHost, stream));
checkCudaErrors(aclrtMemcpyAsync(
host_ptr, v->size, allocation.ptr, v->size, cudaMemcpyDeviceToHost, aclstream));
checkCudaErrors(aclrtSynchronizeStream(aclstream));
allocation.ptr = host_ptr;
has_cuda_memcpy = true;
} else
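In FetchOp the device-to-host copy is switched to aclrtMemcpyAsync followed by aclrtSynchronizeStream. The synchronize is what makes the result safe to use: the fetch callback reads host_ptr on the CPU immediately afterwards, so the copy must have landed by then. A minimal sketch of that D2H pattern, assuming the CANN runtime and pinned host memory (as provided here by the dual allocator); the helper name is made up:

    // Sketch: async device-to-host copy that the host reads right afterwards.
    #include <acl/acl.h>

    bool fetch_to_host(void* host_ptr, const void* dev_ptr, size_t size, aclrtStream stream) {
        if (aclrtMemcpyAsync(host_ptr, size, dev_ptr, size,
                             ACL_MEMCPY_DEVICE_TO_HOST, stream) != ACL_SUCCESS)
            return false;
        // Without this sync the CPU could read host_ptr before the copy finishes.
        return aclrtSynchronizeStream(stream) == ACL_SUCCESS;
    }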

View File

@@ -330,6 +330,8 @@ void SetitemOp::jit_run() {
#else
if (op != ip)
checkCudaErrors(cudaMemcpyAsync(op, ip, out->size, cudaMemcpyDeviceToDevice, 0));
// checkCudaErrors(aclrtMemcpyAsync(op, out->size, ip, out->size, cudaMemcpyDeviceToDevice, aclstream));
// checkCudaErrors(aclrtSynchronizeStream(aclstream));
#endif
if (ns.get(GetitemOp::_inplace) &&