fix some bugs & add weight sync

cxjyxxme 2020-04-08 18:26:15 +08:00
parent f22a8ec1fe
commit 178d7b259a
4 changed files with 49 additions and 8 deletions

View File

@@ -42,7 +42,6 @@ void NcclBroadcastOp::jit_run() {
     auto* __restrict__ xp = x->ptr<Tx>();
     auto* __restrict__ yp = y->ptr<Tx>();
     checkCudaErrors(ncclBroadcast(xp, yp, size, ncclFloat, root, comm, 0));
-    checkCudaErrors(cudaStreamSynchronize(0));
 }
 #endif
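The only functional change above is the removal of the trailing cudaStreamSynchronize(0), presumably because later work enqueued on the same stream is already ordered after the collective, making the per-op host-side block redundant. The Python-level behavior is unchanged; a minimal sketch of it (assuming a CUDA build of Jittor launched under mpirun with at least two ranks):

    import jittor as jt

    with jt.flag_scope(use_cuda=1):
        x = jt.random([5, 5])  # a different random tensor on each rank
        y = jt.compile_extern.nccl_ops.nccl_broadcast(x, 0)
        # after the broadcast, y on every rank equals rank 0's x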

View File

@@ -41,8 +41,7 @@ void NcclReduceOp::jit_run() {
     int size = 1 @for(i, 0, XDIM, * xshape@{i});
     auto* __restrict__ xp = x->ptr<Tx>();
     auto* __restrict__ yp = y->ptr<Tx>();
-    checkCudaErrors(ncclReduce(xp, yp, size, ncclFloat, root, comm, 0));
-    checkCudaErrors(cudaStreamSynchronize(0));
+    checkCudaErrors(ncclReduce(xp, yp, size, ncclFloat, ncclSum, root, comm, 0));
 }
 #endif
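This hunk fixes a real bug, not just the redundant synchronize: NCCL's ncclReduce takes (sendbuff, recvbuff, count, datatype, op, root, comm, stream), so the old call was passing root in the ncclRedOp_t slot. The fix supplies ncclSum explicitly. A rough sketch of what the fixed op computes, mirroring the updated test below (hypothetical driver code, assuming n MPI ranks):

    import jittor as jt

    with jt.flag_scope(use_cuda=1):
        x = jt.random([5, 5])
        y = jt.compile_extern.nccl_ops.nccl_reduce(x, 0)  # sum-reduce to root 0
        # only rank 0's result is meaningful: the elementwise sum of x over all ranks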

View File

@@ -161,6 +161,14 @@ class SGD(object):
         # sync such parameters to reduce memory consumption
         jt.sync(self.no_grad_parameters)
 
+    def sync(self):
+        ps = self.parameters
+        for p in ps:
+            temp = jt.compile_extern.nccl_ops.nccl_broadcast(p, 0)
+            p -= p
+            p += temp
+            p.detach_inplace()
+
 class Adam(object):
     """ Usage:
         optimizer = nn.Adam(model.parameters(), lr)
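The new sync() method broadcasts every parameter from rank 0 so that all replicas start from identical weights. Since nccl_broadcast returns a fresh variable, the method writes the result back through in-place arithmetic (p -= p, then p += temp) to keep the existing parameter object, and detach_inplace() then cuts those ops out of the computation graph so they are neither replayed nor differentiated later. A hedged usage sketch, reusing the Model and hyperparameters from the test added below (assumes an mpirun launch with NCCL ops available):

    net = Model()
    opt = nn.SGD(net.parameters(), 0.1, 0.9, 0.00001)
    opt.sync()  # every rank now holds rank 0's parameter values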

View File

@@ -1,5 +1,6 @@
 # ***************************************************************
 # Copyright (c) 2020 Jittor. Authors:
+# Guoye Yang <498731903@qq.com>
 # Guowei Yang <471184555@qq.com>
 # Dun Liang <randonlang@gmail.com>.
 # All Rights Reserved.
@@ -10,12 +11,16 @@ import unittest
 import os, sys
 import jittor as jt
 import numpy as np
-from jittor import nn
+from jittor import nn, Module
+import copy
+
+n = 2
+
 def test_all_reduce():
     print("test all_reduce")
     x = jt.random([5, 5])
     y = jt.compile_extern.nccl_ops.nccl_all_reduce(x)
-    assert np.allclose(y.data, (x*3).data)
+    assert np.allclose(y.data, (x*n).data)
 
 def test_broadcast():
     print("test broadcast")
@@ -32,15 +37,45 @@ def test_reduce():
     print("test reduce")
     mpi = jt.compile_extern.mpi
     x = jt.random([5, 5])
-    y = jt.compile_extern.nccl_ops.nccl_all_reduce(x)
+    y = jt.compile_extern.nccl_ops.nccl_reduce(x, 0)
+    y_ = y.data
+    x_ = (x*n).data
     if mpi.world_rank() == 0:
-        assert np.allclose(y.data, (x*3).data)
+        assert np.allclose(y_, x_)
+
+class Model(Module):
+    def __init__(self):
+        self.linear1 = nn.Linear(3, 3)
+        self.linear2 = nn.Linear(3, 1024, False)
+    def execute(self, x):
+        x = self.linear1(x)
+        x = nn.relu(x)
+        return self.linear2(x)
+
+def test_sync():
+    mpi = jt.compile_extern.mpi
+    net = Model()
+    SGD = nn.SGD(net.parameters(), 0.1, 0.9, 0.00001)
+    if mpi.world_rank() == 0:
+        net.linear1.weight *= 0
+        net.linear2.weight *= 0
+        net.linear1.bias *= 0
+        net.linear1.weight += 1
+        net.linear2.weight += 1
+        net.linear1.bias += 1
+    SGD.sync()
+    assert np.allclose(net.linear1.weight.data, jt.ones(net.linear1.weight.shape).data)
+    assert np.allclose(net.linear2.weight.data, jt.ones(net.linear2.weight.shape).data)
+    assert np.allclose(net.linear1.bias.data, jt.ones(net.linear1.bias.shape).data)
 
 def main():
     np.random.seed(0)
     jt.set_seed(3)
     with jt.flag_scope(use_cuda=1):
         if jt.compile_extern.nccl_ops:
+            test_sync()
             test_all_reduce()
             test_broadcast()
             test_reduce()
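test_sync works by making rank 0's parameters exactly one (zero them out, then add one) while every other rank keeps its own values; the asserts can then pass on all ranks only if sync() really propagated rank 0's weights. The same invariant in isolation (a hypothetical standalone check, with mpi bound as in this module and run under the same mpirun launch):

    p = jt.random([3, 3])  # per-rank value
    if mpi.world_rank() == 0:
        p = p * 0 + 1  # root's copy: all ones
    p = jt.compile_extern.nccl_ops.nccl_broadcast(p, 0)
    assert np.allclose(p.data, np.ones((3, 3)))  # holds on every rank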
@@ -49,9 +84,9 @@ def main():
 class TestNcclOps(unittest.TestCase):
     def test(self):
         mpi = jt.compile_extern.mpi
-        if mpi.world_size() == 1:
+        if mpi.world_size() == 1 and n != 1:
             mpirun_path = jt.compiler.env_or_try_find('mpirun_path', 'mpirun')
-            cmd = f"{mpirun_path} -np 3 {sys.executable} -m jittor.test.test_nccl_ops"
+            cmd = f"{mpirun_path} -np {n} {sys.executable} -m jittor.test.test_nccl_ops"
             print("run cmd", cmd)
             jt.compiler.run_cmd(cmd)
         else:
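When the suite is started as a single process, the guard above re-executes it under MPI with n workers; inside that mpirun job world_size() equals n, so the else branch (truncated here) presumably runs the tests in-process instead. The generated command can also be issued by hand (assuming mpirun is on PATH, n = 2 as set above, and python3 standing in for sys.executable):

    mpirun -np 2 python3 -m jittor.test.test_nccl_ops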