mirror of https://github.com/Jittor/Jittor
Merge branch 'master' of https://github.com/Jittor/jittor
This commit is contained in:
commit
398746044a
|
@ -26,3 +26,4 @@ venv/
|
|||
python/jittor.egg-info
|
||||
dist/
|
||||
!doc/source/*
|
||||
__data__
|
|
@ -25,3 +25,4 @@ python/jittor.egg-info
|
|||
dist/
|
||||
!doc/source/*
|
||||
core
|
||||
__data__
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
exclude __data__
|
||||
exclude __pycache__
|
27
README.cn.md
27
README.cn.md
|
@ -217,13 +217,13 @@ jt.flags.use_cuda = 1
|
|||
```
|
||||
|
||||
|
||||
### 可选步骤五:进行完整测试
|
||||
### 可选步骤五:测试训练Resnet18
|
||||
|
||||
|
||||
要检查Jittor的完整性,您可以运行完整的测试。
|
||||
要检查Jittor的完整性,您可以运行Resnet18训练测试。
|
||||
|
||||
```bash
|
||||
python3.7 -m jittor.test -v
|
||||
python3.7 -m jittor.test.test_resnet
|
||||
```
|
||||
|
||||
如果这些测试失败,请为我们报告错误,我们十分欢迎您为Jittor做出贡献^ _ ^
|
||||
|
@ -360,10 +360,29 @@ Jittor还很年轻。 它可能存在错误和问题。 请在我们的错误跟
|
|||
|
||||
|
||||
|
||||
QQ 群:761222083
|
||||
|
||||
|
||||
|
||||
## 团队
|
||||
|
||||
|
||||
Jittor目前由来自[清华大学计算机图形学组](https://cg.cs.tsinghua.edu.cn/)的梁盾,杨国烨,杨国炜,周文洋和国孟昊等博士生维护。 如果您也对Jittor感兴趣并希望对其进行改进,请加入我们!
|
||||
Jittor目前由[清华大学计算机图形学组](https://cg.cs.tsinghua.edu.cn/)维护。 如果您也对Jittor感兴趣并希望对其进行改进,请加入我们!
|
||||
|
||||
|
||||
## 引用
|
||||
|
||||
```
|
||||
@article{hu2020jittor,
|
||||
title={Jittor: a novel deep learning framework with meta-operators and unified graph execution},
|
||||
author={Hu, Shi-Min and Liang, Dun and Yang, Guo-Ye and Yang, Guo-Wei and Zhou, Wen-Yang},
|
||||
journal={Information Sciences},
|
||||
volume={63},
|
||||
number={222103},
|
||||
pages={1--222103},
|
||||
year={2020}
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
## 版权声明
|
||||
|
|
28
README.md
28
README.md
|
@ -210,14 +210,14 @@ import jittor as jt
|
|||
jt.flags.use_cuda = 1
|
||||
```
|
||||
|
||||
### Optional Step 5: Run full tests
|
||||
### Optional Step 5: Test Resnet18 training
|
||||
|
||||
|
||||
To check the integrity of Jittor, you can run full tests.
|
||||
To check the integrity of Jittor, you can run Resnet18 training test.
|
||||
|
||||
|
||||
```bash
|
||||
python3.7 -m jittor.test -v
|
||||
python3.7 -m jittor.test.test_resnet
|
||||
```
|
||||
if those tests are failed, please report bugs for us, and feel free to contribute ^_^
|
||||
|
||||
|
@ -353,12 +353,32 @@ Email: jittor@qq.com
|
|||
|
||||
File an issue: https://github.com/Jittor/jittor/issues
|
||||
|
||||
QQ Group: 761222083
|
||||
|
||||
|
||||
<img src="https://cg.cs.tsinghua.edu.cn/jittor/images/news/2020-12-8-21-19-1_2_2/fig4.png" width="200"/>
|
||||
|
||||
## The Team
|
||||
|
||||
|
||||
Jittor is currently maintained by Dun Liang, Guo-Ye Yang, Guo-Wei Yang, Wen-Yang Zhou and Meng-Hao Guo etc. from the [Tsinghua CSCG Group](https://cg.cs.tsinghua.edu.cn/). If you are also interested in Jittor and want to improve it, Please join us!
|
||||
Jittor is currently maintained by the [Tsinghua CSCG Group](https://cg.cs.tsinghua.edu.cn/). If you are also interested in Jittor and want to improve it, Please join us!
|
||||
|
||||
|
||||
## Citation
|
||||
|
||||
|
||||
```
|
||||
@article{hu2020jittor,
|
||||
title={Jittor: a novel deep learning framework with meta-operators and unified graph execution},
|
||||
author={Hu, Shi-Min and Liang, Dun and Yang, Guo-Ye and Yang, Guo-Wei and Zhou, Wen-Yang},
|
||||
journal={Information Sciences},
|
||||
volume={63},
|
||||
number={222103},
|
||||
pages={1--222103},
|
||||
year={2020}
|
||||
}
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
|
||||
|
|
|
@ -223,7 +223,7 @@ sudo apt install python3.7 python3.7-dev
|
|||
|
||||
The whole framework is compiled Just-in-time. Let's install jittor via pip
|
||||
|
||||
整个框架是及时编译的。 让我们通过pip安装jittor
|
||||
整个框架是即时编译的。 让我们通过pip安装jittor
|
||||
|
||||
```bash
|
||||
git clone https://github.com/Jittor/jittor.git
|
||||
|
@ -263,16 +263,16 @@ import jittor as jt
|
|||
jt.flags.use_cuda = 1
|
||||
```
|
||||
|
||||
### Optional Step 5: Run full tests
|
||||
### Optional Step 5: Test Resnet18 training
|
||||
|
||||
### 可选步骤五:进行完整测试
|
||||
### 可选步骤五:测试训练Resnet18
|
||||
|
||||
To check the integrity of Jittor, you can run full tests.
|
||||
To check the integrity of Jittor, you can run Resnet18 training test.
|
||||
|
||||
要检查Jittor的完整性,您可以运行完整的测试。
|
||||
要检查Jittor的完整性,您可以运行Resnet18训练测试。
|
||||
|
||||
```bash
|
||||
python3.7 -m jittor.test -v
|
||||
python3.7 -m jittor.test.test_resnet
|
||||
```
|
||||
if those tests are failed, please report bugs for us, and feel free to contribute ^_^
|
||||
|
||||
|
@ -453,13 +453,35 @@ Email: jittor@qq.com
|
|||
|
||||
File an issue: https://github.com/Jittor/jittor/issues
|
||||
|
||||
QQ Group: 761222083
|
||||
|
||||
QQ 群:761222083
|
||||
|
||||
<img src="https://cg.cs.tsinghua.edu.cn/jittor/images/news/2020-12-8-21-19-1_2_2/fig4.png" width="200"/>
|
||||
|
||||
## The Team
|
||||
|
||||
## 团队
|
||||
|
||||
Jittor is currently maintained by Dun Liang, Guo-Ye Yang, Guo-Wei Yang, Wen-Yang Zhou and Meng-Hao Guo etc. from the [Tsinghua CSCG Group](https://cg.cs.tsinghua.edu.cn/). If you are also interested in Jittor and want to improve it, Please join us!
|
||||
Jittor is currently maintained by the [Tsinghua CSCG Group](https://cg.cs.tsinghua.edu.cn/). If you are also interested in Jittor and want to improve it, Please join us!
|
||||
|
||||
Jittor目前由来自[清华大学计算机图形学组](https://cg.cs.tsinghua.edu.cn/)的梁盾,杨国烨,杨国炜,周文洋和国孟昊等博士生维护。 如果您也对Jittor感兴趣并希望对其进行改进,请加入我们!
|
||||
Jittor目前由[清华大学计算机图形学组](https://cg.cs.tsinghua.edu.cn/)维护。 如果您也对Jittor感兴趣并希望对其进行改进,请加入我们!
|
||||
|
||||
## Citation
|
||||
|
||||
## 引用
|
||||
|
||||
```
|
||||
@article{hu2020jittor,
|
||||
title={Jittor: a novel deep learning framework with meta-operators and unified graph execution},
|
||||
author={Hu, Shi-Min and Liang, Dun and Yang, Guo-Ye and Yang, Guo-Wei and Zhou, Wen-Yang},
|
||||
journal={Information Sciences},
|
||||
volume={63},
|
||||
number={222103},
|
||||
pages={1--222103},
|
||||
year={2020}
|
||||
}
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
|
|
|
@ -10,6 +10,8 @@
|
|||
#include "cub_where_op.h"
|
||||
#ifdef JIT_cuda
|
||||
#include "executor.h"
|
||||
#include <cuda_runtime.h>
|
||||
#include "helper_cuda.h"
|
||||
#include <assert.h>
|
||||
#include <executor.h>
|
||||
#include <cub/cub.cuh>
|
||||
|
@ -56,17 +58,6 @@ struct NonZeroOp
|
|||
}
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct ConvertOp
|
||||
{
|
||||
const T div;
|
||||
const T dim_size;
|
||||
ConvertOp(T _div,T dim_size): div(_div),dim_size(dim_size){}
|
||||
__host__ __device__ __forceinline__ T operator()(const T& val) const {
|
||||
return (val/div) % dim_size;
|
||||
}
|
||||
};
|
||||
|
||||
__global__ static void where_kernel(
|
||||
int n,
|
||||
To* input
|
||||
|
@ -90,30 +81,25 @@ void CubWhereOp::jit_run(){
|
|||
int N = cond->num;
|
||||
size_t temp_storage_bytes=0;
|
||||
size_t num_nonzeros_allocation;
|
||||
auto num_nonzeros = exe.allocator->alloc(sizeof(int), num_nonzeros_allocation);
|
||||
cub::TransformInputIterator<bool, NonZeroOp<Ti>, Ti*> itr(cond->ptr<Ti>(), NonZeroOp<Ti>());
|
||||
cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, itr, (int *)num_nonzeros, N);
|
||||
|
||||
auto num_nonzeros = exe.allocator->alloc(sizeof(To), num_nonzeros_allocation);
|
||||
|
||||
size_t temp_storage_allocation;
|
||||
auto temp_storage = exe.allocator->alloc(temp_storage_bytes, temp_storage_allocation);
|
||||
|
||||
cub::DeviceReduce::Sum(temp_storage, temp_storage_bytes, itr, (int *)num_nonzeros, N);
|
||||
exe.allocator->free(temp_storage, temp_storage_bytes, temp_storage_allocation);
|
||||
|
||||
int num_nonzeros_h;
|
||||
checkCudaErrors(cudaMemcpy(&num_nonzeros_h, num_nonzeros, sizeof(int), cudaMemcpyDeviceToHost));
|
||||
void* temp_storage;
|
||||
|
||||
To* out_temp = outs[0]->ptr<To>();
|
||||
|
||||
@for(i, 0, NDIM, outs[@i]->set_shape({num_nonzeros_h});)
|
||||
|
||||
cub::CountingInputIterator<To> counting_itr(0);
|
||||
cub::TransformInputIterator<bool, NonZeroOp<Ti>, Ti*> itr(cond->ptr<Ti>(), NonZeroOp<Ti>());
|
||||
temp_storage_bytes = 0;
|
||||
cub::DeviceSelect::Flagged(nullptr, temp_storage_bytes, counting_itr, itr,out_temp, (int*)num_nonzeros, N);
|
||||
checkCudaErrors(cub::DeviceSelect::Flagged(nullptr, temp_storage_bytes, counting_itr, itr, out_temp, (To*)num_nonzeros, N));
|
||||
temp_storage = exe.allocator->alloc(temp_storage_bytes, temp_storage_allocation);
|
||||
cub::DeviceSelect::Flagged(temp_storage, temp_storage_bytes, counting_itr, itr,out_temp, (int*)num_nonzeros, N);
|
||||
checkCudaErrors(cub::DeviceSelect::Flagged(temp_storage, temp_storage_bytes, counting_itr, itr,out_temp, (To*)num_nonzeros, N));
|
||||
exe.allocator->free(temp_storage, temp_storage_bytes, temp_storage_allocation);
|
||||
|
||||
To num_nonzeros_h;
|
||||
cudaMemcpy(&num_nonzeros_h, num_nonzeros, sizeof(To), cudaMemcpyDeviceToHost);
|
||||
@for(i, 0, NDIM, outs[@i]->set_shape({num_nonzeros_h});)
|
||||
|
||||
if (num_nonzeros_h > 0 && NDIM > 1) {
|
||||
int thread_num = std::min(1024, num_nonzeros_h);
|
||||
int block_num = std::max(1, num_nonzeros_h/1024);
|
||||
|
|
|
@ -39,11 +39,17 @@ VarPtr CublasBatchedMatmulOp::grad(Var* out, Var* dout, Var* v, int v_index) {
|
|||
// a [b,n,m] b [b,m,k], c[b,n,k]
|
||||
// c = a*b
|
||||
if (v_index == 0) {
|
||||
// da = dc*b^T
|
||||
return make_cublas_batched_matmul(dout, b, trans_a^0, trans_b^1);
|
||||
if (trans_a)
|
||||
return make_cublas_batched_matmul(b, dout, trans_b, 1);
|
||||
else
|
||||
// da = dc*b^T
|
||||
return make_cublas_batched_matmul(dout, b, 0, trans_b^1);
|
||||
} else {
|
||||
// db = a^T*dc
|
||||
return make_cublas_batched_matmul(a, dout, trans_a^1, trans_b^0);
|
||||
if (trans_b)
|
||||
return make_cublas_batched_matmul(dout, a, 1, trans_a);
|
||||
else
|
||||
// db = a^T*dc
|
||||
return make_cublas_batched_matmul(a, dout, trans_a^1, 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -58,7 +58,7 @@
|
|||
|
||||
// CUDA and CUBLAS functions
|
||||
#include <helper_functions.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
|
||||
#ifndef min
|
||||
#define min(a,b) ((a < b) ? a : b)
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cublas.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
|
||||
#ifdef CUBLAS_API_H_
|
||||
// cuBLAS API errors
|
||||
|
|
|
@ -15,8 +15,12 @@ namespace jittor {
|
|||
|
||||
extern cudnnHandle_t cudnn_handle;
|
||||
extern int max_cache_size;
|
||||
extern float max_workspace_ratio;
|
||||
|
||||
// @pyjt(set_algorithm_cache_size)
|
||||
void set_algorithm_cache_size(int size);
|
||||
|
||||
// @pyjt(set_max_workspace_ratio)
|
||||
void set_max_workspace_ratio(float64 ratio);
|
||||
|
||||
} // jittor
|
||||
|
|
|
@ -198,7 +198,7 @@ void CudnnConvBackwardWOp::jit_run() {
|
|||
size_t sz;
|
||||
cudnnStatus_t ret = cudnnGetConvolutionBackwardFilterWorkspaceSize(handle_, cudnnIdesc, cudnnOdesc, cudnnConvDesc, cudnnFdesc, algos[i], &sz);
|
||||
// continue if use too much workspace
|
||||
if (sz*4 > mem_info.total_cuda_ram) continue;
|
||||
if (sz > mem_info.total_cuda_ram * max_workspace_ratio) continue;
|
||||
if (CUDNN_STATUS_SUCCESS == ret && sz > max_ws_size) max_ws_size = sz;
|
||||
}
|
||||
size_t allocation;
|
||||
|
|
|
@ -178,7 +178,7 @@ void CudnnConvBackwardXOp::jit_run() {
|
|||
CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD,
|
||||
CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED
|
||||
};
|
||||
int num_algos = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
|
||||
int num_algos = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
|
||||
int perf_count;
|
||||
cudnnConvolutionBwdDataAlgoPerf_t perf_results[num_algos];
|
||||
cudnnConvolutionBwdDataAlgo_t algo;
|
||||
|
@ -199,7 +199,7 @@ void CudnnConvBackwardXOp::jit_run() {
|
|||
size_t sz;
|
||||
cudnnStatus_t ret = cudnnGetConvolutionBackwardDataWorkspaceSize(handle_, cudnnFdesc, cudnnOdesc, cudnnConvDesc, cudnnIdesc, algos[i], &sz);
|
||||
// continue if use too much workspace
|
||||
if (sz*4 > mem_info.total_cuda_ram) continue;
|
||||
if (sz > mem_info.total_cuda_ram * max_workspace_ratio) continue;
|
||||
if (CUDNN_STATUS_SUCCESS == ret && sz > max_ws_size) max_ws_size = sz;
|
||||
}
|
||||
size_t allocation;
|
||||
|
|
|
@ -203,7 +203,7 @@ void CudnnConvOp::jit_run() {
|
|||
handle_, cudnnIdesc, cudnnFdesc, cudnnConvDesc,
|
||||
cudnnOdesc, algos[i], &sz);
|
||||
// continue if use too much workspace
|
||||
if (sz*4 > mem_info.total_cuda_ram) continue;
|
||||
if (sz > mem_info.total_cuda_ram * max_workspace_ratio) continue;
|
||||
if (CUDNN_STATUS_SUCCESS == ret && sz > max_ws_size) max_ws_size = sz;
|
||||
}
|
||||
size_t allocation;
|
||||
|
|
|
@ -65,7 +65,7 @@
|
|||
#include <assert.h>
|
||||
|
||||
#include <cudnn.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
#include "fp16_dev.h"
|
||||
#include "fp16_emu.h"
|
||||
|
||||
|
|
|
@ -10,11 +10,16 @@ namespace jittor {
|
|||
|
||||
cudnnHandle_t cudnn_handle;
|
||||
int max_cache_size = 100;
|
||||
float max_workspace_ratio = 0.25;
|
||||
|
||||
void set_algorithm_cache_size(int size) {
|
||||
max_cache_size = size;
|
||||
}
|
||||
|
||||
void set_max_workspace_ratio(float64 ratio) {
|
||||
max_workspace_ratio = ratio;
|
||||
}
|
||||
|
||||
struct cudnn_initer {
|
||||
|
||||
inline cudnn_initer() {
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
#include <cudnn.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
|
||||
const char *_cudaGetErrorEnum(cudnnStatus_t error) {
|
||||
return cudnnGetErrorString(error);
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
#include "init.h"
|
||||
#include <cuda_runtime.h>
|
||||
#include <curand.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
#include "curand_random_op.h"
|
||||
#include "curand_warper.h"
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cublas.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
#include <curand.h>
|
||||
|
||||
// cuRAND API errors
|
||||
|
|
|
@ -6,16 +6,13 @@
|
|||
#include "var.h"
|
||||
#include "cutt_transpose_op.h"
|
||||
#include "ops/op_register.h"
|
||||
#include <iostream>
|
||||
|
||||
#ifdef JIT
|
||||
#include "cutt.h"
|
||||
#endif
|
||||
#include "cutt_warper.h"
|
||||
#include "misc/stack_vector.h"
|
||||
#include "helper_cuda.h"
|
||||
|
||||
namespace jittor {
|
||||
|
||||
#ifndef JIT
|
||||
static auto make_transpose = get_op_info("cutt_transpose")
|
||||
.get_constructor<VarPtr, Var*, NanoVector>();
|
||||
|
||||
|
@ -58,52 +55,49 @@ VarPtr CuttTransposeOp::grad(Var* out, Var* dout, Var* v, int v_index) {
|
|||
return make_transpose(dout, reverse);
|
||||
}
|
||||
|
||||
void CuttTransposeOp::jit_prepare(JK& jk) {
|
||||
jk << _CS("[Tx:") << x->dtype();
|
||||
jk << _CS("][DIM=") << JK::hex1(axes.size());
|
||||
for (uint i=0; i<axes.size(); i++)
|
||||
jk << _CS("][AXES") << JK::hex1(axes[i]) << '=' << JK::hex1(i);
|
||||
jk << ']';
|
||||
}
|
||||
unordered_map<string, unsigned int> cutt_plan_cache;
|
||||
|
||||
#else // JIT
|
||||
#ifdef JIT_cuda
|
||||
|
||||
extern unordered_map<string, unsigned int> cutt_plan_cache;
|
||||
|
||||
void CuttTransposeOp::jit_run() {
|
||||
auto* __restrict__ xp = x->ptr<Tx>();
|
||||
auto* __restrict__ yp = y->ptr<Tx>();
|
||||
vector<int> permutation, permutation2;
|
||||
vector<int> y_shape;
|
||||
vector<int> x_shape;
|
||||
@for(i, 0, DIM, permutation.push_back(DIM-1-AXES@i);)
|
||||
@for(i, 0, DIM, permutation2.push_back(permutation[DIM-1-@i@@]);)
|
||||
std::vector<int> reverse;
|
||||
reverse.reserve(permutation2.size());
|
||||
for (uint i=0; i<permutation2.size(); i++)
|
||||
reverse[permutation2[i]] = i;
|
||||
|
||||
@for(i, 0, DIM, x_shape.push_back(x->shape[DIM-1-@i@@]);)
|
||||
|
||||
void CuttTransposeOp::run() {
|
||||
auto* __restrict__ xp = x->mem_ptr;
|
||||
auto* __restrict__ yp = y->mem_ptr;
|
||||
StackVector<int> x_shape;
|
||||
StackVector<int> new_shape, new_axes, trans, reverse;
|
||||
int dim = x->shape.size();
|
||||
for (int i=0; i<dim; i++) {
|
||||
trans[i] = new_shape.size();
|
||||
if (x->shape[i] != 1)
|
||||
new_shape.push_back(x->shape[i]);
|
||||
}
|
||||
for (int i = 0; i < dim; ++i) {
|
||||
if (x->shape[axes[i]] != 1) {
|
||||
new_axes.push_back(trans[axes[i]]);
|
||||
}
|
||||
}
|
||||
dim = new_shape.size();
|
||||
for (int i=0; i<dim; i++)
|
||||
reverse[i] = dim-1-new_axes[dim-1-i];
|
||||
for (int i=0; i<dim; i++)
|
||||
x_shape[i] = new_shape[dim-1-i];
|
||||
if (dim == 1) {
|
||||
checkCudaErrors(cudaMemcpyAsync(yp, xp, x->size, cudaMemcpyDefault, 0));
|
||||
return;
|
||||
}
|
||||
jk.clear();
|
||||
jk << @DIM << ",";
|
||||
for (uint i=0; i<@DIM; i++) jk << x_shape[i] << ",";
|
||||
for (uint i=0; i<@DIM; i++) jk << reverse[i] << ",";
|
||||
jk << sizeof(Tx) << ".";
|
||||
jk << dim << ',';
|
||||
for (int i=0; i<dim; i++) jk << x_shape[i] << ',';
|
||||
for (int i=0; i<dim; i++) jk << reverse[i] << ',';
|
||||
jk << x->dtype().dsize() << '.';
|
||||
auto iter = cutt_plan_cache.find(jk.to_string());
|
||||
LOGvvv << "Run cutt_transpose with key:" << jk.to_string();
|
||||
|
||||
if (iter!=cutt_plan_cache.end()){
|
||||
cuttExecute(iter->second, xp, yp);
|
||||
} else {
|
||||
cuttHandle plan;
|
||||
cuttPlan(&plan, @DIM, x_shape.data(), reverse.data(), sizeof(Tx), 0);
|
||||
cuttPlan(&plan, dim, x_shape.data(), reverse.data(), x->dtype().dsize(), 0);
|
||||
cutt_plan_cache[jk.to_string()] = plan;
|
||||
cuttExecute(plan, xp, yp);
|
||||
}
|
||||
}
|
||||
#endif // JIT_cuda
|
||||
#endif // JIT
|
||||
|
||||
} // jittor
|
|
@ -19,7 +19,7 @@ struct CuttTransposeOp : Op {
|
|||
const char* name() const override { return "cutt_transpose"; }
|
||||
VarPtr grad(Var* out, Var* dout, Var* v, int v_index) override;
|
||||
void infer_shape() override;
|
||||
DECLARE_jit_run;
|
||||
void run() override;
|
||||
};
|
||||
|
||||
} // jittor
|
|
@ -101,11 +101,17 @@ const char *_cudaGetErrorEnum(NppStatus error);
|
|||
#endif
|
||||
#endif
|
||||
|
||||
namespace jittor {
|
||||
extern bool peek_logged;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void peek(T result, char const *const func, const char *const file,
|
||||
int const line) {
|
||||
if (result) {
|
||||
// DEVICE_RESET
|
||||
if (jittor::peek_logged) return;
|
||||
jittor::peek_logged = 1;
|
||||
LOGe << "Peek CUDA error at" << file >> ":" >> line << " code="
|
||||
>> static_cast<unsigned int>(result) >> "(" << _cudaGetErrorEnum(result) << ")"
|
||||
<< func;
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
|
||||
#include <cuda_runtime.h>
|
||||
#include <nccl.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
|
||||
namespace jittor {
|
||||
|
||||
|
|
|
@ -12,7 +12,7 @@
|
|||
|
||||
#include <nccl.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
#include "nccl_warper.h"
|
||||
#include "ops/op_register.h"
|
||||
namespace jittor {
|
||||
|
|
|
@ -12,7 +12,7 @@
|
|||
|
||||
#include <nccl.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
#include "nccl_warper.h"
|
||||
#include "ops/op_register.h"
|
||||
namespace jittor {
|
||||
|
|
|
@ -12,7 +12,7 @@
|
|||
|
||||
#include <nccl.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
#include "nccl_warper.h"
|
||||
#include "ops/op_register.h"
|
||||
namespace jittor {
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
// These are CUDA Helper functions for initialization and error checking
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
|
||||
|
||||
#ifdef _CUFFT_H_
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
# This file is subject to the terms and conditions defined in
|
||||
# file 'LICENSE.txt', which is part of this source code package.
|
||||
# ***************************************************************
|
||||
__version__ = '1.2.1.2'
|
||||
__version__ = '1.2.2.5'
|
||||
from . import lock
|
||||
with lock.lock_scope():
|
||||
ori_int = int
|
||||
|
@ -33,9 +33,38 @@ from collections import OrderedDict
|
|||
from collections.abc import Sequence, Mapping
|
||||
import types
|
||||
import pickle
|
||||
import sys
|
||||
import hashlib
|
||||
import sys, os
|
||||
import traceback
|
||||
|
||||
|
||||
def safepickle(obj, path):
|
||||
s = pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)
|
||||
checksum = hashlib.sha1(s).digest()
|
||||
s += bytes(checksum)
|
||||
s += b"HCAJSLHD"
|
||||
with open(path, 'wb') as f:
|
||||
f.write(s)
|
||||
|
||||
def safeunpickle(path):
|
||||
if path.startswith("jittorhub://"):
|
||||
path = path.replace("jittorhub://", "https://cg.cs.tsinghua.edu.cn/jittor/assets/build/checkpoints/")
|
||||
if path.startswith("https:") or path.startswith("http:"):
|
||||
base = path.split("/")[-1]
|
||||
fname = os.path.join(compiler.ck_path, base)
|
||||
from jittor.utils.misc import download_url_to_local
|
||||
download_url_to_local(path, base, compiler.ck_path, None)
|
||||
path = fname
|
||||
with open(path, "rb") as f:
|
||||
s = f.read()
|
||||
if not s.endswith(b"HCAJSLHD"):
|
||||
return pickle.loads(s)
|
||||
checksum = s[-28:-8]
|
||||
s = s[:-28]
|
||||
if hashlib.sha1(s).digest() != checksum:
|
||||
raise ValueError("Pickle checksum does not match! path: "+path)
|
||||
return pickle.loads(s)
|
||||
|
||||
class _call_no_record_scope:
|
||||
def __enter__(self): pass
|
||||
def __exit__(self, *exc): pass
|
||||
|
@ -92,6 +121,7 @@ class log_capture_scope(_call_no_record_scope):
|
|||
print(logs)
|
||||
"""
|
||||
def __init__(self, **jt_flags):
|
||||
jt_flags["use_parallel_op_compiler"] = 0
|
||||
self.fs = flag_scope(**jt_flags)
|
||||
|
||||
def __enter__(self):
|
||||
|
@ -435,8 +465,15 @@ def display_memory_info():
|
|||
core.display_memory_info(fileline)
|
||||
|
||||
def load(path):
|
||||
pkl_file = open(path, 'rb')
|
||||
model_dict = pickle.load(pkl_file)
|
||||
if path.endswith(".pth"):
|
||||
try:
|
||||
dirty_fix_pytorch_runtime_error()
|
||||
import torch
|
||||
except:
|
||||
raise RuntimeError("pytorch need to be installed when load pth format.")
|
||||
model_dict = torch.load(path, map_location=torch.device('cpu'))
|
||||
else:
|
||||
model_dict = safeunpickle(path)
|
||||
return model_dict
|
||||
|
||||
def _uniq(x):
|
||||
|
@ -559,6 +596,21 @@ class Module:
|
|||
return ret
|
||||
self.__class__.__call__ = new_call
|
||||
|
||||
def register_pre_forward_hook(self, func):
|
||||
cls = self.__class__
|
||||
self.__fhook2__ = func
|
||||
if hasattr(cls, "__hooked2__"):
|
||||
return
|
||||
cls.__hooked2__ = True
|
||||
origin_call = cls.__call__
|
||||
def new_call(self, *args, **kw):
|
||||
if hasattr(self, "__fhook2__"):
|
||||
if len(kw):
|
||||
self.__fhook2__(self, args, kw)
|
||||
else:
|
||||
self.__fhook2__(self, args)
|
||||
return origin_call(self, *args, **kw)
|
||||
self.__class__.__call__ = new_call
|
||||
|
||||
def children(self):
|
||||
cd = []
|
||||
|
@ -631,20 +683,10 @@ class Module:
|
|||
params_dict = {}
|
||||
for p in params:
|
||||
params_dict[p.name()] = p.data
|
||||
with open(path, 'wb') as f:
|
||||
pickle.dump(params_dict, f, pickle.HIGHEST_PROTOCOL)
|
||||
safepickle(params_dict, path)
|
||||
|
||||
def load(self, path):
|
||||
if path.endswith(".pth"):
|
||||
try:
|
||||
dirty_fix_pytorch_runtime_error()
|
||||
import torch
|
||||
except:
|
||||
raise RuntimeError("pytorch need to be installed when load pth format.")
|
||||
self.load_parameters(torch.load(path, map_location=torch.device('cpu')))
|
||||
return
|
||||
with open(path, 'rb') as f:
|
||||
self.load_parameters(pickle.load(f))
|
||||
self.load_parameters(load(path))
|
||||
|
||||
def eval(self):
|
||||
def callback(parents, k, v, n):
|
||||
|
@ -789,6 +831,11 @@ can also be None)::
|
|||
def dfs(self, parents, k, callback, callback_leave=None):
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def apply(cls, *args, **kw):
|
||||
func = cls()
|
||||
return func(*args, **kw)
|
||||
|
||||
|
||||
def make_module(func, exec_n_args=1):
|
||||
class MakeModule(Module):
|
||||
|
@ -864,8 +911,6 @@ def size(v, dim=None):
|
|||
return v.shape[dim]
|
||||
Var.size = size
|
||||
|
||||
def item(v):
|
||||
return v.data.item()
|
||||
|
||||
def to_int(v):
|
||||
dtype = str(v.dtype)
|
||||
|
@ -882,11 +927,15 @@ def to_bool(v):
|
|||
assert dtype.startswith("int") or dtype=="bool"
|
||||
return ori_bool(v.item())
|
||||
|
||||
Var.item = item
|
||||
Var.__int__ = to_int
|
||||
Var.__float__ = to_float
|
||||
Var.__bool__ = to_bool
|
||||
|
||||
def format(v, spec):
|
||||
return v.item().__format__(spec)
|
||||
Var.__format__ = format
|
||||
|
||||
|
||||
int = int32
|
||||
Var.int = Var.int32
|
||||
float = float32
|
||||
|
|
|
@ -78,29 +78,37 @@ def setup_mkl():
|
|||
|
||||
|
||||
def install_cub(root_folder):
|
||||
url = "https://github.com/NVlabs/cub/archive/v1.8.0.tar.gz"
|
||||
filename = "cub-1.8.0.tgz"
|
||||
url = "https://github.com/NVIDIA/cub/archive/1.11.0.tar.gz"
|
||||
filename = "cub-1.11.0.tgz"
|
||||
md5 = "97196a885598e40592100e1caaf3d5ea"
|
||||
fullname = os.path.join(root_folder, filename)
|
||||
dirname = os.path.join(root_folder, filename.replace(".tgz",""))
|
||||
|
||||
if not os.path.isfile(os.path.join(dirname, "examples", "test")):
|
||||
LOG.i("Downloading cub...")
|
||||
download_url_to_local(url, filename, root_folder, "9203ea2499b56782601fddf8a12e9b08")
|
||||
download_url_to_local(url, filename, root_folder, md5)
|
||||
import tarfile
|
||||
|
||||
with tarfile.open(fullname, "r") as tar:
|
||||
tar.extractall(root_folder)
|
||||
assert 0 == os.system(f"cd {dirname}/examples && "
|
||||
f"{nvcc_path} device/example_device_radix_sort.cu -O2 -I.. -o test")
|
||||
f"{nvcc_path} device/example_device_radix_sort.cu -O2 -I.. -std=c++14 -o test")
|
||||
if core.get_device_count():
|
||||
assert 0 == os.system(f"cd {dirname}/examples && ./test")
|
||||
return dirname
|
||||
|
||||
def setup_cub():
|
||||
global cub_home
|
||||
cub_home = ""
|
||||
from pathlib import Path
|
||||
cub_path = os.path.join(str(Path.home()), ".cache", "jittor", "cub")
|
||||
cub_home = install_cub(cub_path)
|
||||
setup_cuda_lib("cub", link=False, extra_flags=f"-I{cub_home}")
|
||||
cuda_version = int(get_version(nvcc_path)[1:-1].split('.')[0])
|
||||
extra_flags = ""
|
||||
if cuda_version < 11:
|
||||
cub_home = install_cub(cub_path)
|
||||
extra_flags = f"-I{cub_home}"
|
||||
cub_home += "/"
|
||||
setup_cuda_lib("cub", link=False, extra_flags=extra_flags)
|
||||
|
||||
def setup_cuda_extern():
|
||||
if not has_cuda: return
|
||||
|
|
|
@ -894,10 +894,12 @@ make_cache_dir(cache_path)
|
|||
make_cache_dir(os.path.join(cache_path, "jit"))
|
||||
make_cache_dir(os.path.join(cache_path, "obj_files"))
|
||||
make_cache_dir(os.path.join(cache_path, "gen"))
|
||||
ck_path = os.path.join(cache_path, "checkpoints")
|
||||
make_cache_dir(ck_path)
|
||||
|
||||
# build cache_compile
|
||||
cc_flags += pybind_include
|
||||
cc_flags += f" -I{jittor_path}/src "
|
||||
cc_flags += pybind_include
|
||||
check_cache_compile()
|
||||
LOG.v(f"Get cache_compile: {jit_utils.cc}")
|
||||
|
||||
|
@ -981,10 +983,11 @@ assert libname is not None, "openmp library not found"
|
|||
ctypes.CDLL(libname, os.RTLD_NOW | os.RTLD_GLOBAL)
|
||||
|
||||
version_file = os.path.join(jittor_path, "version")
|
||||
if os.path.isfile(version_file):
|
||||
if os.path.isfile(version_file) and not os.path.isdir(os.path.join(jittor_path, "src", "__data__")):
|
||||
with open(version_file, 'r') as f:
|
||||
version = f.read().strip()
|
||||
key = f"{version}-{cc_type}-{'cuda' if has_cuda else 'cpu'}.o"
|
||||
# key = f"{version}-{cc_type}-{'cuda' if has_cuda else 'cpu'}.o"
|
||||
key = f"{version}-g++-cpu.o"
|
||||
# TODO: open the website
|
||||
extra_obj = os.path.join(cache_path, key)
|
||||
url = os.path.join("https://cg.cs.tsinghua.edu.cn/jittor/assets/build/"+key)
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
import jittor as jt
|
||||
import numpy as np
|
||||
from jittor import pool
|
||||
from collections.abc import Sequence
|
||||
|
||||
def argmax_pool(x, size, stride, padding=0):
|
||||
return pool.pool(x, size, 'maximum', padding, stride)
|
||||
|
@ -180,8 +181,14 @@ jt.Var.__setitem__ = setitem
|
|||
def getitem(x, slices):
|
||||
if isinstance(slices, jt.Var) and slices.dtype == "bool":
|
||||
return getitem(x, slices.where())
|
||||
if isinstance(slices, list):
|
||||
slices = tuple(slices)
|
||||
if isinstance(slices, Sequence):
|
||||
ss = []
|
||||
for s in slices:
|
||||
if isinstance(s, jt.Var) and s.dtype == "bool":
|
||||
ss.extend(s.where())
|
||||
else:
|
||||
ss.append(s)
|
||||
slices = tuple(ss)
|
||||
return x.getitem(slices)
|
||||
|
||||
def setitem(x, slices, value):
|
||||
|
|
|
@ -101,7 +101,7 @@ class Dataset(object):
|
|||
Attrs:
|
||||
|
||||
* batch_size(int): batch size, default 16.
|
||||
* totol_len(int): totol lenght.
|
||||
* total_len(int): total lenght.
|
||||
* shuffle(bool): shuffle at each epoch, default False.
|
||||
* drop_last(bool): if true, the last batch of dataset might smaller than batch_size, default True.
|
||||
* num_workers: number of workers for loading data
|
||||
|
@ -267,6 +267,9 @@ Example::
|
|||
LOG.i('\n'.join(msg))
|
||||
|
||||
def _stop_all_workers(self):
|
||||
# stop workers
|
||||
for w in self.workers:
|
||||
w.buffer.stop()
|
||||
# wait until all workers idle
|
||||
if self.num_idle.value < self.num_workers:
|
||||
with self.gid.get_lock():
|
||||
|
|
|
@ -0,0 +1,321 @@
|
|||
# ***************************************************************
|
||||
# Copyright (c) 2020 Jittor. Authors:
|
||||
# Guoye Yang <498731903@qq.com>
|
||||
# Dun Liang <randonlang@gmail.com>.
|
||||
#
|
||||
# All Rights Reserved.
|
||||
# This file is subject to the terms and conditions defined in
|
||||
# file 'LICENSE.txt', which is part of this source code package.
|
||||
# ***************************************************************
|
||||
import jittor as jt
|
||||
from jittor import init
|
||||
from jittor import nn
|
||||
from jittor import Function
|
||||
|
||||
class DepthwiseConv(Function):
|
||||
def __init__(self, stride=1, padding=0, dilation=1):
|
||||
self.stride = stride if isinstance(stride, tuple) else (stride, stride)
|
||||
self.padding = padding if isinstance(padding, tuple) else (padding, padding)
|
||||
self.dilation = dilation if isinstance(dilation, tuple) else (dilation, dilation)
|
||||
|
||||
def execute(self, x, weight):
|
||||
self.save_vars = x, weight
|
||||
N,C,H,W = x.shape
|
||||
o,i,Kh,Kw = weight.shape
|
||||
assert(o == C)
|
||||
oh = (H+self.padding[0]*2-Kh*self.dilation[0]+self.dilation[0]-1)//self.stride[0]+1
|
||||
ow = (W+self.padding[1]*2-Kw*self.dilation[1]+self.dilation[1]-1)//self.stride[1]+1
|
||||
filter_height, filter_width = Kh, Kw
|
||||
self.Khw = Kh, Kw
|
||||
output = jt.code(
|
||||
[N, C, oh, ow],
|
||||
x.dtype,
|
||||
[x, weight],
|
||||
cuda_header = """
|
||||
template <typename T,
|
||||
int filter_height,
|
||||
int filter_width,
|
||||
int stride_height,
|
||||
int stride_width>
|
||||
__global__ void KernelDepthwiseConv(
|
||||
const T *const input_data, const T *const filter_data, const int batch_size,
|
||||
const int output_channels, const int output_height,
|
||||
const int output_width, const int input_channels,
|
||||
const int input_height, const int input_width,
|
||||
const int padding_height, const int padding_width,
|
||||
const int dilate_height, const int dilate_width, T *const output_data) {
|
||||
const int kWeghtSize = filter_height * filter_width;
|
||||
T r_weight[kWeghtSize];
|
||||
const int batch = blockIdx.y;
|
||||
const int c_out = blockIdx.x;
|
||||
const T* weight = filter_data + c_out * filter_height * filter_width;
|
||||
for (int i = 0; i < filter_height * filter_width; i++) r_weight[i] = weight[i];
|
||||
|
||||
for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) {
|
||||
for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) {
|
||||
const int batch = blockIdx.y;
|
||||
const int c_out = blockIdx.x;
|
||||
|
||||
const int c_in = c_out;
|
||||
T value = 0;
|
||||
const int h_in_start = -padding_height + h_out * stride_height;
|
||||
const int w_in_start = -padding_width + w_out * stride_width;
|
||||
const int h_in_end = h_in_start + filter_height * dilate_height;
|
||||
const int w_in_end = w_in_start + filter_width * dilate_width;
|
||||
|
||||
const int in_offset =
|
||||
((batch * input_channels + c_in) * input_height) * input_width;
|
||||
|
||||
const int h_end = h_in_end < input_height ? h_in_end : input_height;
|
||||
const int w_end = w_in_end < input_width ? w_in_end : input_width;
|
||||
const int h_start = h_in_start > 0 ? h_in_start : 0;
|
||||
const int w_start = w_in_start > 0 ? w_in_start : 0;
|
||||
|
||||
for (int h_in = h_in_start, h_f = 0; h_f < filter_height;
|
||||
h_in += dilate_height, h_f++) {
|
||||
for (int w_in = w_in_start, w_f = 0; w_f < filter_width;
|
||||
w_in += dilate_width, w_f++) {
|
||||
if (h_in >= 0 && h_in < input_height && w_in >= 0 &&
|
||||
w_in < input_width) {
|
||||
const int offset = in_offset + h_in * input_width + w_in;
|
||||
value += r_weight[h_f * filter_width + w_f] * input_data[offset];
|
||||
}
|
||||
}
|
||||
}
|
||||
int index =
|
||||
((batch * gridDim.x + c_out) * output_height + h_out) * output_width +
|
||||
w_out;
|
||||
output_data[index] = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
""",
|
||||
cuda_src=f"""
|
||||
@alias(input, in0)
|
||||
@alias(filter, in1)
|
||||
@alias(output, out)
|
||||
|
||||
const int batch_size = input_shape0;
|
||||
const int input_channels = input_shape1;
|
||||
const int input_height = input_shape2;
|
||||
const int input_width = input_shape3;
|
||||
const int output_channels = output_shape1;
|
||||
const int output_height = output_shape2;
|
||||
const int output_width = output_shape3;
|
||||
const int ksize_height = {Kh};
|
||||
const int ksize_width = {Kw};
|
||||
const int stride_height = {self.stride[0]};
|
||||
const int stride_width = {self.stride[1]};
|
||||
const int padding_height = {self.padding[0]};
|
||||
const int padding_width = {self.padding[1]};
|
||||
const int dilate_height = {self.dilation[0]};
|
||||
const int dilate_width = {self.dilation[1]};
|
||||
|
||||
int thread = 512;
|
||||
if (output_width > 1024 && output_width <= 2048)
|
||||
thread = (output_width - 1) / 2 + 1;
|
||||
else if (output_width > 512 && output_width <= 1024)
|
||||
thread = output_width;
|
||||
int blocks = std::min(std::max(thread / output_width, 1), output_height);
|
||||
dim3 threads(std::min(output_width, thread), blocks, 1);
|
||||
dim3 grid(output_channels, batch_size, 1);
|
||||
KernelDepthwiseConv<
|
||||
input_type, ksize_height, ksize_width,
|
||||
stride_height, stride_width>
|
||||
<<<grid, threads>>>(
|
||||
input_p, filter_p, batch_size, output_channels, output_height,
|
||||
output_width, input_channels, input_height, input_width,
|
||||
padding_height, padding_width, dilate_height,
|
||||
dilate_width, output_p);
|
||||
"""
|
||||
)
|
||||
return output
|
||||
|
||||
def grad(self, grad):
|
||||
x, weight = self.save_vars
|
||||
Kh, Kw = self.Khw
|
||||
return jt.code([x.shape, weight.shape], [x.dtype, weight.dtype], [x, weight, grad],
|
||||
cuda_header = f"#include <{jt.compile_extern.cub_home}cub/cub.cuh>"+"""
|
||||
template <typename T>
|
||||
__device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
|
||||
typedef cub::WarpReduce<T> WarpReduce;
|
||||
typename WarpReduce::TempStorage temp_storage;
|
||||
value = WarpReduce(temp_storage).Sum(value);
|
||||
if (cub::LaneId() == 0)
|
||||
atomicAdd(sum, value);
|
||||
}
|
||||
|
||||
// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
|
||||
template <typename T,
|
||||
int filter_height,
|
||||
int filter_width,
|
||||
int stride_height,
|
||||
int stride_width>
|
||||
__global__ void KernelDepthwiseConvInputGradCFilter(
|
||||
const T *const input_data, const T *const output_grad_data,
|
||||
const T *const filter_data, const int batch_size,
|
||||
const int output_channels, const int output_height,
|
||||
const int output_width, const int input_channels,
|
||||
const int input_height, const int input_width,
|
||||
const int padding_height, const int padding_width,
|
||||
const int dilate_height, const int dilate_width,
|
||||
T *const input_grad_data) {
|
||||
const int kWeghtSize = filter_height * filter_width + 1;
|
||||
T r_weight[kWeghtSize];
|
||||
const int batch = blockIdx.y;
|
||||
const int c_in = blockIdx.x;
|
||||
|
||||
const T* weight = filter_data + c_in * filter_height * filter_width;
|
||||
for (int i = 0; i < filter_height * filter_width; i++)
|
||||
r_weight[i] =
|
||||
weight[filter_height * filter_width - i - 1];
|
||||
|
||||
for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) {
|
||||
for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) {
|
||||
const int batch = blockIdx.y;
|
||||
const int c_in = blockIdx.x;
|
||||
|
||||
int h_out_start = h_in - (filter_height - 1) * dilate_height + padding_height;
|
||||
|
||||
int w_out_start = w_in - (filter_width - 1) * dilate_width + padding_width;
|
||||
|
||||
T value = 0;
|
||||
int index =
|
||||
((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
|
||||
w_in;
|
||||
|
||||
for (int h_out = h_out_start, h_f = 0; h_f < filter_height;
|
||||
h_out += dilate_height, h_f++) {
|
||||
for (int w_out = w_out_start, w_f = 0; w_f < filter_width;
|
||||
w_out += dilate_width, w_f++) {
|
||||
int s_h_out = h_out / stride_height;
|
||||
int s_w_out = w_out / stride_width;
|
||||
if (h_out % stride_height == 0 && w_out % stride_width == 0 &&
|
||||
s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 &&
|
||||
s_w_out < output_width) {
|
||||
const int output_grad_offset =
|
||||
((batch * output_channels + c_in) * output_height +
|
||||
s_h_out) *
|
||||
output_width +
|
||||
s_w_out;
|
||||
value +=
|
||||
output_grad_data[output_grad_offset] *
|
||||
r_weight[h_f * filter_width + w_f];
|
||||
}
|
||||
}
|
||||
}
|
||||
input_grad_data[index] = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
|
||||
template <typename T>
|
||||
__global__ void KernelDepthwiseConvFilterGrad(
|
||||
const T* output_grad_data, const T* input_data, const int num,
|
||||
const int output_channels, const int output_height, const int output_width,
|
||||
const int input_channels, const int input_height, const int input_width,
|
||||
const int filter_height,
|
||||
const int filter_width, const int stride_height, const int stride_width,
|
||||
const int padding_height, const int padding_width, const int dilate_height,
|
||||
const int dilate_width, T* filter_grad_data) {
|
||||
T s = 0;
|
||||
|
||||
int gbid = (((blockIdx.z * blockDim.z + threadIdx.z) * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x;
|
||||
|
||||
for (int image_w = threadIdx.x; image_w < output_width;
|
||||
image_w += blockDim.x) {
|
||||
for (int bid = 0; bid < num; bid++) {
|
||||
//for (int bid = threadIdx.z; bid < num; bid+=blockDim.z) {
|
||||
for (int image_h = threadIdx.y; image_h < output_height;
|
||||
image_h += blockDim.y) {
|
||||
int kernel_id = blockIdx.z;
|
||||
int kernel_h = blockIdx.y * dilate_height - padding_height;
|
||||
int kernel_w = blockIdx.x * dilate_width - padding_width;
|
||||
|
||||
int image_hk = image_h * stride_height + kernel_h;
|
||||
int image_wk = image_w * stride_width + kernel_w;
|
||||
if (image_hk < 0 || image_hk >= input_height) continue;
|
||||
if (image_wk < 0 || image_wk >= input_width) continue;
|
||||
#define gaid(N, C, H, W) \
|
||||
((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W))
|
||||
int input_id = ((bid * gridDim.z +
|
||||
kernel_id) *
|
||||
input_height +
|
||||
image_hk) *
|
||||
input_width +
|
||||
image_wk;
|
||||
s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] *
|
||||
input_data[input_id];
|
||||
|
||||
#undef gaid
|
||||
}
|
||||
}
|
||||
}
|
||||
CudaAtomicAddWithWarp(&filter_grad_data[gbid], s);
|
||||
}
|
||||
""",
|
||||
cuda_src=f"""
|
||||
// source for backward to data
|
||||
@alias(input, in0)
|
||||
@alias(filter, in1)
|
||||
@alias(output_grad, in2)
|
||||
@alias(input_grad, out0)
|
||||
@alias(filter_grad, out1)
|
||||
|
||||
const int batch_size = input_shape0;
|
||||
const int input_channels = input_shape1;
|
||||
const int input_height = input_shape2;
|
||||
const int input_width = input_shape3;
|
||||
const int output_channels = output_grad_shape1;
|
||||
const int output_height = output_grad_shape2;
|
||||
const int output_width = output_grad_shape3;
|
||||
const int ksize_height = {Kh};
|
||||
const int ksize_width = {Kw};
|
||||
const int stride_height = {self.stride[0]};
|
||||
const int stride_width = {self.stride[1]};
|
||||
const int padding_height = {self.padding[0]};
|
||||
const int padding_width = {self.padding[1]};
|
||||
const int dilate_height = {self.dilation[0]};
|
||||
const int dilate_width = {self.dilation[1]};
|
||||
|
||||
int thread = 512;
|
||||
if (input_width > 1024 && input_width <= 2048)
|
||||
thread = (input_width - 1) / 2 + 1;
|
||||
else if (input_width > 512 && input_width <= 1024)
|
||||
thread = input_width;
|
||||
int blocks = std::min(std::max(thread / input_width, 1), input_height);
|
||||
dim3 threads(std::min(input_width, thread), blocks, 1);
|
||||
dim3 grid(input_channels, batch_size, 1);
|
||||
KernelDepthwiseConvInputGradCFilter<
|
||||
input_type, ksize_height, ksize_width
|
||||
, stride_height, stride_width>
|
||||
<<<grid, threads, 0>>>(
|
||||
input_p, output_grad_p, filter_p, batch_size,
|
||||
output_channels, output_height, output_width, input_channels,
|
||||
input_height, input_width, padding_height,
|
||||
padding_width, dilate_height, dilate_width, input_grad_p);
|
||||
|
||||
// source for backward to filter
|
||||
|
||||
int block_size = 512;
|
||||
if (output_width > 1024 && output_width <= 2048)
|
||||
block_size = (output_width - 1) / 2 + 1;
|
||||
else if (output_width > 512 && output_width <= 1024)
|
||||
block_size = output_width;
|
||||
int crop_output_height =
|
||||
std::min(std::max(block_size / output_width, 1), output_height);
|
||||
|
||||
grid = dim3(ksize_width, ksize_height, output_channels);
|
||||
threads = dim3(std::min(output_width, block_size), crop_output_height, 1);
|
||||
cudaMemsetAsync(filter_grad_p, 0, filter_grad->size);
|
||||
|
||||
KernelDepthwiseConvFilterGrad<
|
||||
input_type><<<grid, threads, 0>>>(
|
||||
output_grad_p, input_p, batch_size, output_channels,
|
||||
output_height, output_width, input_channels, input_height,
|
||||
input_width, ksize_height, ksize_width,
|
||||
stride_height, stride_width, padding_height, padding_width,
|
||||
dilate_height, dilate_width, filter_grad_p);
|
||||
"""
|
||||
)
|
|
@ -12,6 +12,35 @@ import numpy as np
|
|||
import math
|
||||
from collections.abc import Sequence,Iterable
|
||||
|
||||
def __copy__(x):
|
||||
return x.copy().detach()
|
||||
jt.Var.__copy__ = __copy__
|
||||
|
||||
def __deepcopy__(x,memo):
|
||||
result = x.copy().detach()
|
||||
memo[id(x)]=result
|
||||
return result
|
||||
jt.Var.__deepcopy__ = __deepcopy__
|
||||
|
||||
def __len__(x):
|
||||
return x.shape[0]
|
||||
jt.Var.__len__ = __len__
|
||||
|
||||
def __iter__(x):
|
||||
result = []
|
||||
for i in range(x.shape[0]):
|
||||
result.append(x[i])
|
||||
return result.__iter__()
|
||||
jt.Var.__iter__ = __iter__
|
||||
|
||||
def all(x, dim=[]):
|
||||
return x.all_(dim).bool()
|
||||
jt.Var.all = all
|
||||
|
||||
def any(x,dim):
|
||||
return x.any_(dim).bool()
|
||||
jt.Var.any = any
|
||||
|
||||
|
||||
def repeat(x, *shape):
|
||||
r'''
|
||||
|
@ -47,10 +76,24 @@ def repeat(x, *shape):
|
|||
x = x.broadcast(x_shape)
|
||||
elif len_x_shape > len_shape:
|
||||
rep_shape = (len_x_shape - len_shape) * [1] + shape
|
||||
|
||||
reshape_shape = []
|
||||
broadcast_shape = []
|
||||
for x_s,r_s in zip(x_shape,rep_shape):
|
||||
reshape_shape.append(1)
|
||||
reshape_shape.append(x_s)
|
||||
|
||||
broadcast_shape.append(r_s)
|
||||
broadcast_shape.append(1)
|
||||
|
||||
x = x.reshape(reshape_shape)
|
||||
x = x.broadcast(broadcast_shape)
|
||||
|
||||
tar_shape = (np.array(x_shape) * np.array(rep_shape)).tolist()
|
||||
dims = []
|
||||
for i in range(len(tar_shape)): dims.append(f"i{i}%{x_shape[i]}")
|
||||
return x.reindex(tar_shape, dims)
|
||||
|
||||
x = x.reshape(tar_shape)
|
||||
return x
|
||||
|
||||
jt.Var.repeat = repeat
|
||||
|
||||
def chunk(x, chunks, dim=0):
|
||||
|
@ -166,15 +209,18 @@ def flip(x, dim=0):
|
|||
>>> x.flip(1)
|
||||
[[4 3 2 1]]
|
||||
'''
|
||||
assert isinstance(dim, int)
|
||||
if dim<0:
|
||||
dim+=x.ndim
|
||||
assert dim>=0 and dim<len(x.shape)
|
||||
if isinstance(dim, int):
|
||||
dim = [dim]
|
||||
for i in range(len(dim)):
|
||||
if dim[i]<0:
|
||||
dim[i] += x.ndim
|
||||
assert dim[i]>=0 and dim[i]<x.ndim
|
||||
dim = set(dim)
|
||||
|
||||
tar_dims = []
|
||||
for i in range(len(x.shape)):
|
||||
if i == dim:
|
||||
tar_dims.append(f"{x.shape[dim]-1}-i{i}")
|
||||
if i in dim:
|
||||
tar_dims.append(f"xshape{i}-1-i{i}")
|
||||
else:
|
||||
tar_dims.append(f"i{i}")
|
||||
return x.reindex(x.shape, tar_dims)
|
||||
|
@ -302,6 +348,25 @@ def make_grid(x, nrow=8, padding=2, normalize=False, range=None, scale_each=Fals
|
|||
[f"i1/{padding+h}*{nrow}+i2/{padding+w}", "i0",
|
||||
f"i1-i1/{padding+h}*{padding+h}-{padding}", f"i2-i2/{padding+w}*{padding+w}-{padding}"], overflow_value=pad_value)
|
||||
|
||||
def save_image(
|
||||
x,
|
||||
filepath,
|
||||
nrow: int = 8,
|
||||
padding: int = 2,
|
||||
normalize: bool = False,
|
||||
range = None,
|
||||
scale_each = False,
|
||||
pad_value = 0,
|
||||
format = None
|
||||
):
|
||||
from PIL import Image
|
||||
grid = make_grid(x, nrow=nrow, padding=padding, pad_value=pad_value,
|
||||
normalize=normalize, range=range, scale_each=scale_each)
|
||||
|
||||
ndarr = (grid*255+0.5).clamp(0, 255).permute(1, 2, 0).uint8().numpy()
|
||||
im = Image.fromarray(ndarr)
|
||||
im.save(filepath, format=format)
|
||||
|
||||
|
||||
def _ntuple(n):
|
||||
def parse(x):
|
||||
|
@ -326,9 +391,8 @@ def unique(x):
|
|||
'''
|
||||
x = x.reshape(-1)
|
||||
_,x = jt.argsort(x)
|
||||
index2 = [i for i in range(1,x.shape[0])]
|
||||
index1 = [i for i in range(x.shape[0]-1)]
|
||||
y = x[1:][x[index2] != x[index1]]
|
||||
index,= jt.index((x.shape[0],))
|
||||
y = x[1:][x[index[1:]] != x[index[:-1]]]
|
||||
x = jt.contrib.concat([x[:1],y],dim=0)
|
||||
return x
|
||||
|
||||
|
@ -401,12 +465,6 @@ def log2(x):
|
|||
|
||||
jt.Var.log2 = log2
|
||||
|
||||
def item(x):
|
||||
assert x.ndim==1 and x.shape[0]==1
|
||||
return x.numpy().item()
|
||||
|
||||
jt.Var.item = item
|
||||
|
||||
def meshgrid(*tensors):
|
||||
r'''
|
||||
Take N tensors, each of which can be 1-dimensional vector, and create N n-dimensional grids,
|
||||
|
|
|
@ -61,6 +61,7 @@ class AlexNet(nn.Module):
|
|||
x = self.classifier(x)
|
||||
return x
|
||||
|
||||
def alexnet(**kwargs):
|
||||
def alexnet(pretrained=False, **kwargs):
|
||||
model = AlexNet(**kwargs)
|
||||
if pretrained: model.load("jittorhub://alexnet.pkl")
|
||||
return model
|
||||
|
|
|
@ -21,7 +21,7 @@ def densenet121(pretrained=False, **kwargs):
|
|||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
'''
|
||||
model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16), **kwargs)
|
||||
assert not pretrained, "pretrained doesn't support now"
|
||||
if pretrained: model.load("jittorhub://densenet121.pkl")
|
||||
return model
|
||||
|
||||
def densenet161(pretrained=False, **kwargs):
|
||||
|
@ -32,7 +32,7 @@ def densenet161(pretrained=False, **kwargs):
|
|||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
'''
|
||||
model = DenseNet(num_init_features=96, growth_rate=48, block_config=(6, 12, 36, 24), **kwargs)
|
||||
assert not pretrained, "pretrained doesn't support now"
|
||||
if pretrained: model.load("jittorhub://densenet161.pkl")
|
||||
return model
|
||||
|
||||
def densenet169(pretrained=False, **kwargs):
|
||||
|
@ -43,7 +43,7 @@ def densenet169(pretrained=False, **kwargs):
|
|||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
'''
|
||||
model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32), **kwargs)
|
||||
assert not pretrained, "pretrained doesn't support now"
|
||||
if pretrained: model.load("jittorhub://densenet169.pkl")
|
||||
return model
|
||||
|
||||
def densenet201(pretrained=False, **kwargs):
|
||||
|
@ -54,7 +54,7 @@ def densenet201(pretrained=False, **kwargs):
|
|||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
'''
|
||||
model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32), **kwargs)
|
||||
assert not pretrained, "pretrained doesn't support now"
|
||||
if pretrained: model.load("jittorhub://densenet201.pkl")
|
||||
return model
|
||||
|
||||
|
||||
|
|
|
@ -12,8 +12,10 @@ from jittor import nn
|
|||
|
||||
__all__ = ['GoogLeNet', 'googlenet']
|
||||
|
||||
def googlenet(**kwargs):
|
||||
return GoogLeNet(**kwargs)
|
||||
def googlenet(pretrained=False, **kwargs):
|
||||
model = GoogLeNet(**kwargs)
|
||||
if pretrained: model.load("jittorhub://googlenet.pkl")
|
||||
return model
|
||||
|
||||
class GoogLeNet(nn.Module):
|
||||
""" GoogLeNet model architecture.
|
||||
|
|
|
@ -4,7 +4,9 @@ from jittor import nn
|
|||
__all__ = ['Inception3', 'inception_v3']
|
||||
|
||||
def inception_v3(pretrained=False, progress=True, **kwargs):
|
||||
return Inception3(**kwargs)
|
||||
model = Inception3(**kwargs)
|
||||
if pretrained: model.load("jittorhub://inception_v3.pkl")
|
||||
return model
|
||||
|
||||
class Inception3(nn.Module):
|
||||
""" Inceptionv3 model architecture.
|
||||
|
|
|
@ -90,18 +90,22 @@ class MNASNet(nn.Module):
|
|||
x = x.mean([2, 3])
|
||||
return self.classifier(x)
|
||||
|
||||
def mnasnet0_5(**kwargs):
|
||||
def mnasnet0_5(pretrained=False, **kwargs):
|
||||
model = MNASNet(0.5, **kwargs)
|
||||
if pretrained: model.load("jittorhub://mnasnet0_5.pkl")
|
||||
return model
|
||||
|
||||
def mnasnet0_75(**kwargs):
|
||||
def mnasnet0_75(pretrained=False, **kwargs):
|
||||
model = MNASNet(0.75, **kwargs)
|
||||
if pretrained: model.load("jittorhub://mnasnet0_75.pkl")
|
||||
return model
|
||||
|
||||
def mnasnet1_0(**kwargs):
|
||||
def mnasnet1_0(pretrained=False, **kwargs):
|
||||
model = MNASNet(1.0, **kwargs)
|
||||
if pretrained: model.load("jittorhub://mnasnet1_0.pkl")
|
||||
return model
|
||||
|
||||
def mnasnet1_3(**kwargs):
|
||||
def mnasnet1_3(pretrained=False, **kwargs):
|
||||
model = MNASNet(1.3, **kwargs)
|
||||
if pretrained: model.load("jittorhub://mnasnet1_3.pkl")
|
||||
return model
|
||||
|
|
|
@ -93,7 +93,8 @@ class MobileNetV2(nn.Module):
|
|||
def execute(self, x):
|
||||
return self._forward_impl(x)
|
||||
|
||||
def mobilenet_v2():
|
||||
def mobilenet_v2(pretrained=False):
|
||||
model = MobileNetV2()
|
||||
if pretrained: model.load("jittorhub://mobilenet_v2.pkl")
|
||||
return model
|
||||
|
||||
|
|
|
@ -175,10 +175,10 @@ class Res2Net(Module):
|
|||
x = self.layer4(x)
|
||||
return x, low_level_feat
|
||||
|
||||
def res2net50(output_stride):
|
||||
def res2net50(output_stride=16):
|
||||
model = Res2Net(Bottle2neck, [3,4,6,3], output_stride)
|
||||
return model
|
||||
|
||||
def res2net101(output_stride):
|
||||
def res2net101(output_stride=16):
|
||||
model = Res2Net(Bottle2neck, [3,4,23,3], output_stride)
|
||||
return model
|
||||
|
|
|
@ -143,7 +143,7 @@ class ResNet(nn.Module):
|
|||
x = self.layer3(x)
|
||||
x = self.layer4(x)
|
||||
x = self.avgpool(x)
|
||||
x = jt.reshape(x, (x.shape[0], (- 1)))
|
||||
x = jt.reshape(x, (x.shape[0], -1))
|
||||
x = self.fc(x)
|
||||
return x
|
||||
|
||||
|
@ -154,19 +154,26 @@ def _resnet(block, layers, **kwargs):
|
|||
model = ResNet(block, layers, **kwargs)
|
||||
return model
|
||||
|
||||
def Resnet18(**kwargs):
|
||||
return _resnet(BasicBlock, [2, 2, 2, 2], **kwargs)
|
||||
def Resnet18(pretrained=False, **kwargs):
|
||||
model = _resnet(BasicBlock, [2, 2, 2, 2], **kwargs)
|
||||
if pretrained: model.load("jittorhub://resnet18.pkl")
|
||||
return model
|
||||
resnet18 = Resnet18
|
||||
|
||||
def Resnet34(**kwargs):
|
||||
return _resnet( BasicBlock, [3, 4, 6, 3], **kwargs)
|
||||
def Resnet34(pretrained=False, **kwargs):
|
||||
model = _resnet(BasicBlock, [3, 4, 6, 3], **kwargs)
|
||||
if pretrained: model.load("jittorhub://resnet34.pkl")
|
||||
return model
|
||||
resnet34 = Resnet34
|
||||
|
||||
def Resnet50(**kwargs):
|
||||
return _resnet(Bottleneck, [3, 4, 6, 3], **kwargs)
|
||||
def Resnet50(pretrained=False, **kwargs):
|
||||
model = _resnet(Bottleneck, [3, 4, 6, 3], **kwargs)
|
||||
if pretrained: model.load("jittorhub://resnet50.pkl")
|
||||
return model
|
||||
|
||||
resnet50 = Resnet50
|
||||
|
||||
def Resnet101(**kwargs):
|
||||
def Resnet101(pretrained=False, **kwargs):
|
||||
"""
|
||||
ResNet-101 model architecture.
|
||||
|
||||
|
@ -180,28 +187,38 @@ def Resnet101(**kwargs):
|
|||
return _resnet(Bottleneck, [3, 4, 23, 3], **kwargs)
|
||||
resnet101 = Resnet101
|
||||
|
||||
def Resnet152(**kwargs):
|
||||
return _resnet(Bottleneck, [3, 8, 36, 3], **kwargs)
|
||||
def Resnet152(pretrained=False, **kwargs):
|
||||
model = _resnet(Bottleneck, [3, 8, 36, 3], **kwargs)
|
||||
if pretrained: model.load("jittorhub://resnet152.pkl")
|
||||
return model
|
||||
resnet152 = Resnet152
|
||||
|
||||
def Resnext50_32x4d(**kwargs):
|
||||
def Resnext50_32x4d(pretrained=False, **kwargs):
|
||||
kwargs['groups'] = 32
|
||||
kwargs['width_per_group'] = 4
|
||||
return _resnet(Bottleneck, [3, 4, 6, 3], **kwargs)
|
||||
model = _resnet(Bottleneck, [3, 4, 6, 3], **kwargs)
|
||||
if pretrained: model.load("jittorhub://resnext50_32x4d.pkl")
|
||||
return model
|
||||
resnext50_32x4d = Resnext50_32x4d
|
||||
|
||||
def Resnext101_32x8d(**kwargs):
|
||||
def Resnext101_32x8d(pretrained=False, **kwargs):
|
||||
kwargs['groups'] = 32
|
||||
kwargs['width_per_group'] = 8
|
||||
return _resnet(Bottleneck, [3, 4, 23, 3], **kwargs)
|
||||
model = _resnet(Bottleneck, [3, 4, 23, 3], **kwargs)
|
||||
if pretrained: model.load("jittorhub://resnext101_32x8d.pkl")
|
||||
return model
|
||||
resnext101_32x8d = Resnext101_32x8d
|
||||
|
||||
def Wide_resnet50_2(**kwargs):
|
||||
def Wide_resnet50_2(pretrained=False, **kwargs):
|
||||
kwargs['width_per_group'] = (64 * 2)
|
||||
return _resnet(Bottleneck, [3, 4, 6, 3], **kwargs)
|
||||
model = _resnet(Bottleneck, [3, 4, 6, 3], **kwargs)
|
||||
if pretrained: model.load("jittorhub://wide_resnet50_2.pkl")
|
||||
return model
|
||||
wide_resnet50_2 = Wide_resnet50_2
|
||||
|
||||
def Wide_resnet101_2(**kwargs):
|
||||
def Wide_resnet101_2(pretrained=False, **kwargs):
|
||||
kwargs['width_per_group'] = (64 * 2)
|
||||
return _resnet(Bottleneck, [3, 4, 23, 3], **kwargs)
|
||||
model = _resnet(Bottleneck, [3, 4, 23, 3], **kwargs)
|
||||
if pretrained: model.load("jittorhub://wide_resnet101_2.pkl")
|
||||
return model
|
||||
wide_resnet101_2 = Wide_resnet101_2
|
|
@ -93,14 +93,22 @@ def _shufflenetv2(arch, *args):
|
|||
model = ShuffleNetV2(*args)
|
||||
return model
|
||||
|
||||
def shufflenet_v2_x0_5():
|
||||
return _shufflenetv2('shufflenetv2_x0.5', [4, 8, 4], [24, 48, 96, 192, 1024])
|
||||
def shufflenet_v2_x0_5(pretrained=False):
|
||||
model = _shufflenetv2('shufflenetv2_x0.5', [4, 8, 4], [24, 48, 96, 192, 1024])
|
||||
if pretrained: model.load("jittorhub://shufflenet_v2_x0_5.pkl")
|
||||
return model
|
||||
|
||||
def shufflenet_v2_x1_0():
|
||||
return _shufflenetv2('shufflenetv2_x1.0', [4, 8, 4], [24, 116, 232, 464, 1024])
|
||||
def shufflenet_v2_x1_0(pretrained=False):
|
||||
model = _shufflenetv2('shufflenetv2_x1.0', [4, 8, 4], [24, 116, 232, 464, 1024])
|
||||
if pretrained: model.load("jittorhub://shufflenet_v2_x1_0.pkl")
|
||||
return model
|
||||
|
||||
def shufflenet_v2_x1_5():
|
||||
return _shufflenetv2('shufflenetv2_x1.5', [4, 8, 4], [24, 176, 352, 704, 1024])
|
||||
def shufflenet_v2_x1_5(pretrained=False):
|
||||
model = _shufflenetv2('shufflenetv2_x1.5', [4, 8, 4], [24, 176, 352, 704, 1024])
|
||||
if pretrained: model.load("jittorhub://shufflenet_v2_x1_5.pkl")
|
||||
return model
|
||||
|
||||
def shufflenet_v2_x2_0():
|
||||
return _shufflenetv2('shufflenetv2_x2.0', [4, 8, 4], [24, 244, 488, 976, 2048])
|
||||
def shufflenet_v2_x2_0(pretrained=False):
|
||||
model = _shufflenetv2('shufflenetv2_x2.0', [4, 8, 4], [24, 244, 488, 976, 2048])
|
||||
if pretrained: model.load("jittorhub://shufflenet_v2_x2_0.pkl")
|
||||
return model
|
||||
|
|
|
@ -83,8 +83,12 @@ def _squeezenet(version, **kwargs):
|
|||
model = SqueezeNet(version, **kwargs)
|
||||
return model
|
||||
|
||||
def squeezenet1_0(**kwargs):
|
||||
return _squeezenet('1_0', **kwargs)
|
||||
def squeezenet1_0(pretrained=False, **kwargs):
|
||||
model = _squeezenet('1_0', **kwargs)
|
||||
if pretrained: model.load("jittorhub://squeezenet1_0.pkl")
|
||||
return model
|
||||
|
||||
def squeezenet1_1(**kwargs):
|
||||
return _squeezenet('1_1', **kwargs)
|
||||
def squeezenet1_1(pretrained=False, **kwargs):
|
||||
model = _squeezenet('1_1', **kwargs)
|
||||
if pretrained: model.load("jittorhub://squeezenet1_1.pkl")
|
||||
return model
|
||||
|
|
|
@ -67,33 +67,49 @@ def _vgg(arch, cfg, batch_norm, **kwargs):
|
|||
return model
|
||||
|
||||
|
||||
def vgg11(**kwargs):
|
||||
return _vgg('vgg11', 'A', False, **kwargs)
|
||||
def vgg11(pretrained=False, **kwargs):
|
||||
model = _vgg('vgg11', 'A', False, **kwargs)
|
||||
if pretrained: model.load("jittorhub://vgg11.pkl")
|
||||
return model
|
||||
|
||||
|
||||
def vgg11_bn(**kwargs):
|
||||
return _vgg('vgg11_bn', 'A', True, **kwargs)
|
||||
def vgg11_bn(pretrained=False, **kwargs):
|
||||
model = _vgg('vgg11_bn', 'A', True, **kwargs)
|
||||
if pretrained: model.load("jittorhub://vgg11_bn.pkl")
|
||||
return model
|
||||
|
||||
|
||||
def vgg13(**kwargs):
|
||||
return _vgg('vgg13', 'B', False, **kwargs)
|
||||
def vgg13(pretrained=False, **kwargs):
|
||||
model = _vgg('vgg13', 'B', False, **kwargs)
|
||||
if pretrained: model.load("jittorhub://vgg13.pkl")
|
||||
return model
|
||||
|
||||
|
||||
def vgg13_bn(**kwargs):
|
||||
return _vgg('vgg13_bn', 'B', True, **kwargs)
|
||||
def vgg13_bn(pretrained=False, **kwargs):
|
||||
model = _vgg('vgg13_bn', 'B', True, **kwargs)
|
||||
if pretrained: model.load("jittorhub://vgg13_bn.pkl")
|
||||
return model
|
||||
|
||||
|
||||
def vgg16(**kwargs):
|
||||
return _vgg('vgg16', 'D', False, **kwargs)
|
||||
def vgg16(pretrained=False, **kwargs):
|
||||
model = _vgg('vgg16', 'D', False, **kwargs)
|
||||
if pretrained: model.load("jittorhub://vgg16.pkl")
|
||||
return model
|
||||
|
||||
|
||||
def vgg16_bn(**kwargs):
|
||||
return _vgg('vgg16_bn', 'D', True, **kwargs)
|
||||
def vgg16_bn(pretrained=False, **kwargs):
|
||||
model = _vgg('vgg16_bn', 'D', True, **kwargs)
|
||||
if pretrained: model.load("jittorhub://vgg16_bn.pkl")
|
||||
return model
|
||||
|
||||
|
||||
def vgg19(**kwargs):
|
||||
return _vgg('vgg19', 'E', False, **kwargs)
|
||||
def vgg19(pretrained=False, **kwargs):
|
||||
model = _vgg('vgg19', 'E', False, **kwargs)
|
||||
if pretrained: model.load("jittorhub://vgg19.pkl")
|
||||
return model
|
||||
|
||||
|
||||
def vgg19_bn(**kwargs):
|
||||
return _vgg('vgg19_bn', 'E', True, **kwargs)
|
||||
def vgg19_bn(pretrained=False, **kwargs):
|
||||
model = _vgg('vgg19_bn', 'E', True, **kwargs)
|
||||
if pretrained: model.load("jittorhub://vgg19_bn.pkl")
|
||||
return model
|
|
@ -151,9 +151,9 @@ jt.Var.__imatmul__ = lambda a,b: a.assign(matmul(a,b))
|
|||
def get_init_var_rand(shape, dtype):
|
||||
return jt.array(np.random.normal(0.0, 1.0, shape).astype(np.float32))
|
||||
|
||||
def relu(x): return jt.maximum(x, 0)
|
||||
def relu(x): return jt.ternary((x>0.0), x, jt.broadcast_var(0.0, x))
|
||||
def leaky_relu(x, scale=0.01): return jt.ternary(x>0, x, x*scale)
|
||||
def relu6(x): return jt.minimum(jt.maximum(x, 0), 6)
|
||||
def relu6(x): return jt.minimum(jt.maximum(x, 0.0), 6.0)
|
||||
def sign(x):
|
||||
one = jt.ones(x.shape)
|
||||
x = jt.ternary(x>0, one, x)
|
||||
|
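A quick numeric check of the reworked activations above (`relu` via `ternary`, float literals in `relu6`), using only functions defined in this file:

```python
# Expected outputs follow directly from the formulas above.
import jittor as jt
from jittor import nn

x = jt.array([-2.0, -0.5, 0.0, 0.5, 2.0])
print(nn.relu(x).data)          # [0.  0.  0.  0.5 2. ]
print(nn.leaky_relu(x).data)    # [-0.02  -0.005  0.  0.5  2. ]
print(nn.relu6(x * 4).data)     # [0. 0. 0. 2. 6.]
```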
@ -264,17 +264,29 @@ class L1Loss(Module):
|
|||
def execute(self, output, target):
|
||||
return l1_loss(output, target)
|
||||
|
||||
class BCEWithLogitsLoss(Module):
|
||||
def __init__(self, weight=None, size_average=True):
|
||||
self.sigmoid = Sigmoid()
|
||||
self.bce = BCELoss(weight, size_average)
|
||||
def execute(self, output, target):
|
||||
output = self.sigmoid(output)
|
||||
output = self.bce(output, target)
|
||||
return output
|
||||
def binary_cross_entropy_with_logits(output, target, weight=None, pos_weight=None, size_average=True):
|
||||
max_val = jt.clamp(-output,min_v=0)
|
||||
if pos_weight is not None:
|
||||
log_weight = (pos_weight-1)*target + 1
|
||||
loss = (1-target)*output+(log_weight*(((-max_val).exp()+(-output - max_val).exp()).log()+max_val))
|
||||
else:
|
||||
loss = (1-target)*output+max_val+((-max_val).exp()+(-output -max_val).exp()).log()
|
||||
if weight is not None:
|
||||
loss *=weight
|
||||
|
||||
def binary_cross_entropy_with_logits(input, target, weight=None, size_average=True):
|
||||
return BCEWithLogitsLoss(weight, size_average)(input, target)
|
||||
if size_average:
|
||||
return loss.mean()
|
||||
else:
|
||||
return loss.sum()
|
||||
|
||||
class BCEWithLogitsLoss(Module):
|
||||
def __init__(self, weight=None, pos_weight=None, size_average=True):
|
||||
self.pos_weight = pos_weight
|
||||
self.weight = weight
|
||||
self.size_average = size_average
|
||||
|
||||
def execute(self, output, target):
|
||||
return binary_cross_entropy_with_logits(output,target,self.weight,self.pos_weight,self.size_average)
|
||||
|
||||
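The rewritten loss never materializes `sigmoid(output)`; it uses the log-sum-exp rearrangement `(1-t)*x + m + log(exp(-m) + exp(-x-m))` with `m = max(-x, 0)`, which stays finite for large logits. A NumPy-only sketch of the same identity:

```python
# Reference check of the stable BCE-with-logits formula used above.
import numpy as np

def bce_with_logits(x, t):
    m = np.maximum(-x, 0)                                    # max_val above
    return ((1 - t) * x + m + np.log(np.exp(-m) + np.exp(-x - m))).mean()

def bce_naive(x, t):
    p = 1 / (1 + np.exp(-x))
    return -(t * np.log(p) + (1 - t) * np.log(1 - p)).mean()

x = np.array([-3.0, 0.0, 4.0]); t = np.array([1.0, 0.0, 1.0])
print(bce_with_logits(x, t), bce_naive(x, t))                # agree for moderate logits
print(bce_with_logits(np.array([1000.0]), np.array([0.0])))  # 1000.0, no overflow
```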
def softmax(x, dim = None):
|
||||
if dim is None:
|
||||
|
@ -340,82 +352,39 @@ class BatchNorm(Module):
|
|||
self.eps = eps
|
||||
self.momentum = momentum
|
||||
self.affine = affine
|
||||
if affine:
|
||||
self.weight = init.constant((num_features,), "float32", 1.0)
|
||||
self.bias = init.constant((num_features,), "float32", 0.0)
|
||||
self.weight = init.constant((num_features,), "float32", 1.0) if affine else 1.0
|
||||
self.bias = init.constant((num_features,), "float32", 0.0) if affine else 0.0
|
||||
self.running_mean = init.constant((num_features,), "float32", 0.0).stop_grad()
|
||||
self.running_var = init.constant((num_features,), "float32", 1.0).stop_grad()
|
||||
|
||||
def execute(self, x):
|
||||
dims = [0]+list(range(2,x.ndim))
|
||||
if self.is_train:
|
||||
xmean = jt.mean(x, dims=[0,2,3], keepdims=1)
|
||||
x2mean = jt.mean(x*x, dims=[0,2,3], keepdims=1)
|
||||
xmean = jt.mean(x, dims=dims)
|
||||
x2mean = jt.mean(x*x, dims=dims)
|
||||
if self.sync and jt.in_mpi:
|
||||
xmean = xmean.mpi_all_reduce("mean")
|
||||
x2mean = x2mean.mpi_all_reduce("mean")
|
||||
|
||||
xvar = x2mean-xmean*xmean
|
||||
norm_x = (x-xmean)/jt.sqrt(xvar+self.eps)
|
||||
xvar = (x2mean-xmean*xmean).maximum(0.0)
|
||||
w = self.weight / jt.sqrt(xvar+self.eps)
|
||||
b = self.bias - xmean * w
|
||||
norm_x = x * w.broadcast(x, dims) + b.broadcast(x, dims)
|
||||
|
||||
self.running_mean.update(self.running_mean +
|
||||
(xmean.reshape((-1,)) - self.running_mean) * self.momentum)
|
||||
self.running_var.update(self.running_var +
|
||||
(xvar.reshape((-1,))-self.running_var)*self.momentum)
|
||||
else:
|
||||
running_mean = self.running_mean.broadcast(x, [0,2,3])
|
||||
running_var = self.running_var.broadcast(x, [0,2,3])
|
||||
norm_x = (x-running_mean)/jt.sqrt(running_var+self.eps)
|
||||
if not self.affine:
|
||||
return norm_x
|
||||
w = self.weight.broadcast(x, [0,2,3])
|
||||
b = self.bias.broadcast(x, [0,2,3])
|
||||
return norm_x * w + b
|
||||
|
||||
BatchNorm2d = BatchNorm
|
||||
|
||||
class BatchNorm1d(Module):
|
||||
def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, is_train=True, sync=True):
|
||||
self.sync = sync
|
||||
self.num_features = num_features
|
||||
self.is_train = is_train
|
||||
self.eps = eps
|
||||
self.momentum = momentum
|
||||
self.affine = affine
|
||||
if affine:
|
||||
self.weight = init.constant((num_features,), "float32", 1.0)
|
||||
self.bias = init.constant((num_features,), "float32", 0.0)
|
||||
self.running_mean = init.constant((num_features,), "float32", 0.0).stop_grad()
|
||||
self.running_var = init.constant((num_features,), "float32", 1.0).stop_grad()
|
||||
|
||||
def execute(self, x):
|
||||
if len(x.shape) == 3:
|
||||
dims = [0, 2]
|
||||
else:
|
||||
dims = [0]
|
||||
if self.is_train:
|
||||
xmean = jt.mean(x, dims=dims, keepdims=1)
|
||||
x2mean = jt.mean(x*x, dims=dims, keepdims=1)
|
||||
|
||||
if self.sync and jt.in_mpi:
|
||||
xmean = xmean.mpi_all_reduce("mean")
|
||||
x2mean = x2mean.mpi_all_reduce("mean")
|
||||
|
||||
xvar = x2mean-xmean*xmean
|
||||
norm_x = (x-xmean)/jt.sqrt(xvar+self.eps)
|
||||
self.running_mean.update(self.running_mean +
|
||||
(xmean.sum(dims)-self.running_mean)*self.momentum)
|
||||
self.running_var.update(self.running_var +
|
||||
(xvar.sum(dims)-self.running_var)*self.momentum)
|
||||
else:
|
||||
running_mean = self.running_mean.broadcast(x, dims)
|
||||
running_var = self.running_var.broadcast(x, dims)
|
||||
norm_x = (x-running_mean)/jt.sqrt(running_var+self.eps)
|
||||
if not self.affine:
|
||||
w = self.weight / jt.sqrt(self.running_var+self.eps)
|
||||
b = self.bias - self.running_mean * w
|
||||
norm_x = x * w.broadcast(x, dims) + b.broadcast(x, dims)
|
||||
return norm_x
|
||||
w = self.weight.broadcast(x, dims)
|
||||
b = self.bias.broadcast(x, dims)
|
||||
return norm_x * w + b
|
||||
|
||||
class InstanceNorm2d(Module):
|
||||
BatchNorm2d = BatchNorm1d = BatchNorm
|
||||
|
||||
class InstanceNorm(Module):
|
||||
def __init__(self, num_features, eps=1e-05, momentum=0.1, affine=True, is_train=True, sync=True):
|
||||
self.sync = sync
|
||||
self.num_features = num_features
|
||||
|
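The normalization layers above are refactored to fold the affine transform into one scale and shift: with `w = weight / sqrt(var + eps)` and `b = bias - mean * w`, the expression `x * w + b` equals the classic `(x - mean) / sqrt(var + eps) * weight + bias`. A NumPy check of that identity:

```python
# The fused scale/shift used by the rewritten norm layers is an exact rewrite.
import numpy as np

x = np.random.rand(8, 4).astype("float32")
mean, var = x.mean(0), x.var(0)
weight, bias, eps = np.random.rand(4), np.random.rand(4), 1e-5

classic = (x - mean) / np.sqrt(var + eps) * weight + bias
w = weight / np.sqrt(var + eps)
b = bias - mean * w
print(np.abs(classic - (x * w + b)).max())   # ~1e-7: identical up to rounding
```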
@ -424,47 +393,43 @@ class InstanceNorm2d(Module):
|
|||
self.momentum = momentum
|
||||
|
||||
self.affine = affine
|
||||
if self.affine:
|
||||
self.weight = init.constant((num_features,), "float32", 1.0)
|
||||
self.bias = init.constant((num_features,), "float32", 0.0)
|
||||
self.weight = init.constant((num_features,), "float32", 1.0) if affine else 1.0
|
||||
self.bias = init.constant((num_features,), "float32", 0.0) if affine else 0.0
|
||||
|
||||
def execute(self, x):
|
||||
xmean = jt.mean(x, dims=[2,3], keepdims=1)
|
||||
x2mean = jt.mean(x*x, dims=[2,3], keepdims=1)
|
||||
if self.sync and jt.in_mpi:
|
||||
xmean = xmean.mpi_all_reduce("mean")
|
||||
x2mean = x2mean.mpi_all_reduce("mean")
|
||||
dims = list(range(2,x.ndim))
|
||||
xmean = jt.mean(x, dims=dims)
|
||||
x2mean = jt.mean(x*x, dims=dims)
|
||||
|
||||
xvar = jt.maximum(x2mean-xmean*xmean, 0)
|
||||
norm_x = (x-xmean)/jt.sqrt(xvar+self.eps)
|
||||
if not self.affine:
|
||||
return norm_x
|
||||
w = self.weight.broadcast(x, [0,2,3])
|
||||
b = self.bias.broadcast(x, [0,2,3])
|
||||
return norm_x * w + b
|
||||
xvar = (x2mean-xmean*xmean).maximum(0.0)
|
||||
w = self.weight / jt.sqrt(xvar+self.eps)
|
||||
b = self.bias - xmean * w
|
||||
return x * w.broadcast(x, dims) + b.broadcast(x, dims)
|
||||
|
||||
InstanceNorm2d = InstanceNorm1d = InstanceNorm
|
||||
|
||||
class LayerNorm(Module):
|
||||
def __init__(self, normalized_shape, eps: float = 1e-5, elementwise_affine: bool = True) -> None:
|
||||
super(LayerNorm, self).__init__()
|
||||
if isinstance(normalized_shape, int):
|
||||
normalized_shape = (normalized_shape,)
|
||||
self.normalized_shape = tuple(normalized_shape)
|
||||
self.eps = eps
|
||||
self.elementwise_affine = elementwise_affine
|
||||
if self.elementwise_affine:
|
||||
self.weight = init.constant(normalized_shape, "float32", 1.0)
|
||||
self.bias = init.constant(normalized_shape, "float32", 0.0)
|
||||
self.weight = init.constant(normalized_shape, "float32", 1.0) if elementwise_affine else 1.0
|
||||
self.bias = init.constant(normalized_shape, "float32", 0.0) if elementwise_affine else 0.0
|
||||
|
||||
def execute(self,x):
|
||||
def execute(self, x):
|
||||
dims = [-i for i in range(len(self.normalized_shape), 0, -1)]
|
||||
mean = jt.mean(x,dims=dims,keepdims=1)
|
||||
numerator = x-mean
|
||||
variance = jt.mean(numerator.sqr(),dims=dims,keepdims=1)
|
||||
denominator = jt.sqrt(variance+self.eps)
|
||||
norm_x = numerator/denominator
|
||||
if self.elementwise_affine:
|
||||
norm_x = norm_x * self.weight+self.bias
|
||||
return norm_x
|
||||
xmean = jt.mean(x, dims=dims, keepdims=1)
|
||||
x2mean = jt.mean(x*x, dims=dims, keepdims=1)
|
||||
|
||||
xvar = (x2mean-xmean*xmean).maximum(0.0)
|
||||
w = self.weight / jt.sqrt(xvar+self.eps)
|
||||
b = self.bias - xmean * w
|
||||
return x * w + b
|
||||
|
||||
|
||||
LayerNorm2d = LayerNorm1d = LayerNorm
|
||||
|
||||
class GroupNorm(Module):
|
||||
def __init__(self, num_groups, num_channels, eps=1e-05, affine=True, is_train=True):
|
||||
|
@ -473,28 +438,32 @@ class GroupNorm(Module):
|
|||
self.eps = eps
|
||||
|
||||
self.affine = affine
|
||||
if self.affine:
|
||||
self.weight = init.constant((num_channels,), "float32", 1.0)
|
||||
self.bias = init.constant((num_channels,), "float32", 0.0)
|
||||
self.weight = init.constant((num_channels,), "float32", 1.0) if affine else 1.0
|
||||
self.bias = init.constant((num_channels,), "float32", 0.0) if affine else 0.0
|
||||
|
||||
def execute(self, x):
|
||||
N = x.shape[0]
|
||||
C = self.num_channels
|
||||
output_shape = (N,-1)
|
||||
# TODO: 3d group norm
|
||||
# TODO: 3d group norm
|
||||
if x.ndim==4:
|
||||
output_shape = x.shape
|
||||
assert C % self.num_groups == 0
|
||||
x = x.reshape((N, self.num_groups, int(C/self.num_groups), -1))
|
||||
xmean = jt.mean(x, dims=[2,3], keepdims=1)
|
||||
x2mean = jt.mean(x*x, dims=[2,3], keepdims=1)
|
||||
xvar = jt.maximum(x2mean-xmean*xmean, 0)
|
||||
norm_x = (x-xmean)/jt.sqrt(xvar+self.eps)
|
||||
if not self.affine:
|
||||
return norm_x.reshape(output_shape)
|
||||
w = self.weight.reshape((1,self.num_groups,C//self.num_groups,1))
|
||||
b = self.bias.reshape((1,self.num_groups,C//self.num_groups,1))
|
||||
return (norm_x * w + b).reshape(output_shape)
|
||||
x = x.reshape((N, self.num_groups, C//self.num_groups, -1))
|
||||
xmean = jt.mean(x, dims=[2,3]).reshape((N, self.num_groups, 1))
|
||||
x2mean = jt.mean(x*x, dims=[2,3]).reshape((N, self.num_groups, 1))
|
||||
xvar = (x2mean-xmean*xmean).maximum(0.0)
|
||||
|
||||
if self.affine:
|
||||
w = self.weight.reshape((1, self.num_groups, -1))
|
||||
b = self.bias.reshape((1, self.num_groups, -1))
|
||||
else:
|
||||
w = 1
|
||||
b = 0
|
||||
w = w / jt.sqrt(xvar+self.eps)
|
||||
b = b - xmean * w
|
||||
x = x * w.broadcast(x, [3]) + b.broadcast(x, [3])
|
||||
return x.reshape(output_shape)
|
||||
|
||||
Relu = jt.make_module(relu)
|
||||
ReLU = Relu
|
||||
|
@ -504,6 +473,8 @@ ReLU6 = jt.make_module(relu6)
|
|||
Softmax = jt.make_module(softmax, 2)
|
||||
GELU = jt.make_module(gelu)
|
||||
|
||||
from jittor.depthwise_conv import DepthwiseConv
|
||||
|
||||
class Conv(Module):
|
||||
def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True):
|
||||
self.in_channels = in_channels
|
||||
|
@ -513,6 +484,9 @@ class Conv(Module):
|
|||
self.padding = padding if isinstance(padding, tuple) else (padding, padding)
|
||||
self.dilation = dilation if isinstance(dilation, tuple) else (dilation, dilation)
|
||||
self.groups = groups
|
||||
self.is_depthwise_conv = self.groups == self.out_channels and self.groups == self.in_channels
|
||||
if self.is_depthwise_conv and jt.flags.use_cuda:
|
||||
self.depthwise_conv = DepthwiseConv(stride, padding, dilation)
|
||||
assert in_channels % groups == 0, 'in_channels must be divisible by groups'
|
||||
assert out_channels % groups == 0, 'out_channels must be divisible by groups'
|
||||
Kh, Kw = self.kernel_size
|
||||
|
@ -532,7 +506,13 @@ class Conv(Module):
|
|||
self.bias = None
|
||||
|
||||
def execute(self, x):
|
||||
if self.groups == 1:
|
||||
if self.is_depthwise_conv and jt.flags.use_cuda:
|
||||
y = self.depthwise_conv(x, self.weight)
|
||||
if self.bias is not None:
|
||||
b = self.bias.broadcast(y.shape, [0,2,3])
|
||||
y = y + b
|
||||
return y
|
||||
elif self.groups == 1:
|
||||
N,C,H,W = x.shape
|
||||
Kh, Kw = self.kernel_size
|
||||
assert C==self.in_channels
|
||||
|
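The new branch above only fires when `groups == in_channels == out_channels` and CUDA is on, i.e. a depthwise convolution where each channel is filtered by its own kernel; other group counts still go through the reindex-based path. A usage sketch:

```python
# A convolution that takes the depthwise fast path when CUDA is available.
import jittor as jt
from jittor import nn

jt.flags.use_cuda = 1 if jt.has_cuda else 0
conv = nn.Conv(32, 32, 3, padding=1, groups=32)   # groups == in == out -> depthwise
y = conv(jt.random((4, 32, 56, 56)))
print(y.shape)                                     # [4, 32, 56, 56]
```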
@ -566,7 +546,6 @@ class Conv(Module):
|
|||
f'i4*{self.stride[0]}-{self.padding[0]}+i6*{self.dilation[0]}', # Hid+Khid
|
||||
f'i5*{self.stride[1]}-{self.padding[1]}+i7*{self.dilation[1]}', # Wid+KWid
|
||||
])
|
||||
xx.compile_options = {"G":G}
|
||||
# w: [oc, CpG, Kh, Kw]
|
||||
ww = self.weight.reindex([N, G, oc//G, CpG, oh, ow, Kh, Kw], [
|
||||
f'i1*{oc//G}+i2',
|
||||
|
@ -574,6 +553,7 @@ class Conv(Module):
|
|||
'i6',
|
||||
'i7'
|
||||
])
|
||||
ww.compile_options = xx.compile_options = {"G":G,"C":C}
|
||||
yy = xx*ww
|
||||
y = yy.reindex_reduce('add', [N, oc, oh, ow], [
|
||||
'i0',
|
||||
|
@ -727,6 +707,45 @@ class ConvTranspose(Module):
|
|||
y = y + b
|
||||
return y
|
||||
|
||||
def conv_transpose(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1):
|
||||
x = input
|
||||
N,C,H,W = x.shape
|
||||
i,o,h,w = weight.shape
|
||||
assert C==i
|
||||
assert groups==1, "Group conv not supported yet."
|
||||
stride = stride if isinstance(stride, tuple) else (stride, stride)
|
||||
dilation = dilation if isinstance(dilation, tuple) else (dilation, dilation)
|
||||
# added
|
||||
padding = padding if isinstance(padding, tuple) else (padding, padding)
|
||||
output_padding = output_padding if isinstance (output_padding, tuple) else (output_padding, output_padding)
|
||||
assert output_padding[0] < max(stride[0], dilation[0]) and \
|
||||
output_padding[1] < max(stride[1], dilation[1]), \
|
||||
"output padding must be smaller than max(stride, dilation)"
|
||||
|
||||
stride_h, stride_w = stride
|
||||
padding_h, padding_w = padding
|
||||
dilation_h, dilation_w = dilation
|
||||
|
||||
h_out = (H-1) * stride_h + output_padding[0] - 2*padding_h + 1 + (h-1)*dilation_h
|
||||
w_out = (W-1) * stride_w + output_padding[1] - 2*padding_w + 1 + (w-1)*dilation_w
|
||||
out_shape = (N, o, h_out, w_out)
|
||||
shape = (N, i, o, H, W, h, w)
|
||||
xx = x.broadcast(shape, (2, 5, 6)) # i,h,w
|
||||
ww = weight.broadcast(shape, (0, 3, 4)) # N,H,W
|
||||
y = (ww*xx).reindex_reduce("add", out_shape, [
|
||||
'i0', # N
|
||||
'i2', # o
|
||||
f'i3*{stride_h}-{padding_h}+i5*{dilation_h}', # Hid+Khid
|
||||
f'i4*{stride_w}-{padding_w}+i6*{dilation_w}', # Wid+KWid
|
||||
])
|
||||
if isinstance(bias, jt.Var):
|
||||
b = bias.broadcast(y.shape, [0,2,3])
|
||||
y = y + b
|
||||
else:
|
||||
assert not bias, "Bias should be none or jittor var"
|
||||
return y
|
||||
|
||||
conv_transpose2d = conv_transpose
|
||||
|
||||
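The functional form mirrors the `ConvTranspose` module; its output size follows `h_out = (H-1)*stride - 2*padding + output_padding + 1 + (kh-1)*dilation` (and likewise for the width). A quick shape check:

```python
# Shape check for the functional transposed convolution defined above.
import jittor as jt
from jittor import nn

x = jt.random((2, 5, 10, 10))    # N, C_in, H, W
w = jt.random((5, 6, 3, 3))      # C_in, C_out, kh, kw
y = nn.conv_transpose2d(x, w, stride=2, padding=1, output_padding=1)
print(y.shape)                   # (10-1)*2 - 2 + 1 + 1 + 2 = 20 -> [2, 6, 20, 20]
```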
def pad(x,padding, mode='constant', value=0):
|
||||
assert mode in ['constant','replicate','reflect','circular'],'only support constant,replicate,reflect,circular pad'
|
||||
|
|
|
@ -33,6 +33,9 @@ class Optimizer(object):
|
|||
assert isinstance(pg, dict)
|
||||
self.param_groups.append(pg)
|
||||
self.n_step = 0
|
||||
|
||||
def add_param_group(self, group):
|
||||
self.param_groups.append(group)
|
||||
|
||||
@property
|
||||
def defaults(self):
|
||||
|
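`add_param_group` lets parameters join an existing optimizer with their own hyper-parameters. A sketch, assuming the same group-dict layout (`{"params": ..., "lr": ...}`) the constructor already accepts:

```python
# Sketch of add_param_group: a second group with its own learning rate.
# The {"params": ..., "lr": ...} dict layout is assumed from the constructor.
import jittor as jt
from jittor import nn, optim

backbone, head = nn.Linear(8, 8), nn.Linear(8, 2)
opt = optim.SGD(backbone.parameters(), lr=0.01)
opt.add_param_group({"params": head.parameters(), "lr": 0.1})

loss = head(backbone(jt.random((4, 8)))).sqr().mean()
opt.step(loss)
print(len(opt.param_groups))   # 2
```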
@ -210,3 +213,64 @@ class Adam(Optimizer):
|
|||
v.update(b1 * v + (1-b1) * g * g)
|
||||
step_size = lr * jt.sqrt(1-b1**n) / (1-b0 ** n)
|
||||
p.update(p - m * step_size / (jt.sqrt(v) + eps))
|
||||
|
||||
|
||||
class LRScheduler:
|
||||
def __init__(self,optimizer, last_epoch=-1):
|
||||
assert isinstance(optimizer,Optimizer)
|
||||
self.optimizer = optimizer
|
||||
|
||||
if last_epoch==-1:
|
||||
for gp in optimizer.param_groups:
|
||||
gp.setdefault('initial_lr',gp.get('lr',optimizer.lr))
|
||||
else:
|
||||
for gp in optimizer.param_groups:
|
||||
assert 'initial_lr' in gp
|
||||
|
||||
self.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
|
||||
self.last_epoch = last_epoch
|
||||
self.optimizer._step_count = 0
|
||||
self._step_count = 0
|
||||
self.step()
|
||||
|
||||
def get_lr(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def get_last_lr(self):
|
||||
return self._last_lr
|
||||
|
||||
def step(self,epoch=None):
|
||||
self._step_count += 1
|
||||
|
||||
if epoch is None:
|
||||
self.last_epoch += 1
|
||||
values = self.get_lr()
|
||||
else:
|
||||
self.last_epoch = epoch
|
||||
values = self.get_lr()
|
||||
|
||||
for i, data in enumerate(zip(self.optimizer.param_groups, values)):
|
||||
param_group, lr = data
|
||||
param_group['lr'] = lr
|
||||
|
||||
self._last_lr = [group['lr'] for group in self.optimizer.param_groups]
|
||||
|
||||
|
||||
class LambdaLR(LRScheduler):
|
||||
|
||||
def __init__(self, optimizer, lr_lambda, last_epoch=-1):
|
||||
if not isinstance(lr_lambda, list) and not isinstance(lr_lambda, tuple):
|
||||
self.lr_lambdas = [lr_lambda] * len(optimizer.param_groups)
|
||||
else:
|
||||
if len(lr_lambda) != len(optimizer.param_groups):
|
||||
raise ValueError("Expected {} lr_lambdas, but got {}".format(len(optimizer.param_groups), len(lr_lambda)))
|
||||
|
||||
self.lr_lambdas = list(lr_lambda)
|
||||
|
||||
super(LambdaLR, self).__init__(optimizer, last_epoch)
|
||||
|
||||
|
||||
|
||||
def get_lr(self):
|
||||
return [base_lr * lmbda(self.last_epoch)
|
||||
for lmbda, base_lr in zip(self.lr_lambdas, self.base_lrs)]
|
|
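A minimal sketch of driving the new scheduler; the import location is assumed to be the module this hunk patches (alongside `SGD`/`Adam`), which may differ from where the packaged release exposes it:

```python
# Minimal LambdaLR sketch; the import path is an assumption.
import jittor as jt
from jittor import nn, optim

model = nn.Linear(10, 2)
opt = optim.SGD(model.parameters(), lr=0.1)
sched = optim.LambdaLR(opt, lr_lambda=lambda epoch: 0.95 ** epoch)

for epoch in range(3):
    loss = model(jt.random((4, 10))).sqr().mean()
    opt.step(loss)
    sched.step()
    print(epoch, sched.get_last_lr())   # lr decays by 5% per epoch
```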
@ -40,6 +40,7 @@ class Pool(Module):
|
|||
count = f"int count = {self.kernel_size*self.kernel_size};"
|
||||
else:
|
||||
count = "int count = (k2_ - k2) * (k3_ - k3);"
|
||||
count += "float32 rcount = 1.0f / count;"
|
||||
else:
|
||||
count = ""
|
||||
forward_body = f'''{{
|
||||
|
@ -168,7 +169,9 @@ class AdaptiveAvgPool2d(Module):
|
|||
oh = x.shape[2] if self.output_size[0] is None else self.output_size[0]
|
||||
ow = x.shape[3] if self.output_size[1] is None else self.output_size[1]
|
||||
else:
|
||||
raise TypeError(f"AdaptiveAvgPool2d only support int, typle or list input. Not support {type(self.output_size)} yet.")
|
||||
raise TypeError(f"AdaptiveAvgPool2d only support int, tuple or list input. Not support {type(self.output_size)} yet.")
|
||||
if oh == 1 and ow == 1:
|
||||
return x.reduce("mean", [2,3], keepdims=True)
|
||||
N,C,H,W = x.shape
|
||||
self.sh = math.floor(H / oh)
|
||||
self.sw = math.floor(W / ow)
|
||||
|
|
|
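With the fix above, asking `AdaptiveAvgPool2d` for a 1x1 output short-circuits to a plain mean over the spatial axes:

```python
# The 1x1 fast path added above is just a global mean over H and W.
import numpy as np
import jittor as jt
from jittor import nn

x = jt.random((2, 3, 7, 9))
y = nn.AdaptiveAvgPool2d(1)(x)       # reduce("mean", [2,3], keepdims=True)
print(y.shape)                        # [2, 3, 1, 1]
print(np.allclose(y.data.squeeze(), x.data.mean(axis=(2, 3))))   # True
```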
@ -0,0 +1,221 @@
|
|||
import sys, os
|
||||
|
||||
suffix = ""
|
||||
|
||||
import jittor as jt
|
||||
import time
|
||||
from pathlib import Path
|
||||
home_path = str(Path.home())
|
||||
perf_path = os.path.join(home_path, ".cache", "jittor_perf")
|
||||
|
||||
def main():
|
||||
os.makedirs(perf_path+"/src/jittor", exist_ok=True)
|
||||
os.makedirs(perf_path+"/src/jittor_utils", exist_ok=True)
|
||||
os.system(f"cp -rL {jt.flags.jittor_path} {perf_path+'/src/'}")
|
||||
os.system(f"cp -rL {jt.flags.jittor_path}/../jittor_utils {perf_path+'/src/'}")
|
||||
use_torch_1_4 = os.environ.get("use_torch_1_4", "0") == "1"
|
||||
dockerfile_src = r"""
|
||||
FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
|
||||
|
||||
RUN echo \
|
||||
"deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic main restricted universe multiverse\n\
|
||||
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-updates main restricted universe multiverse\n\
|
||||
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-backports main restricted universe multiverse\n\
|
||||
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-security main restricted universe multiverse" > /etc/apt/sources.list
|
||||
|
||||
# RUN rm -rf /var/lib/apt/lists/*
|
||||
RUN apt update || true
|
||||
|
||||
RUN apt install wget \
|
||||
python3.7 python3.7-dev \
|
||||
g++ build-essential -y
|
||||
|
||||
WORKDIR /usr/src
|
||||
|
||||
RUN apt download python3-distutils && dpkg-deb -x ./python3-distutils* / \
|
||||
&& wget -O - https://bootstrap.pypa.io/get-pip.py | python3.7
|
||||
|
||||
# change tsinghua mirror
|
||||
RUN pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
|
||||
RUN pip3 install \
|
||||
pybind11 \
|
||||
numpy \
|
||||
tqdm \
|
||||
pillow \
|
||||
astunparse
|
||||
|
||||
RUN pip3 install torch torchvision
|
||||
"""
|
||||
global suffix
|
||||
if use_torch_1_4:
|
||||
suffix = "_1_4"
|
||||
dockerfile_src = dockerfile_src.replace("torch ", "torch==1.4.0 ")
|
||||
dockerfile_src = dockerfile_src.replace("torchvision", "torchvision==0.5.0")
|
||||
with open("/tmp/perf_dockerfile", 'w') as f:
|
||||
f.write(dockerfile_src)
|
||||
assert os.system("sudo nvidia-smi -lgc 1500") == 0
|
||||
assert os.system(f"sudo docker build --tag jittor/jittor-perf{suffix} -f /tmp/perf_dockerfile .") == 0
|
||||
# run once for compile source
|
||||
jt_fps = test_main("jittor", "resnet50", 1)
|
||||
|
||||
logs = ""
|
||||
# resnext50_32x4d with bs=8 cannot pass this test
|
||||
#### inference test
|
||||
for model_name in ["resnet50", "wide_resnet50_2", # "resnext50_32x4d",
|
||||
"resnet152", "wide_resnet101_2", "resnext101_32x8d",
|
||||
"alexnet", "vgg11", "squeezenet1_1", "mobilenet_v2",
|
||||
"densenet121", "densenet169", "densenet201",
|
||||
"res2net50", "res2net101"]:
|
||||
for bs in [1, 2, 4, 8, 16, 32, 64, 128]:
|
||||
jt_fps = test_main("jittor", model_name, bs)
|
||||
logs += f"jittor-{model_name}-{bs} {jt_fps}\n"
|
||||
tc_fps = test_main("torch", model_name, bs)
|
||||
logs += f"torch-{model_name}-{bs} {tc_fps}\n"
|
||||
logs += f"compare-{model_name}-{bs} {jt_fps/tc_fps}\n"
|
||||
print(logs)
|
||||
#### train test
|
||||
for model_name in ["train_resnet50", "train_resnet101"
|
||||
]:
|
||||
for bs in [1, 2, 4, 8, 16, 32, 64, 128]:
|
||||
jt_fps = test_main("jittor", model_name, bs)
|
||||
logs += f"jittor-{model_name}-{bs} {jt_fps}\n"
|
||||
tc_fps = test_main("torch", model_name, bs)
|
||||
logs += f"torch-{model_name}-{bs} {tc_fps}\n"
|
||||
logs += f"compare-{model_name}-{bs} {jt_fps/tc_fps}\n"
|
||||
print(logs)
|
||||
with open(f"{perf_path}/jittor-perf{suffix}-latest.txt", "w") as f:
|
||||
f.write(logs)
|
||||
from datetime import datetime
|
||||
with open(f"{perf_path}/jittor-perf{suffix}-{datetime.now()}.txt", "w") as f:
|
||||
f.write(logs)
|
||||
|
||||
def test_main(name, model_name, bs):
|
||||
cmd = f"sudo docker run --gpus all --rm -v {perf_path}:/root/.cache/jittor --network host jittor/jittor-perf{suffix} bash -c 'PYTHONPATH=/root/.cache/jittor/src python3.7 /root/.cache/jittor/src/jittor/test/perf/perf.py {name} {model_name} {bs}'"
|
||||
fps = -1
|
||||
try:
|
||||
print("run cmd:", cmd)
|
||||
if os.system(cmd) == 0:
|
||||
with open(f"{perf_path}/{name}-{model_name}-{bs}.txt", 'r') as f:
|
||||
fps = float(f.read().split()[3])
|
||||
except:
|
||||
pass
|
||||
return fps
|
||||
|
||||
def time_iter(duration=2, min_iter=5):
|
||||
start = time.time()
|
||||
for i in range(10000000):
|
||||
yield i
|
||||
end = time.time()
|
||||
if end-start>duration and i>=min_iter:
|
||||
return
|
||||
|
||||
def test(name, model_name, bs):
|
||||
print("hello", name, model_name, bs)
|
||||
import numpy as np
|
||||
import time
|
||||
is_train = False
|
||||
_model_name = model_name
|
||||
if model_name.startswith("train_"):
|
||||
is_train = True
|
||||
model_name = model_name[6:]
|
||||
if name == "torch":
|
||||
import torch
|
||||
import torchvision.models as tcmodels
|
||||
from torch import optim
|
||||
from torch import nn
|
||||
torch.backends.cudnn.deterministic = False
|
||||
torch.backends.cudnn.benchmark = True
|
||||
model = tcmodels.__dict__[model_name]()
|
||||
model = model.cuda()
|
||||
else:
|
||||
import jittor as jt
|
||||
from jittor import optim
|
||||
from jittor import nn
|
||||
jt.flags.use_cuda = 1
|
||||
jt.cudnn.set_algorithm_cache_size(10000)
|
||||
import jittor.models as jtmodels
|
||||
model = jtmodels.__dict__[model_name]()
|
||||
if (model_name == "resnet152" or model_name == "resnet101") and bs == 128 and is_train:
|
||||
jt.cudnn.set_max_workspace_ratio(0.05)
|
||||
if is_train:
|
||||
model.train()
|
||||
else:
|
||||
model.eval()
|
||||
img_size = 224
|
||||
if model_name == "inception_v3":
|
||||
img_size = 300
|
||||
test_img = np.random.random((bs, 3, img_size, img_size)).astype("float32")
|
||||
if is_train:
|
||||
label = (np.random.random((bs,)) * 1000).astype("int32")
|
||||
if name == "torch":
|
||||
test_img = torch.Tensor(test_img).cuda()
|
||||
if is_train:
|
||||
label = torch.LongTensor(label).cuda()
|
||||
opt = optim.SGD(model.parameters(), 0.001)
|
||||
sync = lambda: torch.cuda.synchronize()
|
||||
jt = torch
|
||||
else:
|
||||
test_img = jt.array(test_img).stop_grad()
|
||||
if is_train:
|
||||
label = jt.array(label).stop_grad()
|
||||
opt = optim.SGD(model.parameters(), 0.001)
|
||||
sync = lambda: jt.sync_all(True)
|
||||
|
||||
sync()
|
||||
use_profiler = os.environ.get("use_profiler", "0") == "1"
|
||||
if hasattr(jt, "nograd"):
|
||||
ng = jt.no_grad()
|
||||
ng.__enter__()
|
||||
def iter():
|
||||
x = model(test_img)
|
||||
if isinstance(x, tuple):
|
||||
x = x[0]
|
||||
if is_train:
|
||||
loss = nn.CrossEntropyLoss()(x, label)
|
||||
if name == "jittor":
|
||||
opt.step(loss)
|
||||
else:
|
||||
opt.zero_grad()
|
||||
loss.backward()
|
||||
opt.step()
|
||||
else:
|
||||
x.sync()
|
||||
sync()
|
||||
for i in time_iter():
|
||||
iter()
|
||||
sync()
|
||||
for i in time_iter():
|
||||
iter()
|
||||
sync()
|
||||
if use_profiler:
|
||||
if name == "torch":
|
||||
prof = torch.autograd.profiler.profile(use_cuda=True)
|
||||
else:
|
||||
prof = jt.profile_scope()
|
||||
prof.__enter__()
|
||||
if name == "jittor":
|
||||
if hasattr(jt.flags, "use_parallel_op_compiler"):
|
||||
jt.flags.use_parallel_op_compiler = 0
|
||||
start = time.time()
|
||||
for i in time_iter(10):
|
||||
iter()
|
||||
sync()
|
||||
end = time.time()
|
||||
if use_profiler:
|
||||
prof.__exit__(None,None,None)
|
||||
if name == "torch":
|
||||
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=30))
|
||||
total_iter = i+1
|
||||
print("duration:", end-start, "FPS:", total_iter*bs/(end-start))
|
||||
fpath = f"{home_path}/.cache/jittor/{name}-{_model_name}-{bs}.txt"
|
||||
with open(fpath, 'w') as f:
|
||||
f.write(f"duration: {end-start} FPS: {total_iter*bs/(end-start)}")
|
||||
os.chmod(fpath, 0o666)
|
||||
|
||||
if len(sys.argv) <= 1:
|
||||
main()
|
||||
else:
|
||||
name, model, bs = sys.argv[1:]
|
||||
bs = int(bs)
|
||||
test(name, model, bs)
|
|
@ -0,0 +1,6 @@
|
|||
bash python/jittor/test/system/test_cuda10.0_ubuntu16.04.sh
|
||||
bash python/jittor/test/system/test_cuda10.0_ubuntu18.04.sh
|
||||
bash python/jittor/test/system/test_cuda11.1_ubuntu16.04.sh
|
||||
bash python/jittor/test/system/test_cuda11.1_ubuntu18.04.sh
|
||||
bash python/jittor/test/system/test_cuda11.1_ubuntu20.04.sh
|
||||
bash python/jittor/test/system/test_nocuda_ubuntu18.04.sh
|
|
@ -0,0 +1,41 @@
|
|||
cat > /tmp/cuda10.0-ubuntu16.04.dockerfile <<\EOF
|
||||
FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04
|
||||
|
||||
RUN apt update && apt install ca-certificates -y
|
||||
|
||||
RUN echo \
|
||||
"deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic main restricted universe multiverse\n\
|
||||
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-updates main restricted universe multiverse\n\
|
||||
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-backports main restricted universe multiverse\n\
|
||||
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-security main restricted universe multiverse" > /etc/apt/sources.list
|
||||
|
||||
# RUN rm -rf /var/lib/apt/lists/*
|
||||
RUN apt update || true
|
||||
|
||||
RUN apt install wget \
|
||||
python3.7 python3.7-dev \
|
||||
g++ build-essential -y
|
||||
|
||||
WORKDIR /usr/src
|
||||
|
||||
RUN apt download python3-distutils && dpkg-deb -x ./python3-distutils* / \
|
||||
&& wget -O - https://bootstrap.pypa.io/get-pip.py | python3.7
|
||||
|
||||
# change tsinghua mirror
|
||||
RUN pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
|
||||
RUN pip3 install jittor --timeout 100 && python3.7 -m jittor.test.test_example
|
||||
RUN pip3 uninstall jittor -y
|
||||
|
||||
COPY . jittor
|
||||
RUN python3.7 -m pip install ./jittor
|
||||
RUN python3.7 -m jittor.test.test_core
|
||||
EOF
|
||||
|
||||
sudo docker build --tag jittor/jittor-cuda:10.0-16.04 -f /tmp/cuda10.0-ubuntu16.04.dockerfile .
|
||||
sudo docker run --gpus all --rm jittor/jittor-cuda:10.0-16.04 bash -c \
|
||||
"python3.7 -m jittor.test.test_example && \
|
||||
python3.7 -m jittor.test.test_resnet && \
|
||||
python3.7 -m jittor.test.test_parallel_pass && \
|
||||
python3.7 -m jittor.test.test_atomic_tuner && \
|
||||
python3.7 -m jittor.test.test_where_op"
|
|
@ -0,0 +1,41 @@
|
|||
cat > /tmp/cuda10.0-ubuntu18.04.dockerfile <<\EOF
|
||||
FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04
|
||||
|
||||
RUN apt update && apt install ca-certificates -y
|
||||
|
||||
RUN echo \
|
||||
"deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic main restricted universe multiverse\n\
|
||||
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-updates main restricted universe multiverse\n\
|
||||
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-backports main restricted universe multiverse\n\
|
||||
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-security main restricted universe multiverse" > /etc/apt/sources.list
|
||||
|
||||
# RUN rm -rf /var/lib/apt/lists/*
|
||||
RUN apt update || true
|
||||
|
||||
RUN apt install wget \
|
||||
python3.7 python3.7-dev \
|
||||
g++ build-essential -y
|
||||
|
||||
WORKDIR /usr/src
|
||||
|
||||
RUN apt download python3-distutils && dpkg-deb -x ./python3-distutils* / \
|
||||
&& wget -O - https://bootstrap.pypa.io/get-pip.py | python3.7
|
||||
|
||||
# change tsinghua mirror
|
||||
RUN pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
|
||||
RUN pip3 install jittor --timeout 100 && python3.7 -m jittor.test.test_example
|
||||
RUN pip3 uninstall jittor -y
|
||||
|
||||
COPY . jittor
|
||||
RUN python3.7 -m pip install ./jittor
|
||||
RUN python3.7 -m jittor.test.test_core
|
||||
EOF
|
||||
|
||||
sudo docker build --tag jittor/jittor-cuda:10.0-18.04 -f /tmp/cuda10.0-ubuntu18.04.dockerfile .
|
||||
sudo docker run --gpus all --rm jittor/jittor-cuda:10.0-18.04 bash -c \
|
||||
"python3.7 -m jittor.test.test_example && \
|
||||
python3.7 -m jittor.test.test_resnet && \
|
||||
python3.7 -m jittor.test.test_parallel_pass && \
|
||||
python3.7 -m jittor.test.test_atomic_tuner && \
|
||||
python3.7 -m jittor.test.test_where_op"
|
|
@ -0,0 +1,41 @@
|
|||
cat > /tmp/cuda11.1-ubuntu16.04.dockerfile <<\EOF
|
||||
FROM nvidia/cuda:11.1-cudnn8-devel-ubuntu16.04
|
||||
|
||||
RUN apt update && apt install ca-certificates -y
|
||||
|
||||
RUN echo \
|
||||
"deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic main restricted universe multiverse\n\
|
||||
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-updates main restricted universe multiverse\n\
|
||||
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-backports main restricted universe multiverse\n\
|
||||
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-security main restricted universe multiverse" > /etc/apt/sources.list
|
||||
|
||||
# RUN rm -rf /var/lib/apt/lists/*
|
||||
RUN apt update || true
|
||||
|
||||
RUN apt install wget \
|
||||
python3.7 python3.7-dev \
|
||||
g++ build-essential -y
|
||||
|
||||
WORKDIR /usr/src
|
||||
|
||||
RUN apt download python3-distutils && dpkg-deb -x ./python3-distutils* / \
|
||||
&& wget -O - https://bootstrap.pypa.io/get-pip.py | python3.7
|
||||
|
||||
# change tsinghua mirror
|
||||
RUN pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
|
||||
RUN pip3 install jittor --timeout 100 && python3.7 -m jittor.test.test_example
|
||||
RUN pip3 uninstall jittor -y
|
||||
|
||||
COPY . jittor
|
||||
RUN python3.7 -m pip install ./jittor
|
||||
RUN python3.7 -m jittor.test.test_core
|
||||
EOF
|
||||
|
||||
sudo docker build --tag jittor/jittor-cuda:11.1-16.04 -f /tmp/cuda11.1-ubuntu16.04.dockerfile .
|
||||
sudo docker run --gpus all --rm jittor/jittor-cuda:11.1-16.04 bash -c \
|
||||
"python3.7 -m jittor.test.test_example && \
|
||||
python3.7 -m jittor.test.test_resnet && \
|
||||
python3.7 -m jittor.test.test_parallel_pass && \
|
||||
python3.7 -m jittor.test.test_atomic_tuner && \
|
||||
python3.7 -m jittor.test.test_where_op"
|
|
@ -0,0 +1,41 @@
|
|||
cat > /tmp/cuda11.1-ubuntu18.04.dockerfile <<\EOF
|
||||
FROM nvidia/cuda:11.1-cudnn8-devel-ubuntu18.04
|
||||
|
||||
RUN apt update && apt install ca-certificates -y
|
||||
|
||||
RUN echo \
|
||||
"deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic main restricted universe multiverse\n\
|
||||
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-updates main restricted universe multiverse\n\
|
||||
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-backports main restricted universe multiverse\n\
|
||||
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-security main restricted universe multiverse" > /etc/apt/sources.list
|
||||
|
||||
# RUN rm -rf /var/lib/apt/lists/*
|
||||
RUN apt update || true
|
||||
|
||||
RUN apt install wget \
|
||||
python3.7 python3.7-dev \
|
||||
g++ build-essential -y
|
||||
|
||||
WORKDIR /usr/src
|
||||
|
||||
RUN apt download python3-distutils && dpkg-deb -x ./python3-distutils* / \
|
||||
&& wget -O - https://bootstrap.pypa.io/get-pip.py | python3.7
|
||||
|
||||
# change tsinghua mirror
|
||||
RUN pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
|
||||
RUN pip3 install jittor --timeout 100 && python3.7 -m jittor.test.test_example
|
||||
RUN pip3 uninstall jittor -y
|
||||
|
||||
COPY . jittor
|
||||
RUN python3.7 -m pip install ./jittor
|
||||
RUN python3.7 -m jittor.test.test_core
|
||||
EOF
|
||||
|
||||
sudo docker build --tag jittor/jittor-cuda:11.1-18.04 -f /tmp/cuda11.1-ubuntu18.04.dockerfile .
|
||||
sudo docker run --gpus all --rm jittor/jittor-cuda:11.1-18.04 bash -c \
|
||||
"python3.7 -m jittor.test.test_example && \
|
||||
python3.7 -m jittor.test.test_resnet && \
|
||||
python3.7 -m jittor.test.test_parallel_pass && \
|
||||
python3.7 -m jittor.test.test_atomic_tuner && \
|
||||
python3.7 -m jittor.test.test_where_op"
|
|
@ -0,0 +1,39 @@
|
|||
cat > /tmp/cuda11.1-ubuntu20.04.dockerfile <<\EOF
|
||||
FROM nvidia/cuda:11.1-devel-ubuntu20.04
|
||||
|
||||
RUN apt update && apt install ca-certificates -y
|
||||
|
||||
RUN echo \
|
||||
"deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal main restricted universe multiverse\n\
|
||||
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse\n\
|
||||
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse\n\
|
||||
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-security main restricted universe multiverse" > /etc/apt/sources.list
|
||||
|
||||
# RUN rm -rf /var/lib/apt/lists/*
|
||||
RUN apt update || true
|
||||
RUN apt install g++ build-essential libomp-dev python3-dev python3-pip wget -y
|
||||
RUN python3 -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
WORKDIR /usr/src/
|
||||
|
||||
RUN wget https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2004/x86_64/libcudnn8_8.0.5.39-1+cuda11.1_amd64.deb && \
|
||||
wget https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2004/x86_64/libcudnn8-dev_8.0.5.39-1+cuda11.1_amd64.deb && \
|
||||
dpkg -i ./libcudnn8_8.0.5.39-1+cuda11.1_amd64.deb ./libcudnn8-dev_8.0.5.39-1+cuda11.1_amd64.deb && \
|
||||
rm *.deb
|
||||
RUN ls
|
||||
|
||||
|
||||
RUN pip3 install jittor --timeout 100 && python3 -m jittor.test.test_example
|
||||
RUN pip3 uninstall jittor -y
|
||||
|
||||
COPY . jittor
|
||||
RUN python3 -m pip install ./jittor
|
||||
RUN python3 -m jittor.test.test_core
|
||||
EOF
|
||||
|
||||
sudo docker build --tag jittor/jittor-cuda:11.1-20.04 -f /tmp/cuda11.1-ubuntu20.04.dockerfile .
|
||||
sudo docker run --gpus all --rm jittor/jittor-cuda:11.1-20.04 bash -c \
|
||||
"python3 -m jittor.test.test_example && \
|
||||
python3 -m jittor.test.test_resnet && \
|
||||
python3 -m jittor.test.test_parallel_pass && \
|
||||
python3 -m jittor.test.test_atomic_tuner && \
|
||||
python3 -m jittor.test.test_where_op"
|
|
@ -0,0 +1,40 @@
|
|||
cat > /tmp/ubuntu18.04.dockerfile <<\EOF
|
||||
FROM ubuntu:18.04
|
||||
|
||||
RUN apt update && apt install ca-certificates -y
|
||||
|
||||
RUN echo \
|
||||
"deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic main restricted universe multiverse\n\
|
||||
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-updates main restricted universe multiverse\n\
|
||||
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-backports main restricted universe multiverse\n\
|
||||
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-security main restricted universe multiverse" > /etc/apt/sources.list
|
||||
|
||||
# RUN rm -rf /var/lib/apt/lists/*
|
||||
RUN apt update
|
||||
|
||||
RUN apt install wget \
|
||||
python3.7 python3.7-dev \
|
||||
g++ build-essential -y
|
||||
|
||||
WORKDIR /usr/src
|
||||
|
||||
RUN apt download python3-distutils && dpkg-deb -x ./python3-distutils* / \
|
||||
&& wget -O - https://bootstrap.pypa.io/get-pip.py | python3.7
|
||||
|
||||
# change tsinghua mirror
|
||||
RUN pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
|
||||
RUN pip3 install jittor --timeout 100 && python3.7 -m jittor.test.test_example
|
||||
RUN pip3 uninstall jittor -y
|
||||
|
||||
COPY . jittor
|
||||
RUN python3.7 -m pip install ./jittor
|
||||
RUN python3.7 -m jittor.test.test_core
|
||||
EOF
|
||||
|
||||
sudo docker build --tag jittor/jittor:18.04 -f /tmp/ubuntu18.04.dockerfile .
|
||||
sudo docker run --gpus all --rm jittor/jittor:18.04 bash -c \
|
||||
"python3.7 -m jittor.test.test_example && \
|
||||
python3.7 -m jittor.test.test_parallel_pass && \
|
||||
python3.7 -m jittor.test.test_atomic_tuner && \
|
||||
python3.7 -m jittor.test.test_where_op"
|
|
@ -60,6 +60,41 @@ class TestConvTranspose(unittest.TestCase):
|
|||
check((4, 5, 100, 100), (5, 6, 5, 5), 1, 2)
|
||||
check((4, 5, 100, 100), (5, 6, 5, 5), 2, 2)
|
||||
check((4, 5, 100, 100), (5, 6, 5, 5), 2, 3)
|
||||
|
||||
def test_function(self):
|
||||
def check(data_shape, weights_shape, stride=1, dilation=1):
|
||||
N,C,H,W = data_shape
|
||||
i,o,h,w = weights_shape
|
||||
img = np.random.rand(N,C,H,W).astype("float32")
|
||||
weights = np.random.rand(i,o,h,w).astype("float32")
|
||||
m1 = jt.nn.ConvTranspose(i,o,h, stride=stride, dilation=dilation, bias=False)
|
||||
m2 = torch.nn.ConvTranspose2d(i,o,h, stride=stride, dilation=dilation, bias=False)
|
||||
m1.weight.data = weights
|
||||
m2.weight.data = torch.Tensor(weights)
|
||||
x = jt.array(img)
|
||||
# out1 = m1(x)
|
||||
out1 = jt.nn.conv_transpose2d(x, m1.weight, stride=stride, dilation=dilation, bias=False)
|
||||
mask = jt.random(out1.shape)
|
||||
out1 = out1*mask
|
||||
tx = torch.Tensor(img)
|
||||
tx.requires_grad = True
|
||||
out2 = m2(tx) * torch.Tensor(mask.data)
|
||||
with jt.log_capture_scope(log_silent=1,
|
||||
log_vprefix="var_re=0,conv=0,op.cc=100") as logs:
|
||||
assert np.allclose(out1.data, out2.data)
|
||||
dx, dw = jt.grad(out1, [x, m1.weight])
|
||||
jt.sync([dx, dw])
|
||||
out2.sum().backward()
|
||||
assert np.allclose(dw.data, m2.weight.grad.numpy(), 1e-3)
|
||||
assert np.allclose(dx.data, tx.grad.numpy())
|
||||
assert len(find_log_with_re(logs, "conv")) == 3
|
||||
check((4, 5, 10, 10), (5, 6, 3, 3))
|
||||
check((4, 5, 10, 10), (5, 6, 3, 3), 2)
|
||||
check((4, 5, 100, 100), (5, 6, 4, 4), 2)
|
||||
check((4, 5, 100, 100), (5, 6, 4, 4), 3)
|
||||
check((4, 5, 100, 100), (5, 6, 5, 5), 1, 2)
|
||||
check((4, 5, 100, 100), (5, 6, 5, 5), 2, 2)
|
||||
check((4, 5, 100, 100), (5, 6, 5, 5), 2, 3)
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
|
@ -30,7 +30,7 @@ class TestCuttTransposeOp(unittest.TestCase):
|
|||
for perm in perms:
|
||||
with jt.log_capture_scope(
|
||||
log_silent=1,
|
||||
log_v=0, log_vprefix="op.cc=100"
|
||||
log_v=0, log_vprefix="cutt=100"
|
||||
) as raw_log:
|
||||
if perm:
|
||||
x = np.transpose(a, perm)
|
||||
|
@ -39,7 +39,7 @@ class TestCuttTransposeOp(unittest.TestCase):
|
|||
x = np.transpose(a)
|
||||
y = jt.transpose(a).data
|
||||
self.assertEqual(x.shape, y.shape)
|
||||
logs = find_log_with_re(raw_log, "(Jit op key (not )?found: " + "cutt_transpose" + ".*)")
|
||||
logs = find_log_with_re(raw_log, "(Run cutt_transpose with key.*)")
|
||||
if perm is None:
|
||||
continue
|
||||
last = -1
|
||||
|
@ -53,7 +53,7 @@ class TestCuttTransposeOp(unittest.TestCase):
|
|||
last = perm[i]
|
||||
if not in_order:
|
||||
assert len(logs)==1
|
||||
assert (x==y).all(), f"\n{x}\n{y}"
|
||||
assert (x==y).all(), f"\n{x}\n{y}\n{perm}\n{a.shape}"
|
||||
|
||||
ia = [gen_data([5, 7]), gen_data([2,2,2]), gen_data([2,3,4,5]), gen_data([5,3]), gen_data([3,1,5,3,1])]
|
||||
for a in ia: check(a)
|
||||
|
|
|
@ -0,0 +1,88 @@
|
|||
# ***************************************************************
|
||||
# Copyright (c) 2020 Jittor. Authors:
|
||||
# Guoye Yang <498731903@qq.com>
|
||||
# Dun Liang <randonlang@gmail.com>.
|
||||
# All Rights Reserved.
|
||||
# This file is subject to the terms and conditions defined in
|
||||
# file 'LICENSE.txt', which is part of this source code package.
|
||||
# ***************************************************************
|
||||
import unittest
|
||||
import jittor as jt
|
||||
import numpy as np
|
||||
import jittor.models as jtmodels
|
||||
|
||||
def load_parameters(m1, m2):
|
||||
m1.save('/tmp/temp.pk')
|
||||
m2.load('/tmp/temp.pk')
|
||||
|
||||
def compare_parameters(m1, m2):
|
||||
ps1 = m1.parameters()
|
||||
ps2 = m2.parameters()
|
||||
for i in range(len(ps1)):
|
||||
x = ps1[i].data + 1e-8
|
||||
y = ps2[i].data + 1e-8
|
||||
relative_error = abs(x - y) / abs(y)
|
||||
diff = relative_error.mean()
|
||||
assert diff < 1e-4, (diff, 'backward', ps2[i].name(), ps1[i].mean(), ps1[i].std(), ps2[i].mean(), ps2[i].std())
|
||||
|
||||
class TestDepthwiseConv(unittest.TestCase):
|
||||
@unittest.skipIf(not jt.has_cuda, "Cuda not found")
|
||||
@jt.flag_scope(use_cuda=1)
|
||||
def test_data(self):
|
||||
test_img = np.random.random((64,3,224,224)).astype('float32')
|
||||
jittor_test_img = jt.array(test_img)
|
||||
lr = 100
|
||||
|
||||
jittor_model = jtmodels.__dict__['mobilenet_v2']()
|
||||
jittor_model2 = jtmodels.__dict__['mobilenet_v2']()
|
||||
# Set eval to avoid dropout layer & bn errors
|
||||
jittor_model.train()
|
||||
jittor_model.classifier[0].eval()
|
||||
for m in jittor_model.modules():
|
||||
if isinstance(m, jt.nn.BatchNorm):
|
||||
m.eval()
|
||||
|
||||
jittor_model2.train()
|
||||
jittor_model2.classifier[0].eval()
|
||||
for m in jittor_model2.modules():
|
||||
if isinstance(m, jt.nn.BatchNorm):
|
||||
m.eval()
|
||||
|
||||
load_parameters(jittor_model2, jittor_model)
|
||||
for m in jittor_model.modules():
|
||||
if isinstance(m, jt.nn.Conv):
|
||||
m.is_depthwise_conv = False
|
||||
cnt = 0
|
||||
for m in jittor_model2.modules():
|
||||
if isinstance(m, jt.nn.Conv):
|
||||
if (m.is_depthwise_conv):
|
||||
cnt += 1
|
||||
assert cnt == 17, (cnt, '!=', 17)
|
||||
jt_optimizer = jt.nn.SGD(jittor_model.parameters(), lr = lr)
|
||||
jt_optimizer2 = jt.nn.SGD(jittor_model2.parameters(), lr = lr)
|
||||
|
||||
jittor_result = jittor_model(jittor_test_img)
|
||||
mask = jt.random(jittor_result.shape, jittor_result.dtype)
|
||||
loss = jittor_result * mask
|
||||
jt_optimizer.step(loss)
|
||||
jt.sync_all(True)
|
||||
|
||||
jittor_result2 = jittor_model2(jittor_test_img)
|
||||
loss = jittor_result2 * mask
|
||||
|
||||
x = jittor_result2.data + 1e-8
|
||||
y = jittor_result.data + 1e-8
|
||||
relative_error = abs(x - y) / abs(y)
|
||||
diff = relative_error.mean()
|
||||
assert diff < 1e-4, (diff, 'forward')
|
||||
|
||||
jt_optimizer2.step(loss)
|
||||
jt.sync_all(True)
|
||||
compare_parameters(jittor_model, jittor_model2)
|
||||
|
||||
|
||||
jt.clean()
|
||||
jt.gc()
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
|
@ -26,6 +26,19 @@ class TestFunction(unittest.TestCase):
|
|||
da = jt.grad(b, a)
|
||||
assert da.data == -1
|
||||
|
||||
def test_apply(self):
|
||||
class MyFunc(Function):
|
||||
def execute(self, x):
|
||||
return x+1
|
||||
|
||||
def grad(self, grad):
|
||||
return grad-2
|
||||
a = jt.ones(1)
|
||||
func = MyFunc.apply
|
||||
b = func(a)
|
||||
da = jt.grad(b, a)
|
||||
assert da.data == -1
|
||||
|
||||
def test2(self):
|
||||
class MyFunc(Function):
|
||||
def execute(self, x):
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
# ***************************************************************
|
||||
# Copyright (c) 2020 Jittor. Authors: Dun Liang <randonlang@gmail.com>. All Rights Reserved.
|
||||
# This file is subject to the terms and conditions defined in
|
||||
# file 'LICENSE.txt', which is part of this source code package.
|
||||
# ***************************************************************
|
||||
import unittest
|
||||
import jittor as jt
|
||||
import numpy as np
|
||||
|
||||
class TestMergeLoopVarPass(unittest.TestCase):
|
||||
def test(self):
|
||||
a = jt.ones([10,10,10,10])
|
||||
a.sync()
|
||||
with jt.profile_scope() as rep:
|
||||
b = a.sum([2,3])
|
||||
b.sync()
|
||||
with open(rep[1][1]) as f:
|
||||
src = f.read()
|
||||
assert "range01" in src
|
||||
assert "range23" in src
|
||||
|
||||
def test2(self):
|
||||
a = jt.ones([10,10,10,10])
|
||||
a.sync()
|
||||
with jt.profile_scope() as rep:
|
||||
b = a + 1
|
||||
b.sync()
|
||||
with open(rep[1][1]) as f:
|
||||
src = f.read()
|
||||
assert "range0123" in src
|
||||
|
||||
def test3(self):
|
||||
a = jt.ones([10,10,10,10])
|
||||
x = jt.ones([1,10,1,1])
|
||||
a.sync(), x.sync()
|
||||
with jt.profile_scope() as rep:
|
||||
b = a + x
|
||||
b.sync()
|
||||
with open(rep[1][1]) as f:
|
||||
src = f.read()
|
||||
assert "range23" in src
|
||||
|
||||
def test4(self):
|
||||
# don't optimize reindex like op yet
|
||||
a = jt.ones([10,10,10,10])
|
||||
a.sync()
|
||||
with jt.profile_scope() as rep:
|
||||
b = a.reindex_reduce("add", [10,10], ["i0","i1"])
|
||||
b.sync()
|
||||
with open(rep[1][1]) as f:
|
||||
src = f.read()
|
||||
assert "range23" not in src
|
||||
|
||||
def test5(self):
|
||||
a = jt.ones([10,10,10,10])
|
||||
a.sync()
|
||||
with jt.profile_scope() as rep:
|
||||
b = a.sum([1])
|
||||
b.sync()
|
||||
with open(rep[1][1]) as f:
|
||||
src = f.read()
|
||||
assert "range01" not in src
|
||||
assert "range23" in src
|
||||
|
||||
@unittest.skipIf(not jt.compiler.has_cuda, "No CUDA found")
|
||||
class TestMergeLoopVarPassCuda(TestMergeLoopVarPass):
|
||||
def setUp(self):
|
||||
jt.flags.use_cuda = 1
|
||||
def tearDown(self):
|
||||
jt.flags.use_cuda = 0
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
|
@ -54,6 +54,7 @@ class TestPad(unittest.TestCase):
|
|||
check_equal(torch.Tensor(arr).flip(1), jt.array(arr).flip(1))
|
||||
check_equal(torch.Tensor(arr).flip(2), jt.array(arr).flip(2))
|
||||
check_equal(torch.Tensor(arr).flip(3), jt.array(arr).flip(3))
|
||||
check_equal(torch.Tensor(arr).flip([2,3]), jt.array(arr).flip([2,3]))
|
||||
print('pass flip test ...')
|
||||
|
||||
def test_cross(self):
|
||||
|
@ -85,6 +86,10 @@ class TestPad(unittest.TestCase):
|
|||
check_equal(torchvision.utils.make_grid(torch.Tensor(arr), nrow=3, normalize=True, padding=4, pad_value=-1), jt.make_grid(jt.array(arr), nrow=3, normalize=True, padding=4, pad_value=-1))
|
||||
print('pass make_grid test ...')
|
||||
|
||||
def test_save_image(self):
|
||||
arr = jt.array(np.random.randn(16,3,10,10))
|
||||
jt.save_image(arr, "/tmp/a.jpg")
|
||||
|
||||
def test_unbind(self):
|
||||
arr = np.random.randn(2,3,4)
|
||||
for dim in range(len(arr.shape)):
|
||||
|
|
|
@ -120,7 +120,7 @@ class TestMklConvOp(unittest.TestCase):
|
|||
|
||||
with jt.flag_scope(
|
||||
enable_tuner=0,
|
||||
compile_options={"test_mkl_conv":1}
|
||||
# compile_options={"test_mkl_conv":1}
|
||||
):
|
||||
c_jt = conv(a_jt, b_jt, 1, 1) * da
|
||||
gs=jt.grad(c_jt,[a_jt,b_jt])
|
||||
|
@ -166,7 +166,7 @@ class TestMklConvOp(unittest.TestCase):
|
|||
|
||||
with jt.flag_scope(
|
||||
enable_tuner=0,
|
||||
compile_options={"test_mkl_conv":1}
|
||||
# compile_options={"test_mkl_conv":1}
|
||||
):
|
||||
c_jt = conv_nhwc_hwio(a_jt, b_jt, 1, 1) * da
|
||||
gs=jt.grad(c_jt,[a_jt,b_jt])
|
||||
|
|
|
@ -120,7 +120,7 @@ class TestOpCompiler(unittest.TestCase):
|
|||
OP1
|
||||
1+2
|
||||
std::max(T(1), T(2))
|
||||
((1)+T(2)*(T(1)/T(count)))''')
|
||||
((1)+T(2)*(T(rcount)))''')
|
||||
expect_error(lambda: jit_precompile(vars, "@{a"))
|
||||
expect_error(lambda: jit_precompile(vars, "@for(a"))
|
||||
expect_error(lambda: jit_precompile(vars, "@for(i,l,r)"))
|
||||
|
|
|
@ -97,7 +97,7 @@ class TestParallelPass3(unittest.TestCase):
|
|||
def check(ndim, depth, tdim):
|
||||
a = jt.random([16]*ndim)
|
||||
a.sync()
|
||||
compile_options = {"parallel":1}
|
||||
compile_options = {"parallel":1, "merge_loop_var": self.merge_loop_var}
|
||||
if depth is not None:
|
||||
compile_options["max_parallel_depth"] = depth
|
||||
with jt.profile_scope(compile_options=compile_options) as rep:
|
||||
|
@ -110,6 +110,7 @@ class TestParallelPass3(unittest.TestCase):
|
|||
for i in range(tdim):
|
||||
assert f"tnum{i}" in src
|
||||
assert f"tnum{tdim}" not in src
|
||||
self.merge_loop_var = 0
|
||||
check(1, None, 0)
|
||||
check(2, None, 1)
|
||||
check(3, None, 2)
|
||||
|
@ -134,7 +135,7 @@ class TestParallelPass3(unittest.TestCase):
|
|||
a = jt.random(shape)
|
||||
a.sync()
|
||||
config = {
|
||||
"parallel":1, "max_parallel_depth":depth
|
||||
"parallel":1, "max_parallel_depth":depth, "merge_loop_var": self.merge_loop_var
|
||||
}
|
||||
for k in args:
|
||||
config[k] = args[k]
|
||||
|
@ -164,6 +165,7 @@ class TestParallelPass3(unittest.TestCase):
|
|||
assert np.allclose(a.data.sum(rdim), b), (b.sum(), a.data.sum())
|
||||
|
||||
def test_reduce(self):
|
||||
self.merge_loop_var = 0
|
||||
check = lambda *a, **kw: self.reduce_check(*a, **kw)
|
||||
check(1, 2, 1, 0, 1)
|
||||
check(2, 1, 1, 1, 0)
|
||||
|
@ -185,6 +187,29 @@ class TestParallelPass3(unittest.TestCase):
|
|||
check(4, 2, 2, [2,3], 0)
|
||||
check(4, 2, 2, [0,3], 1)
|
||||
|
||||
def test_reduce_with_merge_loop_var(self):
|
||||
self.merge_loop_var = 1
|
||||
check = lambda *a, **kw: self.reduce_check(*a, **kw)
|
||||
check(1, 2, 1, 0, 1)
|
||||
check(2, 1, 1, 1, 0)
|
||||
check(2, 1, 1, 0, 1)
|
||||
check(2, 1, 1, 0, 1, [0,0])
|
||||
check(2, 1, 1, 0, 0, [0,1])
|
||||
check(2, 1, 1, 0, 0, [0,1], [0,64])
|
||||
check(2, 1, 1, [0,1], 1, [0,1])
|
||||
check(3, 1, 1, [1,2], 0)
|
||||
check(3, 1, 1, [0,1], 1)
|
||||
check(3, 1, 1, [0,1], 0, [0,0,2])
|
||||
check(3, 2, 1, [2], 0)
|
||||
if jt.flags.use_cuda:
|
||||
# loop is not merged so parallel depth 2
|
||||
check(3, 2, 2, [1], 1)
|
||||
else:
|
||||
check(3, 2, 1, [1], 0)
|
||||
check(3, 2, 2, [1], 1, merge=0)
|
||||
check(4, 2, 1, [2,3], 0)
|
||||
check(4, 2, 2, [0,3], 1)
|
||||
|
||||
@unittest.skipIf(not jt.compiler.has_cuda, "No CUDA found")
|
||||
def test_reduce_cuda(self):
|
||||
with jt.flag_scope(use_cuda=1):
|
||||
|
|
|
@ -96,7 +96,7 @@ class TestResnet(unittest.TestCase):
|
|||
-jt.flags.stat_allocator_total_free_byte
|
||||
# assert mem_used < 4e9, mem_used
|
||||
# TODO: why bigger?
|
||||
assert mem_used < 5.5e9, mem_used
|
||||
assert mem_used < 5.6e9, mem_used
|
||||
# example log:
|
||||
# Train Epoch: 0 [0/100 (0%)] Loss: 2.352903 Acc: 0.110000
|
||||
# Train Epoch: 0 [1/100 (1%)] Loss: 2.840830 Acc: 0.080000
|
||||
|
@ -115,9 +115,9 @@ class TestResnet(unittest.TestCase):
|
|||
# Train Epoch: 0 [50/100 (50%)] Loss: 2.055014 Acc: 0.290000
|
||||
|
||||
if jt.in_mpi:
|
||||
assert jt.core.number_of_lived_vars() < 7500, jt.core.number_of_lived_vars()
|
||||
assert jt.core.number_of_lived_vars() < 7800, jt.core.number_of_lived_vars()
|
||||
else:
|
||||
assert jt.core.number_of_lived_vars() < 6500, jt.core.number_of_lived_vars()
|
||||
assert jt.core.number_of_lived_vars() < 6700, jt.core.number_of_lived_vars()
|
||||
|
||||
jt.sync_all(True)
|
||||
assert np.mean(loss_list[-50:])<0.5
|
||||
|
|
|
@ -43,7 +43,7 @@ class TestWhereOp(unittest.TestCase):
|
|||
x = a.reindex_var(self.where(a>0.1))
|
||||
x = x.reindex_var(self.where(x<0.9))
|
||||
na = a.data
|
||||
assert (na[np.logical_and(na>0.1, na<0.9)]==x.data).all()
|
||||
assert np.allclose(na[np.logical_and(na>0.1, na<0.9)], x.data)
|
||||
|
||||
def test_reduce_dep(self):
|
||||
a = jt.random([100,100])
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
from flask import Flask
|
||||
from flask import request
|
||||
from flask import jsonify
|
||||
app = Flask(__name__)
|
||||
import json
|
||||
|
||||
from jittor.utils.pytorch_converter import convert
|
||||
|
||||
@app.route('/', methods=["GET", "POST"])
|
||||
def hello():
|
||||
msg = request
|
||||
data = msg.data.decode("utf-8")
|
||||
try:
|
||||
data = json.loads(data)
|
||||
src = data["src"]
|
||||
pjmap = json.loads(data["pjmap"])
|
||||
jt_src = convert(src, pjmap)
|
||||
except Exception as e:
|
||||
jt_src = str(e)
|
||||
response = jsonify(jt_src=jt_src)
|
||||
|
||||
# Enable Access-Control-Allow-Origin
|
||||
response.headers.add("Access-Control-Allow-Origin", "*")
|
||||
return response
|
||||
|
||||
if __name__ == '__main__':
|
||||
app.run(host="0.0.0.0")
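For reference, a minimal client sketch for the converter service added above — a hedged example, assuming the server is reachable at http://localhost:5000 and that the `requests` package is installed; the payload simply mirrors the `src` and `pjmap` fields the handler reads.

```python
# Minimal client sketch for the converter service above (assumptions: the
# server listens on localhost:5000 and `requests` is installed).
import json
import requests

torch_src = "import torch.nn as nn\nlayer = nn.ReLU()"
payload = json.dumps({
    "src": torch_src,         # PyTorch source to convert
    "pjmap": json.dumps({}),  # extra mapping rules, JSON-encoded as the handler expects
})
resp = requests.post("http://localhost:5000/", data=payload)
print(resp.json()["jt_src"])  # converted Jittor source, or the error text
```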
|
|
@ -38,7 +38,7 @@ def download_url_to_local(url, filename, root_folder, md5):
|
|||
ensure_dir(root_folder)
|
||||
file_path = os.path.join(root_folder, filename)
|
||||
if check_file_exist(file_path, md5):
|
||||
print("Data file has been downloaded and verified")
|
||||
return
|
||||
else:
|
||||
try:
|
||||
print('Downloading ' + url + ' to ' + file_path)
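As a usage note for the download helper patched above, a hypothetical call is sketched below; the import path, URL, file name and md5 are placeholders rather than values taken from the repository.

```python
# Hypothetical call to the download helper above; the import path, URL,
# file name and md5 are placeholders for illustration only.
from jittor.dataset.utils import download_url_to_local

download_url_to_local(
    url="https://example.com/data/mnist.tgz",
    filename="mnist.tgz",
    root_folder="/tmp/jittor_data",
    md5="d41d8cd98f00b204e9800998ecf8427e",
)
```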
|
||||
|
|
|
@ -22,49 +22,36 @@ from jittor.compiler import run_cmd
|
|||
from jittor_utils import translator
|
||||
import sys
|
||||
|
||||
jittor_path = os.path.realpath(os.path.join(jt.flags.jittor_path, "..", ".."))
|
||||
|
||||
polish_path = os.path.join(jittor_path, "..", "jittor-polish")
|
||||
polish_path = os.path.realpath(polish_path)
|
||||
build_path = polish_path + "/build"
|
||||
LOG.i("Polish path:", polish_path)
|
||||
if not os.path.isdir(polish_path):
|
||||
# create jittor-polish repo
|
||||
os.mkdir(polish_path)
|
||||
jittor_path = jt.flags.jittor_path
|
||||
root_path = os.path.realpath(os.path.join(jt.flags.jittor_path, "..", ".."))
|
||||
data_path = os.path.join(jittor_path, "src", "__data__")
|
||||
build_path = os.path.join(data_path, "build")
|
||||
if not os.path.isdir(build_path):
|
||||
os.mkdir(build_path)
|
||||
run_cmd("git init . && git remote add origin git@github.com:Jittor/Jittor.git", polish_path)
|
||||
status = run_cmd("git status", data_path)
|
||||
print(status)
|
||||
if "working tree clean" not in status:
|
||||
LOG.f("__data__ has untracked files")
|
||||
|
||||
# copy jittor src into it
|
||||
names = "extern notebook python script src README.md README.src.md README.cn.md LICENSE.txt setup.py .gitignore".split()
|
||||
for name in names:
|
||||
run_cmd(f"rsync -a {jittor_path}/{name} {polish_path}/")
|
||||
|
||||
git_version = run_cmd("git rev-parse HEAD", jittor_path)
|
||||
git_version = run_cmd("git rev-parse HEAD", data_path)
|
||||
LOG.i("git_version", git_version)
|
||||
run_cmd(f"git rev-parse HEAD > {polish_path}/python/jittor/version", jittor_path)
|
||||
|
||||
run_cmd(f"git rev-parse HEAD > {jittor_path}/version", data_path)
|
||||
|
||||
# remove files
|
||||
files = jt.compiler.files
|
||||
file_to_delete = [ name for name in files
|
||||
if name.startswith("src") and \
|
||||
len(name.split("/"))==2 and name.endswith("node.cc")
|
||||
data_files = [ name for name in files
|
||||
if "__data__" in name
|
||||
]
|
||||
LOG.i("file_to_delete", file_to_delete)
|
||||
run_cmd(f"rm {' '.join(file_to_delete)}", polish_path)
|
||||
LOG.i("data_files", data_files)
|
||||
|
||||
# commit jittor-polish
|
||||
run_cmd(f"git add .", polish_path)
|
||||
status = run_cmd(f"git status", polish_path)
|
||||
if "new file" not in status:
|
||||
LOG.i("Nothing change, exit...")
|
||||
else:
|
||||
run_cmd(f"git commit -a -m 'version {git_version}'", polish_path)
|
||||
|
||||
# compile delete files
|
||||
# compile data files
|
||||
from pathlib import Path
|
||||
home = str(Path.home())
|
||||
for cc_type in ["g++", "clang"]:
|
||||
for device in ["cpu", "cuda"]:
|
||||
# for cc_type in ["g++", "clang"]:
|
||||
# for device in ["cpu", "cuda"]:
|
||||
for cc_type in ["g++"]:
|
||||
for device in ["cpu"]:
|
||||
key = f"{git_version}-{cc_type}-{device}"
|
||||
env = f"cache_name=build/{cc_type}/{device} cc_path="
|
||||
cname = "g++" if cc_type=="g++" else "clang-8"
|
||||
|
@ -84,7 +71,7 @@ for cc_type in ["g++", "clang"]:
|
|||
|
||||
obj_path = home + f"/.cache/jittor/build/{cc_type}/{device}/{cname}/obj_files"
|
||||
obj_files = []
|
||||
for name in file_to_delete:
|
||||
for name in data_files:
|
||||
name = name.split("/")[-1]
|
||||
fname = f"{obj_path}/{name}.o"
|
||||
assert os.path.isfile(fname), fname
|
||||
|
@ -94,14 +81,17 @@ for cc_type in ["g++", "clang"]:
|
|||
# compress source
|
||||
# tar -cvzf build/jittor.tgz . --exclude build --exclude .git --exclude .ipynb_checkpoints --exclude __pycache__
|
||||
# mkdir -p jittor && tar -xvf ./jittor.tgz -C jittor
|
||||
assert os.system(f"cd {polish_path} && tar --exclude=build --exclude=.git --exclude=.ipynb_checkpoints --exclude=__pycache__ -cvzf build/jittor.tgz . ")==0
|
||||
assert os.system(f"cd {root_path} && tar --exclude=build --exclude=.git --exclude=.ipynb_checkpoints --exclude=__pycache__ --exclude=__data__ --exclude=my --exclude=dist --exclude=.vscode --exclude=.github -cvzf {build_path}/jittor.tgz * ")==0
|
||||
|
||||
# rsync to build-server
|
||||
jittor_web_base_dir = "Documents/jittor-blog/assets/"
|
||||
jittor_web_build_dir = jittor_web_base_dir + "build/"
|
||||
assert os.system(f"rsync -avPu {polish_path}/build/ jittor-web:{jittor_web_build_dir}")==0
|
||||
jittor_web_build_dir = jittor_web_base_dir
|
||||
assert os.system(f"rsync -avPu {build_path} jittor-web:{jittor_web_build_dir}")==0
|
||||
assert os.system(f"ssh jittor-web Documents/jittor-blog.git/hooks/post-update")==0
|
||||
|
||||
|
||||
# sys.exit(0)
|
||||
|
||||
# push to github
|
||||
# assert os.system(f"cd {polish_path} && git push -f origin master")==0
|
||||
|
||||
|
|
|
@ -179,6 +179,18 @@ pjmap = {
|
|||
'links': {},
|
||||
'extras': {'affine': 'None'},
|
||||
},
|
||||
'Parameter':{
|
||||
'pytorch': {
|
||||
'args': "data,require_grad=True"
|
||||
},
|
||||
'jittor': {
|
||||
'module': 'jt',
|
||||
'name': 'array',
|
||||
'args': 'data,dtype=None',
|
||||
},
|
||||
'links': {},
|
||||
'extras': {},
|
||||
},
|
||||
'Dropout2d': {
|
||||
'pytorch': {
|
||||
'args': 'p=0.5, inplace=False',
|
||||
|
@ -351,6 +363,32 @@ pjmap = {
|
|||
}
|
||||
}
|
||||
|
||||
unsupport_ops = [
|
||||
# ***************************************************************
|
||||
# torch.nn
|
||||
# ***************************************************************
|
||||
'ModuleDict', 'ParameterList', 'ParameterDict',
|
||||
'Conv1d', 'Conv3d', 'ConvTranspose1d', 'ConvTranspose3d', 'Unfold', 'Fold',
|
||||
'MaxPool1d', 'MaxPool3d', 'MaxUnpool1d', 'MaxUnpool2d', 'MaxUnpool3d', 'AvgPool1d',
|
||||
'AvgPool3d', 'FractionalMaxPool2d', 'LPPool1d', 'LPPool2d', 'AdaptiveMaxPool1d',
|
||||
'AdaptiveMaxPool2d', 'AdaptiveMaxPool3d', 'AdaptiveAvgPool1d', 'AdaptiveAvgPool3d',
|
||||
'ReflectionPad1d', 'ReplicationPad1d', 'ReplicationPad3d', 'ConstantPad1d', 'ConstantPad3d',
|
||||
'ELU', 'Hardshrink', 'Hardtanh', 'LogSigmoid', 'MultiheadAttention',
|
||||
'RReLU', 'SELU', 'CELU', 'GELU', 'Softshrink', 'Softsign', 'Tanhshrink',
|
||||
'Threshold', 'Softmin', 'Softmax2d', 'LogSoftmax', 'AdaptiveLogSoftmaxWithLoss',
|
||||
'BatchNorm3d', 'SyncBatchNorm', 'InstanceNorm1d', 'InstanceNorm3d', 'LocalResponseNorm',
|
||||
'RNNBase', 'RNN', 'LSTM', 'GRU', 'RNNCell', 'LSTMCell', 'GRUCell', 'Transformer', 'TransformerEncoder',
|
||||
'TransformerDecoder', 'TransformerEncoderLayer', 'TransformerDecoderLayer', 'Identity', 'Bilinear',
|
||||
'Dropout3d', 'AlphaDropout', 'EmbeddingBag', 'CosineSimilarity', 'PairwiseDistance', 'CTCLoss', 'NLLLoss', 'PoissonNLLLoss', 'KLDivLoss', 'BCEWithLogitsLoss',
|
||||
'MarginRankingLoss', 'HingeEmbeddingLoss', 'MultiLabelMarginLoss', 'SmoothL1Loss', 'SoftMarginLoss',
|
||||
'MultiLabelSoftMarginLoss', 'CosineEmbeddingLoss', 'MultiMarginLoss', 'TripletMarginLoss', 'UpsamplingNearest2d',
|
||||
'UpsamplingBilinear2d', 'DataParallel', 'DistributedDataParallel', 'clip_grad_norm_', 'clip_grad_value_',
|
||||
'parameters_to_vector', 'vector_to_parameters', 'BasePruningMethod', 'PruningContainer', 'Identity',
|
||||
'RandomUnstructured', 'L1Unstructured', 'RandomStructured', 'LnStructured', 'CustomFromMask', 'identity',
|
||||
'random_unstructured', 'l1_unstructured', 'random_structured', 'ln_structured', 'global_unstructured',
|
||||
'custom_from_mask', 'remove', 'is_pruned', 'weight_norm', 'remove_weight_norm', 'spectral_norm',
|
||||
'remove_spectral_norm', 'PackedSequence', 'pack_padded_sequence', 'pad_packed_sequence', 'pad_sequence', 'pack_sequence'
|
||||
]
|
||||
|
||||
def pjmap_append(pytorch_func_name, pytorch_args, jittor_func_module, jittor_func_name, jittor_args, extras=None, links=None, delete=None):
|
||||
''' Add a mapping to pjmap for converting a new function. Example: convert AvgPool2d to Pool
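The hunk above is cut off mid-docstring; as a sketch of how `pjmap_append` might be used per the signature shown, a hypothetical AvgPool2d-to-Pool registration follows (the argument strings are illustrative, not taken from the repository).

```python
# Hypothetical registration following the pjmap_append signature above;
# all argument strings here are illustrative.
from jittor.utils.pytorch_converter import pjmap_append

pjmap_append(
    pytorch_func_name="AvgPool2d",
    pytorch_args="kernel_size, stride=None, padding=0",
    jittor_func_module="nn",
    jittor_func_name="Pool",
    jittor_args="kernel_size, stride=None, padding=0, op='mean'",
    extras={"op": "'mean'"},
)
```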
|
||||
|
@ -393,58 +431,268 @@ def pjmap_append(pytorch_func_name, pytorch_args, jittor_func_module, jittor_fun
|
|||
'delete': delete,
|
||||
}
|
||||
|
||||
unsupport_ops = [
|
||||
# ***************************************************************
|
||||
# torch.nn
|
||||
# ***************************************************************
|
||||
'Parameter', 'ModuleDict', 'ParameterList', 'ParameterDict',
|
||||
'Conv1d', 'Conv3d', 'ConvTranspose1d', 'ConvTranspose3d', 'Unfold', 'Fold',
|
||||
'MaxPool1d', 'MaxPool3d', 'MaxUnpool1d', 'MaxUnpool2d', 'MaxUnpool3d', 'AvgPool1d',
|
||||
'AvgPool3d', 'FractionalMaxPool2d', 'LPPool1d', 'LPPool2d', 'AdaptiveMaxPool1d',
|
||||
'AdaptiveMaxPool2d', 'AdaptiveMaxPool3d', 'AdaptiveAvgPool1d', 'AdaptiveAvgPool3d',
|
||||
'ReflectionPad1d', 'ReplicationPad1d', 'ReplicationPad3d', 'ConstantPad1d', 'ConstantPad3d',
|
||||
'ELU', 'Hardshrink', 'Hardtanh', 'LogSigmoid', 'MultiheadAttention',
|
||||
'RReLU', 'SELU', 'CELU', 'GELU', 'Softshrink', 'Softsign', 'Tanhshrink',
|
||||
'Threshold', 'Softmin', 'Softmax2d', 'LogSoftmax', 'AdaptiveLogSoftmaxWithLoss',
|
||||
'BatchNorm3d', 'SyncBatchNorm', 'InstanceNorm1d', 'InstanceNorm3d', 'LocalResponseNorm',
|
||||
'RNNBase', 'RNN', 'LSTM', 'GRU', 'RNNCell', 'LSTMCell', 'GRUCell', 'Transformer', 'TransformerEncoder',
|
||||
'TransformerDecoder', 'TransformerEncoderLayer', 'TransformerDecoderLayer', 'Identity', 'Bilinear',
|
||||
'Dropout3d', 'AlphaDropout', 'EmbeddingBag', 'CosineSimilarity', 'PairwiseDistance', 'CTCLoss', 'NLLLoss', 'PoissonNLLLoss', 'KLDivLoss', 'BCEWithLogitsLoss',
|
||||
'MarginRankingLoss', 'HingeEmbeddingLoss', 'MultiLabelMarginLoss', 'SmoothL1Loss', 'SoftMarginLoss',
|
||||
'MultiLabelSoftMarginLoss', 'CosineEmbeddingLoss', 'MultiMarginLoss', 'TripletMarginLoss', 'UpsamplingNearest2d',
|
||||
'UpsamplingBilinear2d', 'DataParallel', 'DistributedDataParallel', 'clip_grad_norm_', 'clip_grad_value_',
|
||||
'parameters_to_vector', 'vector_to_parameters', 'BasePruningMethod', 'PruningContainer', 'Identity',
|
||||
'RandomUnstructured', 'L1Unstructured', 'RandomStructured', 'LnStructured', 'CustomFromMask', 'identity',
|
||||
'random_unstructured', 'l1_unstructured', 'random_structured', 'ln_structured', 'global_unstructured',
|
||||
'custom_from_mask', 'remove', 'is_pruned', 'weight_norm', 'remove_weight_norm', 'spectral_norm',
|
||||
'remove_spectral_norm', 'PackedSequence', 'pack_padded_sequence', 'pad_packed_sequence', 'pad_sequence', 'pack_sequence'
|
||||
]
|
||||
|
||||
support_ops = {}
|
||||
for key in pjmap.keys():
|
||||
module = pjmap[key]['jittor']['module']
|
||||
name = pjmap[key]['jittor']['name']
|
||||
if module == 'nn':
|
||||
support_ops[key] = name
|
||||
def raise_unsupport(name, ori_src):
|
||||
ret = f"raise RuntimeError('''original source: <{ori_src.strip()}>, {name} is not supported in Jittor yet. We will appreciate it if you provide an implementation of {name} and make pull request at https://github.com/Jittor/jittor.''')"
|
||||
print(ret+'\n')
|
||||
ret = ast.parse(ret).body[0]
|
||||
return ret
|
||||
|
||||
def raise_unsupport(name):
|
||||
raise RuntimeError(f'{name} is not supported in Jittor yet. We will appreciate it if you provide an implementation of {name} and make pull request at https://github.com/Jittor/jittor.')
|
||||
class Converter:
|
||||
def __init__(self, ex_pjmap):
|
||||
import copy
|
||||
self.pjmap = copy.deepcopy(pjmap)
|
||||
if ex_pjmap:
|
||||
self.pjmap.update(ex_pjmap)
|
||||
self.unsupport_ops = set(unsupport_ops)
|
||||
support_ops = {}
|
||||
for key in self.pjmap.keys():
|
||||
module = self.pjmap[key]['jittor']['module']
|
||||
name = self.pjmap[key]['jittor']['name']
|
||||
if module == 'nn':
|
||||
support_ops[key] = name
|
||||
if key in self.unsupport_ops:
|
||||
self.unsupport_ops.remove(key)
|
||||
self.support_ops = support_ops
|
||||
self.import_flag = []
|
||||
|
||||
def replace(a):
|
||||
if hasattr(a, "attr") and a.attr in unsupport_ops:
|
||||
raise_unsupport(a.attr)
|
||||
|
||||
if hasattr(a, "id") and a.id in unsupport_ops:
|
||||
raise_unsupport(a.id)
|
||||
|
||||
if hasattr(a, "attr"):
|
||||
if a.attr in support_ops.keys(): a.attr = support_ops[a.attr]
|
||||
def replace(self, a):
|
||||
if hasattr(a, "attr") and a.attr in self.unsupport_ops:
|
||||
ori_src = astunparse.unparse(a)
|
||||
return raise_unsupport(a.attr, ori_src)
|
||||
|
||||
if hasattr(a, "id"):
|
||||
if a.id in support_ops.keys(): a.id = support_ops[a.id]
|
||||
if hasattr(a, "id") and a.id in self.unsupport_ops:
|
||||
ori_src = astunparse.unparse(a)
|
||||
return raise_unsupport(a.id, ori_src)
|
||||
|
||||
import_flag = []
|
||||
def convert(code):
|
||||
if hasattr(a, "attr"):
|
||||
if a.attr in self.support_ops.keys(): a.attr = self.support_ops[a.attr]
|
||||
|
||||
if hasattr(a, "id"):
|
||||
if a.id in self.support_ops.keys(): a.id = self.support_ops[a.id]
|
||||
|
||||
return None
|
||||
|
||||
def convert_(self, prefix, func_name, ags, kws, ori_src):
|
||||
info = self.pjmap[func_name]
|
||||
p_prefix = info['pytorch']['prefix'] if 'prefix' in info['pytorch'].keys() else None
|
||||
if p_prefix is not None and prefix in p_prefix:
|
||||
p_ags = info['pytorch']['args_prefix']
|
||||
j_ags = info['jittor']['args_prefix']
|
||||
else:
|
||||
p_ags = info['pytorch']['args']
|
||||
j_ags = info['jittor']['args']
|
||||
if 'delete' in info.keys():
|
||||
delete = info['delete']
|
||||
else:
|
||||
delete = None
|
||||
j_prefix = info['jittor']['prefix'] if 'prefix' in info['jittor'].keys() else None
|
||||
j_module = info['jittor']['module']
|
||||
j_name = info['jittor']['name']
|
||||
links = info['links']
|
||||
extras = info['extras']
|
||||
jj_ags = []
|
||||
jj_kws = {}
|
||||
pp_ags = []
|
||||
pp_kws = {}
|
||||
if j_ags == '' and p_ags == '':
|
||||
# no args in Pytorch and Jittor.
|
||||
if p_prefix is None:
|
||||
return f"{j_module}.{j_name}()"
|
||||
else:
|
||||
if prefix in p_prefix:
|
||||
return f"{j_prefix}.{j_name}()"
|
||||
else:
|
||||
return f"{prefix}.{j_name}()"
|
||||
else:
|
||||
j_ags = j_ags.replace(' ','').split(',')
|
||||
for j_ag in j_ags:
|
||||
if '=' in j_ag:
|
||||
k,v = j_ag.split('=')
|
||||
jj_kws[k] = v
|
||||
else:
|
||||
jj_ags.append(j_ag)
|
||||
p_ags = p_ags.replace(' ','').split(',')
|
||||
for p_ag in p_ags:
|
||||
if '=' in p_ag:
|
||||
k,v = p_ag.split('=')
|
||||
pp_kws[k] = v
|
||||
else:
|
||||
pp_ags.append(p_ag)
|
||||
if len(jj_ags) == 0 and len(pp_ags) != 0:
|
||||
return f"raise AttributeError('''origin source: <{ori_src.strip()}>, {func_name} in Jittor has no Attribute {pp_ags[0]}''')"
|
||||
# raise AttributeError(f"{func_name} in Jittor has no Attribute {pp_ags[0]}")
|
||||
if delete is not None:
|
||||
for d in delete:
|
||||
if d in pp_ags:
|
||||
jj_ags.append(d)
|
||||
if d in pp_kws.keys():
|
||||
jj_kws[d] = None
|
||||
if len(pp_ags) > len(ags) + len(kws):
|
||||
return f"raise RuntimeError('''origin source: <{ori_src.strip()}>, There are needed {len(pp_ags) + len(list(pp_kws.keys()))} args in Pytorch {func_name} function, but you only provide {len(ags) + len(kws)}''')"
|
||||
# raise RuntimeError(f'There are needed {len(pp_ags) + len(list(pp_kws.keys()))} args in Pytorch {func_name} function, but you only provide {len(ags) + len(kws)}')
|
||||
ags_ = []
|
||||
for i in range(len(pp_ags)):
|
||||
if i < len(ags):
|
||||
if '*' in pp_ags[i]:
|
||||
ags_.append('(' + ', '.join(ags[i:]) + ')')
|
||||
ags = ags_
|
||||
break
|
||||
else:
|
||||
ags_.append(ags[i])
|
||||
else:
|
||||
break
|
||||
if len(pp_ags) + len(list(pp_kws.keys())) < len(ags) + len(kws):
|
||||
return f"raise RuntimeError('''origin source: <{ori_src.strip()}>,There are only {len(pp_ags) + len(list(pp_kws.keys()))} args in Pytorch {func_name} function, but you provide {len(ags) + len(kws)}''')"
|
||||
# raise RuntimeError(f'There are only {len(pp_ags) + len(list(pp_kws.keys()))} args in Pytorch {func_name} function, but you provide {len(ags) + len(kws)}')
|
||||
j_ags_flag = np.zeros(len(jj_ags))
|
||||
j_ags_values = {}
|
||||
j_kws_values = {}
|
||||
for i,ag in enumerate(ags):
|
||||
if len(pp_ags) == 0:
|
||||
ag_name = list(pp_kws.keys())[i]
|
||||
elif i < len(pp_ags):
|
||||
ag_name = pp_ags[i]
|
||||
elif i >= len(pp_ags) and (i-len(pp_ags)) <= len(list(pp_kws.keys())):
|
||||
ag_name = list(pp_kws.keys())[i-len(pp_ags)]
|
||||
else:
|
||||
return f"raise RuntimeError('''origin source: <{ori_src.strip()}>,The args number is not matc{func_name} in Jittor has no Attribute {ag_name}''')"
|
||||
# raise RuntimeError(f'The args number is not matc{func_name} in Jittor has no Attribute {ag_name}')
|
||||
if ag_name in links.keys():
|
||||
ag_name = links[ag_name]
|
||||
if ag_name in jj_ags:
|
||||
j_ags_flag[jj_ags.index(ag_name)] = 1
|
||||
j_ags_values[str(jj_ags.index(ag_name))] = ag
|
||||
elif ag_name in jj_kws.keys():
|
||||
j_kws_values[ag_name] = ag
|
||||
else:
|
||||
return f"raise AttributeError('''origin source: <{ori_src.strip()}>, {func_name} in Jittor has no Attribute {ag_name}''')"
|
||||
# raise AttributeError(f'{func_name} in Jittor has no Attribute {ag_name}')
|
||||
for i,kw in enumerate(kws):
|
||||
kw_name, kw_value = kw.split('=')
|
||||
if kw_name in links.keys():
|
||||
kw_name = links[kw_name]
|
||||
if kw_name in jj_ags:
|
||||
j_ags_flag[jj_ags.index(kw_name)] = 1
|
||||
j_ags_values[str(jj_ags.index(kw_name))] = kw_value
|
||||
elif kw_name in jj_kws.keys():
|
||||
j_kws_values[kw_name] = kw_value
|
||||
else:
|
||||
return f"raise AttributeError('''origin source: <{ori_src.strip()}>, {func_name} in Jittor has no Attribute {kw_name}''')"
|
||||
# raise AttributeError(f'{func_name} in Jittor has no Attribute {kw_name}')
|
||||
len_jj_ags = len(jj_ags) if len(jj_ags) == 0 or jj_ags[0] != '' else 0
|
||||
if j_ags_flag.sum() < len_jj_ags:
|
||||
missing_args = []
|
||||
for i in range(len(jj_ags)):
|
||||
if j_ags_flag[i] == 0:
|
||||
missing_args.append(jj_ags[i])
|
||||
return f"raise AttributeError('''origin source: <{ori_src.strip()}>, the needed args of {func_name} in Jittor is {', '.join(jj_ags)}, so you need to give value of {', '.join(missing_args)}.''')"
|
||||
# raise AttributeError(f"the needed args of {func_name} in Jittor is {', '.join(jj_ags)}, so you need to give value of {', '.join(missing_args)}.")
|
||||
if extras:
|
||||
for k in extras.keys():
|
||||
if k in jj_ags:
|
||||
j_ags_values[str(jj_ags.index(k))] = extras[k]
|
||||
elif k in jj_kws.keys():
|
||||
j_kws_values[k] = extras[k]
|
||||
else:
|
||||
return f"raise AttributeError('''origin source: <{ori_src.strip()}>, there is not attribute named {k} in Jittor {func_name}, you should delete it in {func_name} extras.''')"
|
||||
# raise AttributeError(f"there is not attribute named {k} in Jittor {func_name}, you should delete it in {func_name} extras.")
|
||||
if delete is not None:
|
||||
for d in delete:
|
||||
if d in j_ags_values:
|
||||
del j_ags_values[d]
|
||||
if d in j_kws_values.keys():
|
||||
j_kws_values.pop(d)
|
||||
j_ags_ = [j_ags_values[str(i)] for i in range(len(list(j_ags_values.keys())))]
|
||||
j_kws_ = [key + "=" + j_kws_values[key] for key in j_kws_values.keys()]
|
||||
j_func = f"{j_module}.{j_name}({', '.join(j_ags_+j_kws_)})"
|
||||
if p_prefix is None:
|
||||
return f"{j_module}.{j_name}({', '.join(j_ags_+j_kws_)})"
|
||||
else:
|
||||
if prefix in p_prefix:
|
||||
return f"{j_prefix}.{j_name}({', '.join(j_ags_+j_kws_)})"
|
||||
else:
|
||||
return f"{prefix}.{j_name}({', '.join(j_ags_+j_kws_)})"
|
||||
return j_func
|
||||
|
||||
def dfs(self, a):
|
||||
if isinstance(a, ast.Import):
|
||||
if 'torch' in astunparse.unparse(a) and 'init' in astunparse.unparse(a):
|
||||
self.import_flag.append('init')
|
||||
return ast.parse('from jittor import init').body[0]
|
||||
if 'torch' in astunparse.unparse(a) and a.names[0].asname == 'nn':
|
||||
self.import_flag.append('nn')
|
||||
return ast.parse('from jittor import nn').body[0]
|
||||
if 'torch' in a.names[0].name:
|
||||
return 'delete'
|
||||
elif isinstance(a, ast.ImportFrom):
|
||||
if 'torch' in a.module:
|
||||
return 'delete'
|
||||
elif isinstance(a, ast.Call):
|
||||
for idx, ag in enumerate(a.args):
|
||||
ret = self.dfs(ag)
|
||||
if ret is not None:
|
||||
a.args[idx] = ret
|
||||
for idx, kw in enumerate(a.keywords):
|
||||
ret = self.dfs(kw)
|
||||
if ret is not None:
|
||||
a.keywords[idx] = ret
|
||||
ori_src = astunparse.unparse(a)
|
||||
func = astunparse.unparse(a.func).strip('\n').split('.')
|
||||
prefix = '.'.join(func[0:-1])
|
||||
func_name = func[-1]
|
||||
if func_name in self.unsupport_ops:
|
||||
ret = raise_unsupport(func_name, ori_src)
|
||||
return ret
|
||||
if func_name in self.pjmap:
|
||||
ags = [astunparse.unparse(ag).strip('\n') for ag in a.args]
|
||||
kws = [astunparse.unparse(kw).strip('\n') for kw in a.keywords]
|
||||
ret = self.convert_(prefix, func_name, ags, kws, ori_src)
|
||||
ret_tmp = ret
|
||||
ret = ast.parse(ret).body[0]
|
||||
if hasattr(ret,'value'):
|
||||
return ret.value
|
||||
else:
|
||||
print(ret_tmp+'\n')
|
||||
return ret
|
||||
if ".load_state_dict" in astunparse.unparse(a.func):
|
||||
a.func.attr = 'load_parameters'
|
||||
if astunparse.unparse(a.func).strip('\n').endswith(".size"):
|
||||
ags = [astunparse.unparse(ag).strip('\n') for ag in a.args]
|
||||
if len(ags) != 0:
|
||||
con = astunparse.unparse(a.func).split('.size')[0] + '.shape[' + ','.join(ags) + ']'
|
||||
else:
|
||||
con = astunparse.unparse(a.func).replace('size', 'shape')
|
||||
return ast.parse(con).body[0].value
|
||||
elif isinstance(a, ast.Expr): pass
|
||||
elif isinstance(a, ast.Attribute) or isinstance(a, ast.Name):
|
||||
ret = self.replace(a)
|
||||
if ret is not None:
|
||||
print(ret)
|
||||
return ret
|
||||
elif isinstance(a, ast.FunctionDef):
|
||||
if a.name == 'forward': a.name = 'execute'
|
||||
if hasattr(a, '__dict__'):
|
||||
for k in a.__dict__.keys():
|
||||
if isinstance(a.__dict__[k], list):
|
||||
delete_flag = []
|
||||
for i,a_ in enumerate(a.__dict__[k]):
|
||||
ret = self.dfs(a_)
|
||||
if ret == 'delete':
|
||||
delete_flag.append(True)
|
||||
continue
|
||||
if ret is not None:
|
||||
a.__dict__[k][i] = ret
|
||||
delete_flag.append(False)
|
||||
tmp = [a_ for i,a_ in enumerate(a.__dict__[k]) if delete_flag[i] == False]
|
||||
a.__dict__[k] = tmp
|
||||
else:
|
||||
ret = self.dfs(a.__dict__[k])
|
||||
if ret is not None:
|
||||
a.__dict__[k] = ret
|
||||
|
||||
|
||||
def convert(code, ex_pjmaps=None):
|
||||
''' Model code converter, example:
|
||||
|
||||
from jittor.utils.pytorch_converter import convert
|
||||
|
@ -469,209 +717,13 @@ def convert(code):
|
|||
model = Model()
|
||||
print("## Jittor model:", model)
|
||||
'''
|
||||
|
||||
a = ast.parse(code)
|
||||
dfs(a)
|
||||
converter = Converter(ex_pjmaps)
|
||||
converter.dfs(a)
|
||||
a.body.insert(0, ast.parse('import jittor as jt').body[0])
|
||||
if 'init' not in import_flag:
|
||||
if 'init' not in converter.import_flag:
|
||||
a.body.insert(1, ast.parse('from jittor import init').body[0])
|
||||
if 'nn' not in import_flag:
|
||||
if 'nn' not in converter.import_flag:
|
||||
a.body.insert(2, ast.parse('from jittor import nn').body[0])
|
||||
return astunparse.unparse(a)
|
||||
|
||||
def convert_(prefix, func_name, ags, kws):
|
||||
info = pjmap[func_name]
|
||||
p_prefix = info['pytorch']['prefix'] if 'prefix' in info['pytorch'].keys() else None
|
||||
if p_prefix is not None and prefix in p_prefix:
|
||||
p_ags = info['pytorch']['args_prefix']
|
||||
j_ags = info['jittor']['args_prefix']
|
||||
else:
|
||||
p_ags = info['pytorch']['args']
|
||||
j_ags = info['jittor']['args']
|
||||
if 'delete' in info.keys():
|
||||
delete = info['delete']
|
||||
else:
|
||||
delete = None
|
||||
j_prefix = info['jittor']['prefix'] if 'prefix' in info['jittor'].keys() else None
|
||||
j_module = info['jittor']['module']
|
||||
j_name = info['jittor']['name']
|
||||
links = info['links']
|
||||
extras = info['extras']
|
||||
jj_ags = []
|
||||
jj_kws = {}
|
||||
pp_ags = []
|
||||
pp_kws = {}
|
||||
if j_ags == '' and p_ags == '':
|
||||
# no args in Pytorch and Jittor.
|
||||
if p_prefix is None:
|
||||
return f"{j_module}.{j_name}()"
|
||||
else:
|
||||
if prefix in p_prefix:
|
||||
return f"{j_prefix}.{j_name}()"
|
||||
else:
|
||||
return f"{prefix}.{j_name}()"
|
||||
else:
|
||||
j_ags = j_ags.replace(' ','').split(',')
|
||||
for j_ag in j_ags:
|
||||
if '=' in j_ag:
|
||||
k,v = j_ag.split('=')
|
||||
jj_kws[k] = v
|
||||
else:
|
||||
jj_ags.append(j_ag)
|
||||
p_ags = p_ags.replace(' ','').split(',')
|
||||
for p_ag in p_ags:
|
||||
if '=' in p_ag:
|
||||
k,v = p_ag.split('=')
|
||||
pp_kws[k] = v
|
||||
else:
|
||||
pp_ags.append(p_ag)
|
||||
if len(jj_ags) == 0 and len(pp_ags) != 0:
|
||||
raise AttributeError(f"{func_name} in Jittor has no Attribute {pp_ags[0]}")
|
||||
if delete is not None:
|
||||
for d in delete:
|
||||
if d in pp_ags:
|
||||
jj_ags.append(d)
|
||||
if d in pp_kws.keys():
|
||||
jj_kws[d] = None
|
||||
if len(pp_ags) > len(ags) + len(kws):
|
||||
raise RuntimeError(f'There are needed {len(pp_ags) + len(list(pp_kws.keys()))} args in Pytorch {func_name} function, but you only provide {len(ags) + len(kws)}')
|
||||
ags_ = []
|
||||
for i in range(len(pp_ags)):
|
||||
if i < len(ags):
|
||||
if '*' in pp_ags[i]:
|
||||
ags_.append('(' + ', '.join(ags[i:]) + ')')
|
||||
ags = ags_
|
||||
break
|
||||
else:
|
||||
ags_.append(ags[i])
|
||||
else:
|
||||
break
|
||||
if len(pp_ags) + len(list(pp_kws.keys())) < len(ags) + len(kws):
|
||||
raise RuntimeError(f'There are only {len(pp_ags) + len(list(pp_kws.keys()))} args in Pytorch {func_name} function, but you provide {len(ags) + len(kws)}')
|
||||
j_ags_flag = np.zeros(len(jj_ags))
|
||||
j_ags_values = {}
|
||||
j_kws_values = {}
|
||||
for i,ag in enumerate(ags):
|
||||
if len(pp_ags) == 0:
|
||||
ag_name = list(pp_kws.keys())[i]
|
||||
elif i < len(pp_ags):
|
||||
ag_name = pp_ags[i]
|
||||
elif i >= len(pp_ags) and (i-len(pp_ags)) <= len(list(pp_kws.keys())):
|
||||
ag_name = list(pp_kws.keys())[i-len(pp_ags)]
|
||||
else:
|
||||
raise RuntimeError(f'The args number does not match: {func_name} in Jittor has no Attribute {ag_name}')
|
||||
if ag_name in links.keys():
|
||||
ag_name = links[ag_name]
|
||||
if ag_name in jj_ags:
|
||||
j_ags_flag[jj_ags.index(ag_name)] = 1
|
||||
j_ags_values[str(jj_ags.index(ag_name))] = ag
|
||||
elif ag_name in jj_kws.keys():
|
||||
j_kws_values[ag_name] = ag
|
||||
else:
|
||||
raise AttributeError(f'{func_name} in Jittor has no Attribute {ag_name}')
|
||||
for i,kw in enumerate(kws):
|
||||
kw_name, kw_value = kw.split('=')
|
||||
if kw_name in links.keys():
|
||||
kw_name = links[kw_name]
|
||||
if kw_name in jj_ags:
|
||||
j_ags_flag[jj_ags.index(kw_name)] = 1
|
||||
j_ags_values[str(jj_ags.index(kw_name))] = kw_value
|
||||
elif kw_name in jj_kws.keys():
|
||||
j_kws_values[kw_name] = kw_value
|
||||
else:
|
||||
raise AttributeError(f'{func_name} in Jittor has no Attribute {kw_name}')
|
||||
len_jj_ags = len(jj_ags) if len(jj_ags) == 0 or jj_ags[0] != '' else 0
|
||||
if j_ags_flag.sum() < len_jj_ags:
|
||||
missing_args = []
|
||||
for i in range(len(jj_ags)):
|
||||
if j_ags_flag[i] == 0:
|
||||
missing_args.append(jj_ags[i])
|
||||
raise AttributeError(f"the needed args of {func_name} in Jittor is {', '.join(jj_ags)}, so you need to give value of {', '.join(missing_args)}.")
|
||||
if extras:
|
||||
for k in extras.keys():
|
||||
if k in jj_ags:
|
||||
j_ags_values[str(jj_ags.index(k))] = extras[k]
|
||||
elif k in jj_kws.keys():
|
||||
j_kws_values[k] = extras[k]
|
||||
else:
|
||||
raise AttributeError(f"there is not attribute named {k} in Jittor {func_name}, you should delete it in {func_name} extras.")
|
||||
if delete is not None:
|
||||
for d in delete:
|
||||
if d in j_ags_values:
|
||||
j_ags_values.remove(d)
|
||||
if d in j_kws_values.keys():
|
||||
j_kws_values.pop(d)
|
||||
j_ags_ = [j_ags_values[str(i)] for i in range(len(list(j_ags_values.keys())))]
|
||||
j_kws_ = [key + "=" + j_kws_values[key] for key in j_kws_values.keys()]
|
||||
j_func = f"{j_module}.{j_name}({', '.join(j_ags_+j_kws_)})"
|
||||
if p_prefix is None:
|
||||
return f"{j_module}.{j_name}({', '.join(j_ags_+j_kws_)})"
|
||||
else:
|
||||
if prefix in p_prefix:
|
||||
return f"{j_prefix}.{j_name}({', '.join(j_ags_+j_kws_)})"
|
||||
else:
|
||||
return f"{prefix}.{j_name}({', '.join(j_ags_+j_kws_)})"
|
||||
return j_func
|
||||
|
||||
def dfs(a):
|
||||
if isinstance(a, ast.Import):
|
||||
if 'torch' in astunparse.unparse(a) and 'init' in astunparse.unparse(a):
|
||||
import_flag.append('init')
|
||||
return ast.parse('from jittor import init').body[0]
|
||||
if 'torch' in astunparse.unparse(a) and a.names[0].asname == 'nn':
|
||||
import_flag.append('nn')
|
||||
return ast.parse('from jittor import nn').body[0]
|
||||
if 'torch' in a.names[0].name:
|
||||
return 'delete'
|
||||
elif isinstance(a, ast.ImportFrom):
|
||||
if 'torch' in a.module:
|
||||
return 'delete'
|
||||
elif isinstance(a, ast.Call):
|
||||
for idx, ag in enumerate(a.args):
|
||||
ret = dfs(ag)
|
||||
if ret is not None:
|
||||
a.args[idx] = ret
|
||||
for idx, kw in enumerate(a.keywords):
|
||||
ret = dfs(kw)
|
||||
if ret is not None:
|
||||
a.keywords[idx] = ret
|
||||
func = astunparse.unparse(a.func).strip('\n').split('.')
|
||||
prefix = '.'.join(func[0:-1])
|
||||
func_name = func[-1]
|
||||
if func_name in unsupport_ops:
|
||||
raise_unsupport(func_name)
|
||||
if func_name in pjmap.keys():
|
||||
ags = [astunparse.unparse(ag).strip('\n') for ag in a.args]
|
||||
kws = [astunparse.unparse(kw).strip('\n') for kw in a.keywords]
|
||||
ret = convert_(prefix, func_name, ags, kws)
|
||||
return ast.parse(ret).body[0].value
|
||||
if ".load_state_dict" in astunparse.unparse(a.func):
|
||||
a.func.attr = 'load_parameters'
|
||||
if astunparse.unparse(a.func).strip('\n').endswith(".size"):
|
||||
ags = [astunparse.unparse(ag).strip('\n') for ag in a.args]
|
||||
if len(ags) != 0:
|
||||
con = astunparse.unparse(a.func).split('.size')[0] + '.shape[' + ','.join(ags) + ']'
|
||||
else:
|
||||
con = astunparse.unparse(a.func).replace('size', 'shape')
|
||||
return ast.parse(con).body[0].value
|
||||
elif isinstance(a, ast.Expr): pass
|
||||
elif isinstance(a, ast.Attribute) or isinstance(a, ast.Name): replace(a)
|
||||
elif isinstance(a, ast.FunctionDef):
|
||||
if a.name == 'forward': a.name = 'execute'
|
||||
if hasattr(a, '__dict__'):
|
||||
for k in a.__dict__.keys():
|
||||
if isinstance(a.__dict__[k], list):
|
||||
delete_flag = []
|
||||
for i,a_ in enumerate(a.__dict__[k]):
|
||||
ret = dfs(a_)
|
||||
if ret is 'delete':
|
||||
delete_flag.append(True)
|
||||
continue
|
||||
if ret is not None:
|
||||
a.__dict__[k][i] = ret
|
||||
delete_flag.append(False)
|
||||
tmp = [a_ for i,a_ in enumerate(a.__dict__[k]) if delete_flag[i] == False]
|
||||
a.__dict__[k] = tmp
|
||||
else:
|
||||
ret = dfs(a.__dict__[k])
|
||||
if ret is not None:
|
||||
a.__dict__[k] = ret
|
|
@ -1 +1 @@
|
|||
f9e290160bead0d5892754da56b9ad63bc316320
|
||||
84596508776983dce645fc4ef77c7f35700549d5
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
cat > /tmp/converter_server.dockerfile <<\EOF
|
||||
FROM jittor/jittor
|
||||
|
||||
RUN python3.7 -m pip install flask
|
||||
RUN apt update && apt install git -y
|
||||
EOF
|
||||
|
||||
docker build --tag jittor/converter_server -f /tmp/converter_server.dockerfile .
|
||||
|
||||
# docker run --rm -it -m 16g --cpus=8 -p 0.0.0.0:5000:5000 jittor/converter_server bash -c "python3.7 -m pip install -U git+https://github.com/Jittor/jittor.git && python3.7 -m jittor.utils.converter_server"
|
||||
while true; do
|
||||
timeout --foreground 24h docker run --rm -it -m 16g --cpus=8 -p 0.0.0.0:5000:5000 jittor/converter_server bash -c "python3.7 -m pip install -U git+https://github.com/Jittor/jittor.git && python3.7 -m jittor.utils.converter_server"
|
||||
sleep 10
|
||||
done
|
|
@ -7,7 +7,7 @@
|
|||
#include <functional>
|
||||
#ifdef HAS_CUDA
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
#include "mem/allocator/cuda_dual_allocator.h"
|
||||
#include "event_queue.h"
|
||||
#endif
|
||||
|
@ -41,29 +41,46 @@ void load_fused_op(FusedOp& fused_op, vector<int>& fuse_ops, vector<Op*>& ops, i
|
|||
op->tflag = ntt;
|
||||
fused_op.ops.push_back(op);
|
||||
}
|
||||
for (Op* op : fused_op.ops) {
|
||||
uint fid1 = op->custom_data;
|
||||
uint oid = 0;
|
||||
for (Var* v : op->outputs()) {
|
||||
oid++;
|
||||
if (v->tflag != tt) {
|
||||
// this var node does not belong to the current execution
|
||||
// this will happen in a multi-output fuseable op
|
||||
// v->custom_data = 0 represents this var cannot be fused
|
||||
v->custom_data = 0;
|
||||
continue;
|
||||
}
|
||||
for (auto o : v->outputs_with_index()) {
|
||||
Op* op2 = o.op;
|
||||
uint iid = o.index;
|
||||
if (op2->tflag != ntt) continue;
|
||||
uint fid2 = op2->custom_data;
|
||||
fused_op.edges.emplace_back(fid1, oid-1, fid2, iid);
|
||||
}
|
||||
}
|
||||
}
|
||||
LOGvvv << "Prepare fused_op" << fused_op.ops;
|
||||
fused_op.update_ops();
|
||||
for (Op* op : fused_op.ops) {
|
||||
uint fid1 = op->custom_data;
|
||||
int iid = 0;
|
||||
for (Var* v : op->inputs()) {
|
||||
iid++;
|
||||
int iop_id;
|
||||
int iv_id;
|
||||
if (v->_inputs.size() && v->input()->tflag == ntt) {
|
||||
auto e = v->_inputs.front();
|
||||
iop_id = e.node->custom_data;
|
||||
iv_id = e.back->index;
|
||||
} else {
|
||||
iv_id = v->custom_data >> 2;
|
||||
// add iv_id, prevent iv_id jit key overflow
|
||||
iop_id = fused_op.ops.size() + iv_id;
|
||||
}
|
||||
fused_op.edges.emplace_back(iop_id, iv_id, fid1, iid-1);
|
||||
}
|
||||
// TODO: can we remove this?
|
||||
// uint oid = 0;
|
||||
// for (Var* v : op->outputs()) {
|
||||
// oid++;
|
||||
// if (v->tflag != tt) {
|
||||
// // this var node not belong to current execution
|
||||
// // this will happend in multiple outputs fuseable op
|
||||
// // v->custom_data = 0 represents this var cannot be fused
|
||||
// v->custom_data = 0;
|
||||
// continue;
|
||||
// }
|
||||
// // for (auto o : v->outputs_with_index()) {
|
||||
// // Op* op2 = o.op;
|
||||
// // uint iid = o.index;
|
||||
// // if (op2->tflag != ntt) continue;
|
||||
// // uint fid2 = op2->custom_data;
|
||||
// // fused_op.edges.emplace_back(fid1, oid-1, fid2, iid);
|
||||
// // }
|
||||
// }
|
||||
}
|
||||
}
|
||||
|
||||
void Executor::run_sync(vector<Var*> vars, bool device_sync) {
|
||||
|
@ -429,6 +446,10 @@ void Executor::run_sync(vector<Var*> vars, bool device_sync) {
|
|||
// record trace data
|
||||
if (PREDICT_BRANCH_NOT_TAKEN(trace_py_var==2)) {
|
||||
trace_data.record_execution(op, is_fused_op, jkl);
|
||||
#ifdef HAS_CUDA
|
||||
if (use_cuda)
|
||||
checkCudaErrors(cudaDeviceSynchronize());
|
||||
#endif
|
||||
}
|
||||
LOGvvv << "Finished Op(" >> op->name() << rid >>
|
||||
"/" >> queue.size() >> ") output:" << op->outputs();
|
||||
|
|
|
@ -7,10 +7,6 @@
|
|||
#pragma once
|
||||
#include "common.h"
|
||||
#include "mem/allocator.h"
|
||||
#ifdef HAS_CUDA
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#endif
|
||||
|
||||
namespace jittor {
|
||||
|
||||
|
|
196
src/fuser.cc
|
@ -1,196 +0,0 @@
|
|||
// ***************************************************************
|
||||
// Copyright (c) 2020 Jittor. Authors:
|
||||
// Guowei Yang <471184555@qq.com>
|
||||
// Dun Liang <randonlang@gmail.com>.
|
||||
// All Rights Reserved.
|
||||
// This file is subject to the terms and conditions defined in
|
||||
// file 'LICENSE.txt', which is part of this source code package.
|
||||
// ***************************************************************
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include "fuser.h"
|
||||
#include "var.h"
|
||||
#include "op.h"
|
||||
#include "mem/allocator.h"
|
||||
#include "graph.h"
|
||||
#include "fused_op.h"
|
||||
|
||||
namespace jittor {
|
||||
|
||||
#define PREVENT_LARGE_FUSED_OP 16
|
||||
|
||||
void count_fuse(int64_t tt, int start_var_num, const vector<Op*>& ops, const vector<Var*>& vars, vector<int> &father, vector<int> &var_fused) {
|
||||
vector<int> dis(ops.size(), -1);
|
||||
|
||||
auto find_fa = [&](int i) -> int {
|
||||
int j=i;
|
||||
while (father[j] != j) j = father[j];
|
||||
while (i != j) {
|
||||
int tmp = father[i];
|
||||
father[i] = j;
|
||||
i = tmp;
|
||||
}
|
||||
return j;
|
||||
};
|
||||
|
||||
auto can_fuse = [&](Var* v, Op* op1, Op* op2, int fuse_type) -> bool {
|
||||
if (v->flags.get(NodeFlags::_stop_fuse))
|
||||
return false;
|
||||
if (fuse_type == 1) {
|
||||
// if v is output, do not fuse
|
||||
if (v->custom_data < start_var_num)
|
||||
return false;
|
||||
// op2 ---> v ---> op1
|
||||
if (op1->type() == OpType::other || op2->type() == OpType::other)
|
||||
return false;
|
||||
if (v->flags.get(NodeFlags::_force_fuse))
|
||||
return true;
|
||||
// Do not fuse op after reduce(has reduce)
|
||||
// TODO: better fuse strategy
|
||||
if (op2->type() == OpType::reduce)
|
||||
return false;
|
||||
// Do not fuse op before broadcast
|
||||
// TODO: better fuse strategy
|
||||
if (op1->type() == OpType::broadcast)
|
||||
return false;
|
||||
return op2->type() == OpType::element ||
|
||||
op2->type() == OpType::broadcast;
|
||||
} else if (fuse_type == 0) {
|
||||
#ifdef PREVENT_LARGE_FUSED_OP
|
||||
// This statement prevent fuse large ops
|
||||
if (v->outputs().size()>=PREVENT_LARGE_FUSED_OP) return false;
|
||||
#endif
|
||||
|
||||
// v ---> op1
|
||||
// |
|
||||
// +----> op2 ( prev of op1 )
|
||||
if (op1->type() == OpType::other || op2->type() == OpType::other)
|
||||
return false;
|
||||
// Do not fuse op after reduce(has reduce)
|
||||
// TODO: better fuse strategy
|
||||
if (op2->type() == OpType::broadcast || op1->type() == OpType::broadcast)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
auto for_each_edge = [&](Op* op, int forward, auto&& func){
|
||||
auto e=op->_inputs.begin();
|
||||
for (Var* v : op->inputs()) {
|
||||
if ((forward && (*e).back!=std::prev(v->_outputs.end())) ||
|
||||
(!forward && (*e).back!=v->_outputs.begin())){
|
||||
Op* next_op = forward ? std::next((*e).back)->node->op() : std::prev((*e).back)->node->op();
|
||||
if (next_op && next_op->tflag==tt
|
||||
&& next_op->custom_data != op->custom_data
|
||||
&& can_fuse(v, next_op, op, 0))
|
||||
func(v, next_op, 0);
|
||||
}
|
||||
e = std::next(e);
|
||||
}
|
||||
|
||||
if (forward) {
|
||||
for (Var* sv : op->outputs())
|
||||
if (sv && sv->tflag == tt)
|
||||
for (Op* next_op: sv->outputs())
|
||||
if (next_op && next_op->tflag==tt) func(sv, next_op, 1);
|
||||
} else {
|
||||
for (Var* sv : op->inputs())
|
||||
if (sv && sv->tflag == tt) func(sv, sv->input(), 1);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
vector<int> queue;
|
||||
vector<int> deps;
|
||||
deps.reserve(ops.size());
|
||||
queue.reserve(ops.size());
|
||||
for (uint i=0; i<ops.size(); i++) {
|
||||
deps.push_back(0);
|
||||
Op* op = ops[i];
|
||||
|
||||
for_each_edge(op, 1, [&](Var* v, Op* next_op, int real_edge) {
|
||||
deps[i]++;
|
||||
});
|
||||
|
||||
if (!deps[i]) {
|
||||
queue.push_back(i);
|
||||
dis[i]=0;
|
||||
}
|
||||
}
|
||||
|
||||
uint head=0;
|
||||
while (head<queue.size()) {
|
||||
int op_id=queue[head++];
|
||||
Op* op = ops[op_id];
|
||||
|
||||
for_each_edge(op, 1, [&](Var* v, Op* next_op, int real_edge) {
|
||||
int next_id = next_op->custom_data;
|
||||
if (dis[next_id] == dis[op_id]){
|
||||
int next_fa = find_fa(next_id);
|
||||
father[next_fa] = op_id;
|
||||
}
|
||||
});
|
||||
|
||||
for_each_edge(op, 0, [&](Var* v, Op* next_op, int real_edge) {
|
||||
int next_id = next_op->custom_data;
|
||||
int lon=0;
|
||||
if (real_edge && !can_fuse(v, op, next_op, 1)) lon=1;
|
||||
if (dis[op_id]+lon>dis[next_id])
|
||||
dis[next_id]=dis[op_id]+lon;
|
||||
if (!--deps[next_id]) queue.push_back(next_id);
|
||||
});
|
||||
}
|
||||
|
||||
if (V_ON(1000)) {
|
||||
for (uint i=0; i<ops.size(); i++)
|
||||
LOGvvvv << ops[i] << dis[i] << deps[i];
|
||||
}
|
||||
for (uint i=0; i<vars.size(); i++) {
|
||||
Var* v = vars[i];
|
||||
if (!v || v->tflag!=tt) {
|
||||
var_fused[i]=1;
|
||||
continue;
|
||||
}
|
||||
// sf: input op's father id
|
||||
int sf = -1;
|
||||
// vf: whether the input op can be fused with all of its output ops
|
||||
int vf = 1;
|
||||
// all outputs are reduce
|
||||
int all_reduce = 1;
|
||||
Op* iop = v->input();
|
||||
// if (iop && iop->tflag==tt)
|
||||
sf = find_fa(iop->custom_data);
|
||||
|
||||
for (Op* sop : v->outputs())
|
||||
if (sop->tflag==tt) {
|
||||
if (vf && !can_fuse(v,sop,iop,1))
|
||||
vf = 0;
|
||||
if (sop->type()!=OpType::reduce)
|
||||
all_reduce = 0;
|
||||
// in two different fused op
|
||||
if (find_fa(sop->custom_data)!=sf) {
|
||||
var_fused[i]=1;
|
||||
}
|
||||
}
|
||||
if (vf==0)
|
||||
// cannot fused
|
||||
var_fused[i]=1;
|
||||
else if (var_fused[i]) {
|
||||
if (iop->type()==OpType::broadcast ||
|
||||
all_reduce ||
|
||||
v->flags.get(NodeFlags::_force_fuse))
|
||||
// strong fused
|
||||
var_fused[i] = 3;
|
||||
else
|
||||
// weak fused
|
||||
var_fused[i] = 2;
|
||||
// var_fused[i] = 3;
|
||||
}
|
||||
}
|
||||
// output vars can not be fused
|
||||
for (int i=0; i<start_var_num; i++)
|
||||
var_fused[i] = 1;
|
||||
}
|
||||
|
||||
} // jittor
|
|
@ -177,7 +177,8 @@ vector<VarPtr> grad(Var* loss, vector<Var*> targets) {
|
|||
Var* dout = grads[id];
|
||||
trace_grad_op = op;
|
||||
VarPtr dvar = make_grad(op, out, dout, var, index);
|
||||
if (dvar && dvar->num>=0 && var->num)
|
||||
if (dvar && dvar->num>=0 && var->num>0)
|
||||
// var->num == 0 represents an any-match var
|
||||
ASSERT(dvar->num==var->num && dvar->shape.size()==var->shape.size())
|
||||
<< "dvar" << dvar << "var" << var;
|
||||
if (!grad)
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
// ***************************************************************
|
||||
#ifdef HAS_CUDA
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
#endif
|
||||
#include <random>
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
// ***************************************************************
|
||||
#ifdef HAS_CUDA
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
#include "mem/allocator/cuda_device_allocator.h"
|
||||
|
||||
namespace jittor {
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
#include <mutex>
|
||||
#include <cstring>
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
#include "misc/cuda_flags.h"
|
||||
#include "var.h"
|
||||
#include "mem/allocator.h"
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
// ***************************************************************
|
||||
#ifdef HAS_CUDA
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
#include "mem/allocator/cuda_host_allocator.h"
|
||||
|
||||
namespace jittor {
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
// ***************************************************************
|
||||
#ifdef HAS_CUDA
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
#include "mem/allocator/cuda_managed_allocator.h"
|
||||
|
||||
namespace jittor {
|
||||
|
|
|
@ -280,4 +280,20 @@ inline bool operator!=(const NanoVector& a, const NanoVector& b) {
|
|||
return ne(a, b);
|
||||
}
|
||||
|
||||
inline bool operator<(const NanoVector& a, const NanoVector& b) {
|
||||
return a.data < b.data || (a.data == b.data && a.offset < b.offset);
|
||||
}
|
||||
|
||||
} // jittor
|
||||
|
||||
|
||||
namespace std {
|
||||
template<> struct hash<jittor::NanoVector> {
|
||||
inline std::size_t operator()(jittor::NanoVector const& s) const noexcept {
|
||||
std::size_t h1 = std::hash<jittor::int64>{}(s.data);
|
||||
std::size_t h2 = std::hash<jittor::int64>{}(s.offset);
|
||||
return h1 ^ (h2 << 1);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
@ -92,7 +92,7 @@ JIT_TEST(ring_buffer_benchmark) {
|
|||
LOGi << tt << tt*1.0/n;
|
||||
LOGi << s << (n*(n-1)/2);
|
||||
ASSERTop(s,==,(n*(n-1)/2));
|
||||
ASSERTop(tt*1.0/n,<=,50);
|
||||
ASSERTop(tt*1.0/n,<=,100);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -88,6 +88,8 @@ struct RingBuffer {
|
|||
static RingBuffer* make_ring_buffer(uint64 size, bool multiprocess);
|
||||
static void free_ring_buffer(RingBuffer* rb);
|
||||
|
||||
inline void clear() { l = r = is_stop = 0; }
|
||||
|
||||
inline void wait() {
|
||||
if (is_stop) {
|
||||
throw std::runtime_error("stop");
|
||||
|
|
|
@ -17,6 +17,7 @@ struct StackVector {
|
|||
inline T& front() { return a[0]; }
|
||||
inline T& back() { return a[n-1]; }
|
||||
inline int size() { return n;}
|
||||
inline T* data() { return a;}
|
||||
inline StackVector(int n=0) : n(n) {}
|
||||
|
||||
struct Iter {
|
||||
|
|
|
@ -287,8 +287,12 @@ std::ostream& operator<<(std::ostream& os, const Op* op) {
|
|||
os << ')';
|
||||
#ifdef NODE_MEMCHECK
|
||||
os << '<' << op->__id() << '>';
|
||||
print_node_trace(op, os);
|
||||
#endif
|
||||
if (trace_py_var) {
|
||||
os << '{';
|
||||
print_node_trace(op, os);
|
||||
os << '}';
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
// ***************************************************************
|
||||
#ifdef HAS_CUDA
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
#include "mem/allocator.h"
|
||||
#include "mem/allocator/cuda_dual_allocator.h"
|
||||
#include "event_queue.h"
|
||||
|
@ -75,8 +75,19 @@ ArrayOp::ArrayOp(ArrayArgs&& args) {
|
|||
}
|
||||
|
||||
void ArrayOp::jit_prepare(JK& jk) {
|
||||
if (output->flags.get(NodeFlags::_force_fuse))
|
||||
if (output->flags.get(NodeFlags::_force_fuse)) {
|
||||
jk << _CS("[T:") << output->dtype() << ']';
|
||||
|
||||
// fill or find cbuffer for const var pass
|
||||
if (output->dtype().dsize() == 4) {
|
||||
auto x = abs(ptr<int32>()[0]);
|
||||
auto y = abs(ptr<float32>()[0]);
|
||||
auto z = ptr<uint32>()[0];
|
||||
if ((x<=2) || (y==1.0f || y==2.0f))
|
||||
jk << _CS("[o:") << z << ']';
|
||||
}
|
||||
// end of fill cbuffer
|
||||
}
|
||||
}
|
||||
|
||||
void ArrayOp::run() {
|
||||
|
|
|
@ -93,6 +93,9 @@ VarPtr dirty_clone_broadcast(Var* v) {
|
|||
if (op && !v->is_finished() && v->shape.size() > 4 && op->type() == OpType::broadcast) {
|
||||
auto vp = op->duplicate();
|
||||
if (vp) {
|
||||
// TODO: loop options should be set to op, rather than var
|
||||
if (v->loop_options)
|
||||
vp->loop_options = v->loop_options;
|
||||
return vp;
|
||||
}
|
||||
}
|
||||
|
@ -126,8 +129,8 @@ VarPtr BinaryOp::grad(Var* out, Var* dout, Var* v, int v_index) {
|
|||
}
|
||||
if (ns == ns_maximum || ns == ns_minimum) {
|
||||
auto zeros = make_number(0, dout);
|
||||
auto cond = make_binary(x, y, ns_greater_equal);
|
||||
if ((ns == ns_maximum) == (v_index==0))
|
||||
auto cond = make_binary(y, z, ns_equal);
|
||||
if (v_index==1)
|
||||
return make_ternary(cond, dout, zeros);
|
||||
else
|
||||
return make_ternary(cond, zeros, dout);
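A quick way to exercise this maximum/minimum gradient path from Python is sketched below, assuming the usual `jt.array`, `jt.maximum` and `jt.grad` APIs; it is a sanity check, not part of the commit.

```python
# Sanity-check sketch for the maximum/minimum gradient path touched above;
# assumes standard jittor APIs (jt.array, jt.maximum, jt.grad).
import jittor as jt

a = jt.array([1.0, 5.0, 2.0])
b = jt.array([3.0, 2.0, 2.0])
loss = jt.maximum(a, b).sum()
ga, gb = jt.grad(loss, [a, b])
print(ga.numpy(), gb.numpy())  # gradient flows to the side selected as the maximum
```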
|
||||
|
|
|
@ -38,7 +38,7 @@ namespace jittor {
|
|||
#define bitwise_and(T,a,b) ((a)&(b))
|
||||
#define bitwise_or(T,a,b) ((a)|(b))
|
||||
#define bitwise_xor(T,a,b) ((a)^(b))
|
||||
#define mean(T,a,b) ((a)+T(b)*(T(1)/T(count)))
|
||||
#define mean(T,a,b) ((a)+T(b)*(T(rcount)))
|
||||
|
||||
#ifdef JIT_cuda
|
||||
#define init_maximum(T) ::numeric_min<T>()
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
// ***************************************************************
|
||||
#ifdef HAS_CUDA
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
#endif
|
||||
#include <algorithm>
|
||||
#include "var.h"
|
||||
|
|
|
@ -10,7 +10,8 @@
|
|||
#include "ops/copy_op.h"
|
||||
#ifdef HAS_CUDA
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
#include "misc/cuda_flags.h"
|
||||
#endif
|
||||
|
||||
namespace jittor {
|
||||
|
@ -36,14 +37,14 @@ void CopyOp::run() {
|
|||
auto size = x->size;
|
||||
auto x_ptr = x->mem_ptr;
|
||||
auto y_ptr = outputs().front()->mem_ptr;
|
||||
if (flags.get(NodeFlags::_cpu)) {
|
||||
#ifdef HAS_CUDA
|
||||
if (flags.get(NodeFlags::_cuda)) {
|
||||
checkCudaErrors(cudaMemcpyAsync(y_ptr, x_ptr, size, cudaMemcpyDefault, 0));
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
std::memcpy(y_ptr, x_ptr, size);
|
||||
}
|
||||
#ifdef HAS_CUDA
|
||||
else {
|
||||
checkCudaErrors(cudaMemcpyAsync(y_ptr, x_ptr, size, cudaMemcpyDefault, 0));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
// ***************************************************************
|
||||
#ifdef HAS_CUDA
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
#include <mutex>
|
||||
#include "misc/cuda_flags.h"
|
||||
#include "mem/allocator/sfrl_allocator.h"
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
#include "ops/op_register.h"
|
||||
#ifdef JIT_cuda
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
#endif
|
||||
#ifndef JIT
|
||||
#include "misc/stack_vector.h"
|
||||
|
|
|
@ -72,9 +72,9 @@ void IndexOp::jit_run() {
|
|||
@for(d, 0, XDIM, for (index_t i@d=0; i@d < x0shape@d; i@d++)) {
|
||||
auto xid = @for(d, 0, XDIM, + i@d * x0stride@d);
|
||||
@if(DIM==XDIM,
|
||||
@for(i,0,XDIM, x@i@@p[xid] = i@i;)
|
||||
@for(i,0,XDIM, T x@i@@id = i@i; x@i@@p[xid] = x@i@@id;)
|
||||
,
|
||||
x0p[xid] = i@DIM;
|
||||
T x@DIM@@id = i@DIM; x0p[xid] = x@DIM@@id;
|
||||
)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -34,9 +34,9 @@ unordered_set<string> reduce_ops = {
|
|||
"add",
|
||||
// @pybind(prod, product, reduce_multiply)
|
||||
"multiply",
|
||||
// @pybind(reduce_logical_and, all)
|
||||
// @pybind(reduce_logical_and, all_)
|
||||
"logical_and",
|
||||
// @pybind(reduce_logical_or, any)
|
||||
// @pybind(reduce_logical_or, any_)
|
||||
"logical_or",
|
||||
"logical_xor",
|
||||
"bitwise_and",
|
||||
|
@ -65,7 +65,8 @@ ReduceOp::ReduceOp(Var* x, NanoString op, NanoVector dims, bool keepdims)
|
|||
reduce_mask |= 1<<dim;
|
||||
}
|
||||
}
|
||||
if (x->dtype() == ns_bool && ns == ns_add)
|
||||
// if (x->dtype() == ns_bool && ns == ns_add)
|
||||
if (x->dtype() == ns_bool)
|
||||
y = create_output(nullptr, ns_int32);
|
||||
else
|
||||
y = create_output(nullptr, binary_dtype_infer(ns, x, x));
|
||||
|
@ -157,6 +158,7 @@ void ReduceOp::jit_run() {
|
|||
index_t xstride@{DIM-1} = 1;
|
||||
@for(i, DIM-2, -1, -1, auto xstride@i = xstride@{i+1} * xshape@{i+1};)
|
||||
Ty count = Ty(x->num) / Ty(y->num);
|
||||
Ty rcount = Ty(y->num) / Ty(x->num);
|
||||
@for(d, 0, DIM,@if(REDUCE>>d&1,, for (index_t xi@d=0; xi@d < xshape@d; xi@d++))) {
|
||||
auto yid = 0 @for(d, 0, DIM,@if(REDUCE>>d&1,, + xi@d * ystride@d));
|
||||
yp[yid] = @expand_macro(init_@OP, Ty);
|
||||
|
@ -169,7 +171,7 @@ void ReduceOp::jit_run() {
|
|||
yp[yid] = @expand_macro(@OP, Ty, yp[yid], xp[xid]);
|
||||
}
|
||||
}
|
||||
(void)count, (void)yshape0, (void)ystride0;
|
||||
(void)count, (void)rcount, (void)yshape0, (void)ystride0;
|
||||
}
|
||||
#endif // JIT
|
||||
|
||||
|
|
|
@ -12,7 +12,7 @@
|
|||
#include "ops/binary_op_defs.h"
|
||||
#ifdef JIT_cuda
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#include "helper_cuda.h"
|
||||
#endif
|
||||
#else
|
||||
#include "ops/op_register.h"
|
||||
|
@ -69,7 +69,7 @@ void SetitemOp::infer_shape() {
|
|||
for (int i=0; i<data_dim; i++) {
|
||||
int j = i - data_dim + out_shape.size();
|
||||
if (!(data_shape[i]==1 && out_shape[j]!=-1)) {
|
||||
CHECK(data_shape[i]<0 || data_shape[i]==out_shape[j])
|
||||
CHECK(data_shape[i]<0 || out_shape[j]<0 || data_shape[i]==out_shape[j])
|
||||
<< "Data shape not match" << data_shape << out_shape;
|
||||
bmask |= 1<<j;
|
||||
}
|
||||
|
|
|
@ -64,8 +64,11 @@ void TernaryOp::jit_run() {
|
|||
auto* __restrict__ yp = y->ptr<Ty>();
|
||||
auto* __restrict__ zp = z->ptr<Tz>();
|
||||
index_t num = z->num;
|
||||
for (index_t i=0; i<num; i++)
|
||||
zp[i] = condp[i] ? xp[i] : yp[i];
|
||||
for (index_t i=0; i<num; i++) {
|
||||
Tz xd_ = xp[i];
|
||||
Tz yd_ = yp[i];
|
||||
zp[i] = condp[i] ? xd_ : yd_;
|
||||
}
|
||||
}
|
||||
#endif // JIT
|
||||
|
||||
|
|
|
@ -40,38 +40,8 @@ TransposeOp::TransposeOp(Var* x, NanoVector axes_) : x(x), axes(axes_) {
|
|||
.get_constructor<VarPtr, Var*, NanoVector>();
|
||||
}
|
||||
if (cutt_transpose) {
|
||||
bool need_reshape = false;
|
||||
int dims = x->shape.size();
|
||||
vector<int64> in_axes;
|
||||
vector<int64> in_shape;
|
||||
vector<int64> out_shape;
|
||||
vector<int64> trans;
|
||||
int cnt = 0;
|
||||
for (int i = 0; i < dims; ++i) {
|
||||
if (x->shape[i] == 1) {
|
||||
need_reshape = true;
|
||||
trans.push_back(-1);
|
||||
} else {
|
||||
trans.push_back(cnt);
|
||||
cnt += 1;
|
||||
in_shape.push_back(x->shape[i]);
|
||||
}
|
||||
out_shape.push_back(x->shape[axes[i]]);
|
||||
}
|
||||
for (int i = 0; i < dims; ++i) {
|
||||
if (x->shape[axes[i]] != 1) {
|
||||
in_axes.push_back(trans[axes[i]]);
|
||||
}
|
||||
}
|
||||
if (need_reshape) {
|
||||
auto x1 = make_reshape(x, NanoVector(in_shape));
|
||||
auto x2 = cutt_transpose(x1, in_axes);
|
||||
auto x3 = make_reshape(x2, NanoVector(out_shape));
|
||||
forward(x3);
|
||||
} else {
|
||||
auto var = cutt_transpose(x, axes);
|
||||
forward(var);
|
||||
}
|
||||
auto var = cutt_transpose(x, axes);
|
||||
forward(var);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -10,6 +10,8 @@
|
|||
#ifdef JIT_cuda
|
||||
#include "executor.h"
|
||||
#include <assert.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include "helper_cuda.h"
|
||||
#endif
|
||||
|
||||
namespace jittor {
|
||||
|
|