Merge branch 'master' of https://github.com/Jittor/jittor into gword

This commit is contained in:
Dun Liang 2020-04-20 20:17:43 +08:00
commit d60b37bb07
32 changed files with 2084 additions and 1196 deletions

4
.gitignore vendored
View File

@ -18,4 +18,6 @@ build/
*.md
!*.src.md
!README.md
!README.cn.md
!README.cn.md
python/jittor.egg-info
dist/

View File

@ -21,6 +21,7 @@ with lock.lock_scope():
import contextlib
import numpy as np
from collections import OrderedDict
from collections.abc import Sequence, Mapping
import types
import pickle
import sys
@ -340,6 +341,37 @@ def detach(x):
return x.clone().stop_grad().clone()
Var.detach = detach
origin_reshape = reshape
def reshape(x, *shape):
if len(shape) == 1 and isinstance(shape[0], Sequence):
shape = shape[0]
return origin_reshape(x, shape)
reshape.__doc__ = origin_reshape.__doc__
Var.view = Var.reshape = view = reshape
origin_transpose = transpose
def transpose(x, *dim):
if len(dim) == 1 and isinstance(dim[0], Sequence):
dim = dim[0]
return origin_transpose(x, dim)
transpose.__doc__ = origin_transpose.__doc__
Var.transpose = Var.permute = permute = transpose
def flatten(input, start_dim=0, end_dim=-1):
'''flatten dimensions by reshape'''
in_shape = input.shape
start_dim = len(in_shape) + start_dim if start_dim < 0 else start_dim
end_dim = len(in_shape) + end_dim if end_dim < 0 else end_dim
assert end_dim > start_dim, "end_dim should be larger than start_dim for flatten function"
out_shape = []
for i in range(0,start_dim,1): out_shape.append(in_shape[i])
dims = 1
for i in range(start_dim, end_dim+1, 1): dims *= in_shape[i]
out_shape.append(dims)
for i in range(end_dim+1,len(in_shape),1): out_shape.append(in_shape[i])
return input.reshape(out_shape)
Var.flatten = flatten
def detach_inplace(x):
return x.swap(x.stop_grad().clone())
Var.start_grad = Var.detach_inplace = detach_inplace
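The new wrappers let reshape, transpose/permute and flatten accept either a single sequence or unpacked integers, matching the PyTorch-style calling convention. A minimal illustrative sketch of the intended usage:
import jittor as jt
x = jt.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])  # shape [2, 3]
a = x.reshape(3, 2)       # varargs form
b = x.view((3, 2))        # sequence form; view is an alias of reshape
c = x.permute(1, 0)       # alias of transpose, also accepts unpacked dims
d = x.flatten()           # start_dim=0, end_dim=-1 -> shape [6]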
@ -509,8 +541,9 @@ class Module:
def extra_repr(self):
ss = []
n = len(self.__init__.__code__.co_varnames) - \
len(self.__init__.__defaults__)
n = len(self.__init__.__code__.co_varnames)
if self.__init__.__defaults__ is not None:
n -= len(self.__init__.__defaults__)
for i, k in enumerate(self.__init__.__code__.co_varnames[1:]):
v = getattr(self, k) if hasattr(self, k) else None
if isinstance(v, Var): v = v.peek()
@ -537,7 +570,8 @@ class Module:
end = 1
break
if end ==1:
print(f'init {key} fail ...')
# print(f'init {key} fail ...')
pass
else:
# print(f'init {key} success ...')
if isinstance(params[key], np.ndarray) or isinstance(params[key], list):
@ -650,8 +684,9 @@ def jittor_exit():
core.sync_all(True)
atexit.register(jittor_exit)
Var.__repr__ = Var.__str__ = lambda x: str(x.data)
Var.peek = lambda x: str(x.dtype)+str(x.shape)
Var.__str__ = lambda x: str(x.data)
Var.__repr__ = lambda x: f"jt.Var:{x.dtype}{x.uncertain_shape}"
Var.peek = lambda x: f"{x.dtype}{x.shape}"
from . import nn
from .nn import matmul

View File

@ -1,2 +1,18 @@
from . import resnet
from . import vgg
from .resnet import *
from . import vgg
from .vgg import *
from . import alexnet
from .alexnet import *
from . import squeezenet
from .squeezenet import *
from . import inception
from .inception import *
from . import googlenet
from .googlenet import *
from . import mobilenet
from .mobilenet import *
from . import mnasnet
from .mnasnet import *
from . import shufflenetv2
from .shufflenetv2 import *
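With the wildcard re-exports above, every architecture's factory function becomes reachable directly from jittor.models, which is how the model test below looks them up. An illustrative sketch (input handling mirrors the test file):
import numpy as np
import jittor as jt
import jittor.models as jtmodels
model = jtmodels.resnet18()                     # or jtmodels.__dict__['resnet18']()
model.eval()                                    # disable dropout for a deterministic forward pass
x = jt.array(np.random.random((1, 3, 224, 224)).astype('float32'))
y = model(x)                                    # class logits, shape [1, 1000]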

View File

@ -0,0 +1,53 @@
# ***************************************************************
# Copyright (c) 2020 Jittor. Authors:
# Wenyang Zhou <576825820@qq.com>
# Dun Liang <randonlang@gmail.com>.
# All Rights Reserved.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
# This model is generated by pytorch converter.
import jittor as jt
import jittor.nn as nn
__all__ = ['AlexNet', 'alexnet']
class AlexNet(nn.Module):
def __init__(self, num_classes=1000):
super(AlexNet, self).__init__()
self.features = nn.Sequential(
nn.Conv(3, 64, kernel_size=11, stride=4, padding=2),
nn.Relu(),
nn.Pool(kernel_size=3, stride=2, op='maximum'),
nn.Conv(64, 192, kernel_size=5, padding=2),
nn.Relu(), nn.Pool(kernel_size=3, stride=2, op='maximum'),
nn.Conv(192, 384, kernel_size=3, padding=1),
nn.Relu(),
nn.Conv(384, 256, kernel_size=3, padding=1),
nn.Relu(),
nn.Conv(256, 256, kernel_size=3, padding=1),
nn.Relu(),
nn.Pool(kernel_size=3, stride=2, op='maximum')
)
self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
self.classifier = nn.Sequential(
nn.Dropout(),
nn.Linear(((256 * 6) * 6), 4096),
nn.Relu(),
nn.Dropout(),
nn.Linear(4096, 4096),
nn.Relu(),
nn.Linear(4096, num_classes)
)
def execute(self, x):
x = self.features(x)
x = self.avgpool(x)
x = jt.reshape(x, (x.shape[0], (- 1)))
x = self.classifier(x)
return x
def alexnet(**kwargs):
model = AlexNet(**kwargs)
return model

View File

@ -0,0 +1,143 @@
# ***************************************************************
# Copyright (c) 2020 Jittor. Authors:
# Wenyang Zhou <576825820@qq.com>
# Dun Liang <randonlang@gmail.com>.
# All Rights Reserved.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
# This model is generated by pytorch converter.
import jittor as jt
from jittor import nn
__all__ = ['GoogLeNet', 'googlenet']
def googlenet(**kwargs):
return GoogLeNet(**kwargs)
class GoogLeNet(nn.Module):
def __init__(self, num_classes=1000, aux_logits=True, init_weights=True, blocks=None):
super(GoogLeNet, self).__init__()
if (blocks is None):
blocks = [BasicConv2d, Inception, InceptionAux]
assert (len(blocks) == 3)
conv_block = blocks[0]
inception_block = blocks[1]
inception_aux_block = blocks[2]
self.aux_logits = aux_logits
self.conv1 = conv_block(3, 64, kernel_size=7, stride=2, padding=3)
self.maxpool1 = nn.Pool(3, stride=2, ceil_mode=True, op='maximum')
self.conv2 = conv_block(64, 64, kernel_size=1)
self.conv3 = conv_block(64, 192, kernel_size=3, padding=1)
self.maxpool2 = nn.Pool(3, stride=2, ceil_mode=True, op='maximum')
self.inception3a = inception_block(192, 64, 96, 128, 16, 32, 32)
self.inception3b = inception_block(256, 128, 128, 192, 32, 96, 64)
self.maxpool3 = nn.Pool(3, stride=2, ceil_mode=True, op='maximum')
self.inception4a = inception_block(480, 192, 96, 208, 16, 48, 64)
self.inception4b = inception_block(512, 160, 112, 224, 24, 64, 64)
self.inception4c = inception_block(512, 128, 128, 256, 24, 64, 64)
self.inception4d = inception_block(512, 112, 144, 288, 32, 64, 64)
self.inception4e = inception_block(528, 256, 160, 320, 32, 128, 128)
self.maxpool4 = nn.Pool(2, stride=2, ceil_mode=True, op='maximum')
self.inception5a = inception_block(832, 256, 160, 320, 32, 128, 128)
self.inception5b = inception_block(832, 384, 192, 384, 48, 128, 128)
if aux_logits:
self.aux1 = inception_aux_block(512, num_classes)
self.aux2 = inception_aux_block(528, num_classes)
else:
self.aux1 = None
self.aux2 = None
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.dropout = nn.Dropout(0.2)
self.fc = nn.Linear(1024, num_classes)
def _forward(self, x):
x = self.conv1(x)
x = self.maxpool1(x)
x = self.conv2(x)
x = self.conv3(x)
x = self.maxpool2(x)
x = self.inception3a(x)
x = self.inception3b(x)
x = self.maxpool3(x)
x = self.inception4a(x)
if (self.aux1 is not None):
aux1 = self.aux1(x)
x = self.inception4b(x)
x = self.inception4c(x)
x = self.inception4d(x)
if (self.aux2 is not None):
aux2 = self.aux2(x)
x = self.inception4e(x)
x = self.maxpool4(x)
x = self.inception5a(x)
x = self.inception5b(x)
x = self.avgpool(x)
x = jt.reshape(x, (x.shape[0], (- 1)))
x = self.dropout(x)
x = self.fc(x)
return (x, aux2, aux1)
def eager_outputs(self, x, aux2, aux1):
return x
def execute(self, x):
(x, aux1, aux2) = self._forward(x)
aux_defined = (self.aux_logits)
return self.eager_outputs(x, aux2, aux1)
class Inception(nn.Module):
def __init__(self, in_channels, ch1x1, ch3x3red, ch3x3, ch5x5red, ch5x5, pool_proj, conv_block=None):
super(Inception, self).__init__()
if (conv_block is None):
conv_block = BasicConv2d
self.branch1 = conv_block(in_channels, ch1x1, kernel_size=1)
self.branch2 = nn.Sequential(conv_block(in_channels, ch3x3red, kernel_size=1), conv_block(ch3x3red, ch3x3, kernel_size=3, padding=1))
self.branch3 = nn.Sequential(conv_block(in_channels, ch5x5red, kernel_size=1), conv_block(ch5x5red, ch5x5, kernel_size=3, padding=1))
self.branch4 = nn.Sequential(nn.Pool(kernel_size=3, stride=1, padding=1, ceil_mode=True, op='maximum'), conv_block(in_channels, pool_proj, kernel_size=1))
def _forward(self, x):
branch1 = self.branch1(x)
branch2 = self.branch2(x)
branch3 = self.branch3(x)
branch4 = self.branch4(x)
outputs = [branch1, branch2, branch3, branch4]
return outputs
def execute(self, x):
outputs = self._forward(x)
return jt.contrib.concat(outputs, dim=1)
class InceptionAux(nn.Module):
def __init__(self, in_channels, num_classes, conv_block=None):
super(InceptionAux, self).__init__()
if (conv_block is None):
conv_block = BasicConv2d
self.conv = conv_block(in_channels, 128, kernel_size=1)
self.fc1 = nn.Linear(2048, 1024)
self.fc2 = nn.Linear(1024, num_classes)
def execute(self, x):
x = nn.AdaptiveAvgPool2d(4)(x)
x = self.conv(x)
x = jt.reshape(x, (x.shape[0], (- 1)))
x = nn.relu(self.fc1(x))
x = nn.Dropout(0.7)(x)
x = self.fc2(x)
return x
class BasicConv2d(nn.Module):
def __init__(self, in_channels, out_channels, **kwargs):
super(BasicConv2d, self).__init__()
self.conv = nn.Conv(in_channels, out_channels, bias=False, **kwargs)
self.bn = nn.BatchNorm(out_channels, eps=0.001)
def execute(self, x):
x = self.conv(x)
x = self.bn(x)
return nn.relu(x)

View File

@ -0,0 +1,268 @@
import jittor as jt
from jittor import nn
__all__ = ['Inception3', 'inception_v3']
def inception_v3(pretrained=False, progress=True, **kwargs):
return Inception3(**kwargs)
class Inception3(nn.Module):
def __init__(self, num_classes=1000, aux_logits=True, inception_blocks=None, init_weights=True):
super(Inception3, self).__init__()
if (inception_blocks is None):
inception_blocks = [BasicConv2d, InceptionA, InceptionB, InceptionC, InceptionD, InceptionE, InceptionAux]
assert (len(inception_blocks) == 7)
conv_block = inception_blocks[0]
inception_a = inception_blocks[1]
inception_b = inception_blocks[2]
inception_c = inception_blocks[3]
inception_d = inception_blocks[4]
inception_e = inception_blocks[5]
inception_aux = inception_blocks[6]
self.aux_logits = aux_logits
self.Conv2d_1a_3x3 = conv_block(3, 32, kernel_size=3, stride=2)
self.Conv2d_2a_3x3 = conv_block(32, 32, kernel_size=3)
self.Conv2d_2b_3x3 = conv_block(32, 64, kernel_size=3, padding=1)
self.Conv2d_3b_1x1 = conv_block(64, 80, kernel_size=1)
self.Conv2d_4a_3x3 = conv_block(80, 192, kernel_size=3)
self.Mixed_5b = inception_a(192, pool_features=32)
self.Mixed_5c = inception_a(256, pool_features=64)
self.Mixed_5d = inception_a(288, pool_features=64)
self.Mixed_6a = inception_b(288)
self.Mixed_6b = inception_c(768, channels_7x7=128)
self.Mixed_6c = inception_c(768, channels_7x7=160)
self.Mixed_6d = inception_c(768, channels_7x7=160)
self.Mixed_6e = inception_c(768, channels_7x7=192)
if aux_logits:
self.AuxLogits = inception_aux(768, num_classes)
self.Mixed_7a = inception_d(768)
self.Mixed_7b = inception_e(1280)
self.Mixed_7c = inception_e(2048)
self.fc = nn.Linear(2048, num_classes)
def _forward(self, x):
x = self.Conv2d_1a_3x3(x)
x = self.Conv2d_2a_3x3(x)
x = self.Conv2d_2b_3x3(x)
x = nn.pool(x, 3, "maximum", stride=2)
x = self.Conv2d_3b_1x1(x)
x = self.Conv2d_4a_3x3(x)
x = nn.pool(x, 3, "maximum", stride=2)
x = self.Mixed_5b(x)
x = self.Mixed_5c(x)
x = self.Mixed_5d(x)
x = self.Mixed_6a(x)
x = self.Mixed_6b(x)
x = self.Mixed_6c(x)
x = self.Mixed_6d(x)
x = self.Mixed_6e(x)
aux_defined = self.aux_logits
if aux_defined:
aux = self.AuxLogits(x)
else:
aux = None
x = self.Mixed_7a(x)
x = self.Mixed_7b(x)
x = self.Mixed_7c(x)
x = nn.AdaptiveAvgPool2d(1)(x)
x = nn.Dropout()(x)
x = jt.reshape(x, (x.shape[0], (- 1)))
x = self.fc(x)
return (x, aux)
def eager_outputs(self, x, aux):
return x
def execute(self, x):
(x, aux) = self._forward(x)
aux_defined = self.aux_logits
return self.eager_outputs(x, aux)
class InceptionA(nn.Module):
def __init__(self, in_channels, pool_features, conv_block=None):
super(InceptionA, self).__init__()
if (conv_block is None):
conv_block = BasicConv2d
self.branch1x1 = conv_block(in_channels, 64, kernel_size=1)
self.branch5x5_1 = conv_block(in_channels, 48, kernel_size=1)
self.branch5x5_2 = conv_block(48, 64, kernel_size=5, padding=2)
self.branch3x3dbl_1 = conv_block(in_channels, 64, kernel_size=1)
self.branch3x3dbl_2 = conv_block(64, 96, kernel_size=3, padding=1)
self.branch3x3dbl_3 = conv_block(96, 96, kernel_size=3, padding=1)
self.branch_pool = conv_block(in_channels, pool_features, kernel_size=1)
def _forward(self, x):
branch1x1 = self.branch1x1(x)
branch5x5 = self.branch5x5_1(x)
branch5x5 = self.branch5x5_2(branch5x5)
branch3x3dbl = self.branch3x3dbl_1(x)
branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)
branch_pool = nn.pool(x, 3, "mean", stride=1, padding=1)
branch_pool = self.branch_pool(branch_pool)
outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool]
return outputs
def execute(self, x):
outputs = self._forward(x)
return jt.contrib.concat(outputs, dim=1)
class InceptionB(nn.Module):
def __init__(self, in_channels, conv_block=None):
super(InceptionB, self).__init__()
if (conv_block is None):
conv_block = BasicConv2d
self.branch3x3 = conv_block(in_channels, 384, kernel_size=3, stride=2)
self.branch3x3dbl_1 = conv_block(in_channels, 64, kernel_size=1)
self.branch3x3dbl_2 = conv_block(64, 96, kernel_size=3, padding=1)
self.branch3x3dbl_3 = conv_block(96, 96, kernel_size=3, stride=2)
def _forward(self, x):
branch3x3 = self.branch3x3(x)
branch3x3dbl = self.branch3x3dbl_1(x)
branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)
branch_pool = nn.pool(x, 3, "maximum", stride=2)
outputs = [branch3x3, branch3x3dbl, branch_pool]
return outputs
def execute(self, x):
outputs = self._forward(x)
return jt.contrib.concat(outputs, dim=1)
class InceptionC(nn.Module):
def __init__(self, in_channels, channels_7x7, conv_block=None):
super(InceptionC, self).__init__()
if (conv_block is None):
conv_block = BasicConv2d
self.branch1x1 = conv_block(in_channels, 192, kernel_size=1)
c7 = channels_7x7
self.branch7x7_1 = conv_block(in_channels, c7, kernel_size=1)
self.branch7x7_2 = conv_block(c7, c7, kernel_size=(1, 7), padding=(0, 3))
self.branch7x7_3 = conv_block(c7, 192, kernel_size=(7, 1), padding=(3, 0))
self.branch7x7dbl_1 = conv_block(in_channels, c7, kernel_size=1)
self.branch7x7dbl_2 = conv_block(c7, c7, kernel_size=(7, 1), padding=(3, 0))
self.branch7x7dbl_3 = conv_block(c7, c7, kernel_size=(1, 7), padding=(0, 3))
self.branch7x7dbl_4 = conv_block(c7, c7, kernel_size=(7, 1), padding=(3, 0))
self.branch7x7dbl_5 = conv_block(c7, 192, kernel_size=(1, 7), padding=(0, 3))
self.branch_pool = conv_block(in_channels, 192, kernel_size=1)
def _forward(self, x):
branch1x1 = self.branch1x1(x)
branch7x7 = self.branch7x7_1(x)
branch7x7 = self.branch7x7_2(branch7x7)
branch7x7 = self.branch7x7_3(branch7x7)
branch7x7dbl = self.branch7x7dbl_1(x)
branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl)
branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl)
branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl)
branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl)
branch_pool = nn.pool(x, kernel_size=3, op="mean", stride=1, padding=1)
branch_pool = self.branch_pool(branch_pool)
outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool]
return outputs
def execute(self, x):
outputs = self._forward(x)
return jt.contrib.concat(outputs, dim=1)
class InceptionD(nn.Module):
def __init__(self, in_channels, conv_block=None):
super(InceptionD, self).__init__()
if (conv_block is None):
conv_block = BasicConv2d
self.branch3x3_1 = conv_block(in_channels, 192, kernel_size=1)
self.branch3x3_2 = conv_block(192, 320, kernel_size=3, stride=2)
self.branch7x7x3_1 = conv_block(in_channels, 192, kernel_size=1)
self.branch7x7x3_2 = conv_block(192, 192, kernel_size=(1, 7), padding=(0, 3))
self.branch7x7x3_3 = conv_block(192, 192, kernel_size=(7, 1), padding=(3, 0))
self.branch7x7x3_4 = conv_block(192, 192, kernel_size=3, stride=2)
def _forward(self, x):
branch3x3 = self.branch3x3_1(x)
branch3x3 = self.branch3x3_2(branch3x3)
branch7x7x3 = self.branch7x7x3_1(x)
branch7x7x3 = self.branch7x7x3_2(branch7x7x3)
branch7x7x3 = self.branch7x7x3_3(branch7x7x3)
branch7x7x3 = self.branch7x7x3_4(branch7x7x3)
branch_pool = nn.pool(x, kernel_size=3, op="maximum", stride=2)
outputs = [branch3x3, branch7x7x3, branch_pool]
return outputs
def execute(self, x):
outputs = self._forward(x)
return jt.contrib.concat(outputs, dim=1)
class InceptionE(nn.Module):
def __init__(self, in_channels, conv_block=None):
super(InceptionE, self).__init__()
if (conv_block is None):
conv_block = BasicConv2d
self.branch1x1 = conv_block(in_channels, 320, kernel_size=1)
self.branch3x3_1 = conv_block(in_channels, 384, kernel_size=1)
self.branch3x3_2a = conv_block(384, 384, kernel_size=(1, 3), padding=(0, 1))
self.branch3x3_2b = conv_block(384, 384, kernel_size=(3, 1), padding=(1, 0))
self.branch3x3dbl_1 = conv_block(in_channels, 448, kernel_size=1)
self.branch3x3dbl_2 = conv_block(448, 384, kernel_size=3, padding=1)
self.branch3x3dbl_3a = conv_block(384, 384, kernel_size=(1, 3), padding=(0, 1))
self.branch3x3dbl_3b = conv_block(384, 384, kernel_size=(3, 1), padding=(1, 0))
self.branch_pool = conv_block(in_channels, 192, kernel_size=1)
def _forward(self, x):
branch1x1 = self.branch1x1(x)
branch3x3 = self.branch3x3_1(x)
branch3x3 = [self.branch3x3_2a(branch3x3), self.branch3x3_2b(branch3x3)]
branch3x3 = jt.contrib.concat(branch3x3, dim=1)
branch3x3dbl = self.branch3x3dbl_1(x)
branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
branch3x3dbl = [self.branch3x3dbl_3a(branch3x3dbl), self.branch3x3dbl_3b(branch3x3dbl)]
branch3x3dbl = jt.contrib.concat(branch3x3dbl, dim=1)
branch_pool = nn.pool(x, kernel_size=3, op="mean", stride=1, padding=1)
branch_pool = self.branch_pool(branch_pool)
outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
return outputs
def execute(self, x):
outputs = self._forward(x)
return jt.contrib.concat(outputs, dim=1)
class InceptionAux(nn.Module):
def __init__(self, in_channels, num_classes, conv_block=None):
super(InceptionAux, self).__init__()
if (conv_block is None):
conv_block = BasicConv2d
self.conv0 = conv_block(in_channels, 128, kernel_size=1)
self.conv1 = conv_block(128, 768, kernel_size=5)
self.conv1.stddev = 0.01
self.fc = nn.Linear(768, num_classes)
self.fc.stddev = 0.001
def execute(self, x):
x = nn.pool(x, kernel_size=5, op="mean", stride=3)
x = self.conv0(x)
x = self.conv1(x)
x = nn.AdaptiveAvgPool2d(1)(x)
x = jt.reshape(x, (x.shape[0], (- 1)))
x = self.fc(x)
return x
class BasicConv2d(nn.Module):
def __init__(self, in_channels, out_channels, **kwargs):
super(BasicConv2d, self).__init__()
self.conv = nn.Conv(in_channels, out_channels, bias=False, **kwargs)
self.bn = nn.BatchNorm(out_channels, eps=0.001)
def execute(self, x):
x = self.conv(x)
x = self.bn(x)
return nn.relu(x)
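Note that Inception3.execute returns only the main logits: the auxiliary head is computed in _forward but dropped by eager_outputs. A hedged usage sketch (the companion test feeds inception_v3 300x300 inputs instead of 224x224):
import numpy as np
import jittor as jt
from jittor.models import inception_v3
net = inception_v3(aux_logits=True)
net.eval()
x = jt.array(np.random.random((1, 3, 300, 300)).astype('float32'))
y = net(x)        # main classifier output only; the aux branch result is discarded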

View File

@ -0,0 +1,99 @@
# ***************************************************************
# Copyright (c) 2020 Jittor. Authors:
# Wenyang Zhou <576825820@qq.com>
# Dun Liang <randonlang@gmail.com>.
# All Rights Reserved.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
# This model is generated by pytorch converter.
import jittor as jt
from jittor import nn
__all__ = ['MNASNet', 'mnasnet0_5', 'mnasnet0_75', 'mnasnet1_0', 'mnasnet1_3']
_BN_MOMENTUM = (1 - 0.9997)
class _InvertedResidual(nn.Module):
def __init__(self, in_ch, out_ch, kernel_size, stride, expansion_factor, bn_momentum=0.1):
super(_InvertedResidual, self).__init__()
assert (stride in [1, 2])
assert (kernel_size in [3, 5])
mid_ch = (in_ch * expansion_factor)
self.apply_residual = ((in_ch == out_ch) and (stride == 1))
self.layers = nn.Sequential(nn.Conv(in_ch, mid_ch, 1, bias=False), nn.BatchNorm(mid_ch, momentum=bn_momentum), nn.Relu(), nn.Conv(mid_ch, mid_ch, kernel_size, padding=(kernel_size // 2), stride=stride, groups=mid_ch, bias=False), nn.BatchNorm(mid_ch, momentum=bn_momentum), nn.Relu(), nn.Conv(mid_ch, out_ch, 1, bias=False), nn.BatchNorm(out_ch, momentum=bn_momentum))
def execute(self, input):
if self.apply_residual:
return (self.layers(input) + input)
else:
return self.layers(input)
def _stack(in_ch, out_ch, kernel_size, stride, exp_factor, repeats, bn_momentum):
assert (repeats >= 1)
first = _InvertedResidual(in_ch, out_ch, kernel_size, stride, exp_factor, bn_momentum=bn_momentum)
remaining = []
for _ in range(1, repeats):
remaining.append(_InvertedResidual(out_ch, out_ch, kernel_size, 1, exp_factor, bn_momentum=bn_momentum))
return nn.Sequential(first, *remaining)
def _round_to_multiple_of(val, divisor, round_up_bias=0.9):
assert (0.0 < round_up_bias < 1.0)
new_val = max(divisor, ((int((val + (divisor / 2))) // divisor) * divisor))
return (new_val if (new_val >= (round_up_bias * val)) else (new_val + divisor))
def _get_depths(alpha):
depths = [24, 40, 80, 96, 192, 320]
return [_round_to_multiple_of((depth * alpha), 8) for depth in depths]
class MNASNet(nn.Module):
_version = 2
def __init__(self, alpha, num_classes=1000, dropout=0.2):
super(MNASNet, self).__init__()
assert (alpha > 0.0)
self.alpha = alpha
self.num_classes = num_classes
depths = _get_depths(alpha)
layers = [
nn.Conv(3, 32, 3, padding=1, stride=2, bias=False),
nn.BatchNorm(32, momentum=_BN_MOMENTUM),
nn.Relu(),
nn.Conv(32, 32, 3, padding=1, stride=1, groups=32, bias=False),
nn.BatchNorm(32, momentum=_BN_MOMENTUM),
nn.Relu(),
nn.Conv(32, 16, 1, padding=0, stride=1, bias=False),
nn.BatchNorm(16, momentum=_BN_MOMENTUM),
_stack(16, depths[0], 3, 2, 3, 3, _BN_MOMENTUM),
_stack(depths[0], depths[1], 5, 2, 3, 3, _BN_MOMENTUM),
_stack(depths[1], depths[2], 5, 2, 6, 3, _BN_MOMENTUM),
_stack(depths[2], depths[3], 3, 1, 6, 2, _BN_MOMENTUM),
_stack(depths[3], depths[4], 5, 2, 6, 4, _BN_MOMENTUM),
_stack(depths[4], depths[5], 3, 1, 6, 1, _BN_MOMENTUM),
nn.Conv(depths[5], 1280, 1, padding=0, stride=1, bias=False),
nn.BatchNorm(1280, momentum=_BN_MOMENTUM),
nn.Relu()
]
self.layers = nn.Sequential(*layers)
self.classifier = nn.Sequential(nn.Dropout(p=dropout), nn.Linear(1280, num_classes))
def execute(self, x):
x = self.layers(x)
x = x.mean([2, 3])
return self.classifier(x)
def mnasnet0_5(**kwargs):
model = MNASNet(0.5, **kwargs)
return model
def mnasnet0_75(**kwargs):
model = MNASNet(0.75, **kwargs)
return model
def mnasnet1_0(**kwargs):
model = MNASNet(1.0, **kwargs)
return model
def mnasnet1_3(**kwargs):
model = MNASNet(1.3, **kwargs)
return model
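_round_to_multiple_of rounds each scaled depth to the nearest multiple of the divisor, then bumps up one step whenever that rounding would lose more than 10% of the value (round_up_bias=0.9). A worked example against the helpers above (private names imported here only for illustration):
from jittor.models.mnasnet import _get_depths, _round_to_multiple_of
assert _round_to_multiple_of(27, 8) == 32               # nearest multiple 24 < 0.9 * 27, so bump to 32
assert _get_depths(1.0) == [24, 40, 80, 96, 192, 320]   # already multiples of 8
assert _get_depths(0.5) == [16, 24, 40, 48, 96, 160]    # channel widths used by mnasnet0_5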

View File

@ -0,0 +1,88 @@
# ***************************************************************
# Copyright (c) 2020 Jittor. Authors:
# Wenyang Zhou <576825820@qq.com>
# Dun Liang <randonlang@gmail.com>.
# All Rights Reserved.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
# This model is generated by pytorch converter.
import jittor as jt
from jittor import init
from jittor import nn
__all__ = ['MobileNetV2', 'mobilenet_v2']
def _make_divisible(v, divisor, min_value=None):
if (min_value is None):
min_value = divisor
new_v = max(min_value, ((int((v + (divisor / 2))) // divisor) * divisor))
if (new_v < (0.9 * v)):
new_v += divisor
return new_v
class ConvBNReLU(nn.Sequential):
def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
padding = ((kernel_size - 1) // 2)
super(ConvBNReLU, self).__init__(nn.Conv(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False), nn.BatchNorm(out_planes), nn.ReLU6())
class InvertedResidual(nn.Module):
def __init__(self, inp, oup, stride, expand_ratio):
super(InvertedResidual, self).__init__()
self.stride = stride
assert (stride in [1, 2])
hidden_dim = int(round((inp * expand_ratio)))
self.use_res_connect = ((self.stride == 1) and (inp == oup))
layers = []
if (expand_ratio != 1):
layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
layers.extend([ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim), nn.Conv(hidden_dim, oup, 1, 1, 0, bias=False), nn.BatchNorm(oup)])
self.conv = nn.Sequential(*layers)
def execute(self, x):
if self.use_res_connect:
return (x + self.conv(x))
else:
return self.conv(x)
class MobileNetV2(nn.Module):
def __init__(self, num_classes=1000, width_mult=1.0, inverted_residual_setting=None, round_nearest=8, block=None):
super(MobileNetV2, self).__init__()
if (block is None):
block = InvertedResidual
input_channel = 32
last_channel = 1280
if (inverted_residual_setting is None):
inverted_residual_setting = [[1, 16, 1, 1], [6, 24, 2, 2], [6, 32, 3, 2], [6, 64, 4, 2], [6, 96, 3, 1], [6, 160, 3, 2], [6, 320, 1, 1]]
if ((len(inverted_residual_setting) == 0) or (len(inverted_residual_setting[0]) != 4)):
raise ValueError('inverted_residual_setting should be non-empty or a 4-element list, got {}'.format(inverted_residual_setting))
input_channel = _make_divisible((input_channel * width_mult), round_nearest)
self.last_channel = _make_divisible((last_channel * max(1.0, width_mult)), round_nearest)
features = [ConvBNReLU(3, input_channel, stride=2)]
for (t, c, n, s) in inverted_residual_setting:
output_channel = _make_divisible((c * width_mult), round_nearest)
for i in range(n):
stride = (s if (i == 0) else 1)
features.append(block(input_channel, output_channel, stride, expand_ratio=t))
input_channel = output_channel
features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1))
self.features = nn.Sequential(*features)
self.classifier = nn.Sequential(nn.Dropout(0.2), nn.Linear(self.last_channel, num_classes))
def _forward_impl(self, x):
x = self.features(x)
x = nn.AdaptiveAvgPool2d(1)(x)
x = jt.reshape(x, (x.shape[0], -1))
x = self.classifier(x)
return x
def execute(self, x):
return self._forward_impl(x)
def mobilenet_v2():
model = MobileNetV2()
return model
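_make_divisible does the same job for MobileNetV2: every channel count stays a multiple of round_nearest (8 by default) and never drops below 90% of the requested width. A worked example (private helper imported only for illustration):
from jittor.models.mobilenet import _make_divisible
assert _make_divisible(32 * 0.75, 8) == 24   # 24 is already a multiple of 8
assert _make_divisible(27, 8) == 32          # rounding down to 24 would lose >10%, so round up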

View File

@ -7,200 +7,128 @@
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
# This model is generated by pytorch converter.
import jittor as jt
from jittor import nn
from jittor import Module
@jt.var_scope('basic_block')
def basic_block(x, is_train, in_planes, out_planes, stride = 1):
identity = x
x = nn.conv(x, in_planes, out_planes, 3, 1, stride)
x = nn.batch_norm(x, is_train)
x = nn.relu(x)
x = nn.conv(x, out_planes, out_planes, 3, 1)
x = nn.batch_norm(x, is_train)
if in_planes!=out_planes:
identity = nn.conv(identity, in_planes, out_planes, 1, 0, stride)
identity = nn.batch_norm(identity, is_train)
x = x+identity
x = nn.relu(x)
return x
__all__ = ['ResNet', 'Resnet18', 'Resnet34', 'Resnet50', 'Resnet101', 'Resnet152', 'Resnext50_32x4d', 'Resnext101_32x8d', 'Wide_resnet50_2', 'Wide_resnet101_2',
'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d', 'wide_resnet50_2', 'wide_resnet101_2']
@jt.var_scope('make_layer')
def make_layer(x, is_train, out_planes, blocks, layer_in_planes, stride = 1):
x = basic_block(x, is_train, layer_in_planes, out_planes, stride)
layer_in_planes = out_planes
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
return nn.Conv(in_planes, out_planes, kernel_size=3, stride=stride, padding=dilation, groups=groups, bias=False, dilation=dilation)
for i in range(1, blocks):
x = basic_block(x, is_train, layer_in_planes, out_planes)
return x, layer_in_planes
def conv1x1(in_planes, out_planes, stride=1):
return nn.Conv(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
@jt.var_scope('bottleneck_block')
def bottleneck_block(x, is_train, in_planes, out_planes, stride = 1):
expansion = 4
width = out_planes
identity = x
x = nn.conv(x, in_planes, width, 1, 0)
x = nn.batch_norm(x, is_train)
x = nn.relu(x)
x = nn.conv(x, width, width, 3, 1, stride)
x = nn.batch_norm(x, is_train)
x = nn.relu(x)
x = nn.conv(x, width, out_planes * expansion, 1, 0)
x = nn.batch_norm(x, is_train)
if in_planes != out_planes * expansion:
identity = nn.conv(identity, in_planes, out_planes * expansion, 1, 0, stride)
identity = nn.batch_norm(identity, is_train)
x = x+identity
x = nn.relu(x)
return x
@jt.var_scope('make_layer_bottleneck')
def make_layer_bottleneck(x, is_train, out_planes, blocks, layer_in_planes, stride = 1):
expansion = 4
x = bottleneck_block(x, is_train, layer_in_planes, out_planes, stride)
layer_in_planes = out_planes * expansion
for i in range(1, blocks):
x = bottleneck_block(x, is_train, layer_in_planes, out_planes)
return x, layer_in_planes
@jt.var_scope('resnet')
def resnet(x, is_train, block, layers, num_classes = 1000):
layer_in_planes = 64
x = nn.conv(x, 3, layer_in_planes, 7, 3, 2)
x = nn.batch_norm(x, is_train)
x = nn.relu(x)
x = nn.pool(x, 3, "maximum", 1, 2)
x, layer_in_planes = block(x, is_train, 64, layers[0], layer_in_planes)
x, layer_in_planes = block(x, is_train, 128, layers[1], layer_in_planes, 2)
x, layer_in_planes = block(x, is_train, 256, layers[2], layer_in_planes, 2)
x, layer_in_planes = block(x, is_train, 512, layers[3], layer_in_planes, 2)
x = x.reindex_reduce("add", [x.shape[0],x.shape[1]], ["i0","i1"])/x.shape[2]/x.shape[3]
x = nn.linear(x, num_classes)
return x
@jt.var_scope('resnet18', unique=True)
def resnet18(x, is_train):
return resnet(x, is_train, make_layer, [2, 2, 2, 2])
@jt.var_scope('resnet34', unique=True)
def resnet34(x, is_train):
return resnet(x, is_train, make_layer, [3, 4, 6, 3])
@jt.var_scope('resnet50', unique=True)
def resnet50(x, is_train):
return resnet(x, is_train, make_layer_bottleneck, [3, 4, 6, 3])
@jt.var_scope('resnet101', unique=True)
def resnet101(x, is_train):
return resnet(x, is_train, make_layer_bottleneck, [3, 4, 23, 3])
@jt.var_scope('resnet152', unique=True)
def resnet152(x, is_train):
return resnet(x, is_train, make_layer_bottleneck, [3, 8, 36, 3])
class BasicBlock(Module):
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
self.conv1 = nn.Conv(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
self.bn1 = nn.BatchNorm(planes)
def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None):
super(BasicBlock, self).__init__()
if (norm_layer is None):
norm_layer = nn.BatchNorm
if ((groups != 1) or (base_width != 64)):
raise ValueError('BasicBlock only supports groups=1 and base_width=64')
if (dilation > 1):
raise NotImplementedError('Dilation > 1 not supported in BasicBlock')
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = norm_layer(planes)
self.relu = nn.Relu()
self.conv2 = nn.Conv(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
self.bn2 = nn.BatchNorm(planes)
self.conv2 = conv3x3(planes, planes)
self.bn2 = norm_layer(planes)
self.downsample = downsample
self.stride = stride
self.planes = planes
def execute(self, x):
residual = x
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
if (self.downsample is not None):
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class Bottleneck(Module):
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
self.conv1 = nn.Conv(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm(planes)
self.conv2 = nn.Conv(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
self.bn2 = nn.BatchNorm(planes)
self.conv3 = nn.Conv(planes, planes * self.expansion, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm(planes * self.expansion)
def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None):
super(Bottleneck, self).__init__()
if (norm_layer is None):
norm_layer = nn.BatchNorm
width = (int((planes * (base_width / 64.0))) * groups)
self.conv1 = conv1x1(inplanes, width)
self.bn1 = norm_layer(width)
self.conv2 = conv3x3(width, width, stride, groups, dilation)
self.bn2 = norm_layer(width)
self.conv3 = conv1x1(width, (planes * self.expansion))
self.bn3 = norm_layer((planes * self.expansion))
self.relu = nn.Relu()
self.downsample = downsample
self.stride = stride
def execute(self, x):
residual = x
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
if (self.downsample is not None):
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class ResNet(Module):
def __init__(self, block, layers, num_classes=1000):
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000, zero_init_residual=False, groups=1, width_per_group=64, replace_stride_with_dilation=None, norm_layer=None):
super(ResNet, self).__init__()
if (norm_layer is None):
norm_layer = nn.BatchNorm
self._norm_layer = norm_layer
self.inplanes = 64
self.conv1 = nn.Conv(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
self.bn1 = nn.BatchNorm(64)
self.dilation = 1
if (replace_stride_with_dilation is None):
replace_stride_with_dilation = [False, False, False]
if (len(replace_stride_with_dilation) != 3):
raise ValueError('replace_stride_with_dilation should be None or a 3-element tuple, got {}'.format(replace_stride_with_dilation))
self.groups = groups
self.base_width = width_per_group
self.conv1 = nn.Conv(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
self.bn1 = norm_layer(self.inplanes)
self.relu = nn.Relu()
self.maxpool = nn.Pool(kernel_size=3, stride=2, padding=1)
self.maxpool = nn.Pool(kernel_size=3, stride=2, padding=1, op='maximum')
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.avgpool = nn.Pool(7, stride=1, op="mean")
self.fc = nn.Linear(512 * block.expansion, num_classes)
def _make_layer(self, block, planes, blocks, stride=1):
self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0])
self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1])
self.layer4 = self._make_layer(block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2])
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.fc = nn.Linear((512 * block.expansion), num_classes)
def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
norm_layer = self._norm_layer
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm(planes * block.expansion),
)
previous_dilation = self.dilation
if dilate:
self.dilation *= stride
stride = 1
if ((stride != 1) or (self.inplanes != (planes * block.expansion))):
downsample = nn.Sequential(conv1x1(self.inplanes, (planes * block.expansion), stride), norm_layer((planes * block.expansion)))
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
layers.append(block(self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer))
self.inplanes = (planes * block.expansion)
for _ in range(1, blocks):
layers.append(block(self.inplanes, planes, groups=self.groups, base_width=self.base_width, dilation=self.dilation, norm_layer=norm_layer))
return nn.Sequential(*layers)
def execute(self, x):
def _forward_impl(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
@ -209,29 +137,56 @@ class ResNet(Module):
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = jt.reshape(x, [x.shape[0],-1])
x = jt.reshape(x, (x.shape[0], (- 1)))
x = self.fc(x)
return x
def Resnet18():
model = ResNet(BasicBlock, [2,2,2,2])
def execute(self, x):
return self._forward_impl(x)
def _resnet(block, layers, **kwargs):
model = ResNet(block, layers, **kwargs)
return model
def Resnet34():
model = ResNet(BasicBlock, [3,4,6,3])
return model
def Resnet18(**kwargs):
return _resnet(BasicBlock, [2, 2, 2, 2], **kwargs)
resnet18 = Resnet18
def Resnet50():
model = ResNet(Bottleneck, [3,4,6,3])
return model
def Resnet34(**kwargs):
return _resnet(BasicBlock, [3, 4, 6, 3], **kwargs)
resnet34 = Resnet34
def Resnet101():
model = ResNet(Bottleneck, [3,4,23,3])
return model
def Resnet50(**kwargs):
return _resnet(Bottleneck, [3, 4, 6, 3], **kwargs)
resnet50 = Resnet50
def Resnet152():
model = ResNet(Bottleneck, [3,8,36,3])
return model
def Resnet101(**kwargs):
return _resnet(Bottleneck, [3, 4, 23, 3], **kwargs)
resnet101 = Resnet101
def Resnet152(**kwargs):
return _resnet(Bottleneck, [3, 8, 36, 3], **kwargs)
resnet152 = Resnet152
def Resnext50_32x4d(**kwargs):
kwargs['groups'] = 32
kwargs['width_per_group'] = 4
return _resnet(Bottleneck, [3, 4, 6, 3], **kwargs)
resnext50_32x4d = Resnext50_32x4d
def Resnext101_32x8d(**kwargs):
kwargs['groups'] = 32
kwargs['width_per_group'] = 8
return _resnet(Bottleneck, [3, 4, 23, 3], **kwargs)
resnext101_32x8d = Resnext101_32x8d
def Wide_resnet50_2(**kwargs):
kwargs['width_per_group'] = (64 * 2)
return _resnet(Bottleneck, [3, 4, 6, 3], **kwargs)
wide_resnet50_2 = Wide_resnet50_2
def Wide_resnet101_2(**kwargs):
kwargs['width_per_group'] = (64 * 2)
return _resnet(Bottleneck, [3, 4, 23, 3], **kwargs)
wide_resnet101_2 = Wide_resnet101_2
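The rewritten Bottleneck computes its inner width as int(planes * base_width / 64) * groups, so the same [3, 4, 6, 3] layout yields ResNet-50, ResNeXt-50 and Wide-ResNet-50 purely through the groups / width_per_group kwargs. Worked numbers for the first bottleneck stage (planes=64):
# resnet50:          width = int(64 * 64 / 64) * 1   = 64
# resnext50_32x4d:   width = int(64 * 4 / 64) * 32   = 128   (groups=32, width_per_group=4)
# wide_resnet50_2:   width = int(64 * 128 / 64) * 1  = 128   (width_per_group=64*2)
from jittor.models import resnext50_32x4d
model = resnext50_32x4d()   # ResNeXt built via the kwargs set in Resnext50_32x4d above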

View File

@ -0,0 +1,106 @@
# ***************************************************************
# Copyright (c) 2020 Jittor. Authors:
# Wenyang Zhou <576825820@qq.com>
# Dun Liang <randonlang@gmail.com>.
# All Rights Reserved.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
# This model is generated by pytorch converter.
import jittor as jt
from jittor import nn
__all__ = ['ShuffleNetV2', 'shufflenet_v2_x0_5', 'shufflenet_v2_x1_0', 'shufflenet_v2_x1_5', 'shufflenet_v2_x2_0']
def channel_shuffle(x, groups):
(batchsize, num_channels, height, width) = x.data.shape
channels_per_group = (num_channels // groups)
x = jt.reshape(x, [batchsize, groups, channels_per_group, height, width])
x = jt.transpose(x, (0,2,1,3,4))
x = jt.reshape(x, [batchsize, (- 1), height, width])
return x
class InvertedResidual(nn.Module):
def __init__(self, inp, oup, stride):
super(InvertedResidual, self).__init__()
if (not (1 <= stride <= 3)):
raise ValueError('illegal stride value')
self.stride = stride
branch_features = (oup // 2)
assert ((self.stride != 1) or (inp == (branch_features << 1)))
if (self.stride > 1):
self.branch1 = nn.Sequential(self.depthwise_conv(inp, inp, kernel_size=3, stride=self.stride, padding=1), nn.BatchNorm(inp), nn.Conv(inp, branch_features, kernel_size=1, stride=1, padding=0, bias=False), nn.BatchNorm(branch_features), nn.Relu())
else:
self.branch1 = nn.Sequential()
self.branch2 = nn.Sequential(nn.Conv((inp if (self.stride > 1) else branch_features), branch_features, kernel_size=1, stride=1, padding=0, bias=False), nn.BatchNorm(branch_features), nn.Relu(), self.depthwise_conv(branch_features, branch_features, kernel_size=3, stride=self.stride, padding=1), nn.BatchNorm(branch_features), nn.Conv(branch_features, branch_features, kernel_size=1, stride=1, padding=0, bias=False), nn.BatchNorm(branch_features), nn.Relu())
@staticmethod
def depthwise_conv(i, o, kernel_size, stride=1, padding=0, bias=False):
return nn.Conv(i, o, kernel_size, stride, padding, bias=bias, groups=i)
def execute(self, x):
if (self.stride == 1):
x1 = x[:,0:x.shape[1]//2]
x2 = x[:,x.shape[1]//2:x.shape[1]]
out = jt.contrib.concat([x1, self.branch2(x2)], dim=1)
else:
out = jt.contrib.concat([self.branch1(x), self.branch2(x)], dim=1)
out = channel_shuffle(out, 2)
return out
class ShuffleNetV2(nn.Module):
def __init__(self, stages_repeats, stages_out_channels, num_classes=1000, inverted_residual=InvertedResidual):
super(ShuffleNetV2, self).__init__()
if (len(stages_repeats) != 3):
raise ValueError('expected stages_repeats as list of 3 positive ints')
if (len(stages_out_channels) != 5):
raise ValueError('expected stages_out_channels as list of 5 positive ints')
self._stage_out_channels = stages_out_channels
input_channels = 3
output_channels = self._stage_out_channels[0]
self.conv1 = nn.Sequential(nn.Conv(input_channels, output_channels, 3, 2, 1, bias=False), nn.BatchNorm(output_channels), nn.Relu())
input_channels = output_channels
self.maxpool = nn.Pool(kernel_size=3, stride=2, padding=1, op='maximum')
stage_names = ['stage{}'.format(i) for i in [2, 3, 4]]
for (name, repeats, output_channels) in zip(stage_names, stages_repeats, self._stage_out_channels[1:]):
seq = [inverted_residual(input_channels, output_channels, 2)]
for i in range((repeats - 1)):
seq.append(inverted_residual(output_channels, output_channels, 1))
setattr(self, name, nn.Sequential(*seq))
input_channels = output_channels
output_channels = self._stage_out_channels[(- 1)]
self.conv5 = nn.Sequential(nn.Conv(input_channels, output_channels, 1, 1, 0, bias=False), nn.BatchNorm(output_channels), nn.Relu())
self.fc = nn.Linear(output_channels, num_classes)
def _forward_impl(self, x):
x = self.conv1(x)
x = self.maxpool(x)
x = self.stage2(x)
x = self.stage3(x)
x = self.stage4(x)
x = self.conv5(x)
x = x.mean([2, 3])
x = self.fc(x)
return x
def execute(self, x):
return self._forward_impl(x)
def _shufflenetv2(arch, *args):
model = ShuffleNetV2(*args)
return model
def shufflenet_v2_x0_5():
return _shufflenetv2('shufflenetv2_x0.5', [4, 8, 4], [24, 48, 96, 192, 1024])
def shufflenet_v2_x1_0():
return _shufflenetv2('shufflenetv2_x1.0', [4, 8, 4], [24, 116, 232, 464, 1024])
def shufflenet_v2_x1_5():
return _shufflenetv2('shufflenetv2_x1.5', [4, 8, 4], [24, 176, 352, 704, 1024])
def shufflenet_v2_x2_0():
return _shufflenetv2('shufflenetv2_x2.0', [4, 8, 4], [24, 244, 488, 976, 2048])
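channel_shuffle interleaves the group halves with a reshape-transpose-reshape, so after concatenating the two branches their channels are mixed before the next block. A small worked sketch with 4 channels and groups=2:
import jittor as jt
from jittor.models.shufflenetv2 import channel_shuffle
# channel order [0, 1, 2, 3] with groups=2 becomes [0, 2, 1, 3]
x = jt.array([0.0, 1.0, 2.0, 3.0]).reshape((1, 4, 1, 1))
y = channel_shuffle(x, 2)
# y.data.reshape(4) -> [0., 2., 1., 3.]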

View File

@ -0,0 +1,90 @@
# ***************************************************************
# Copyright (c) 2020 Jittor. Authors:
# Wenyang Zhou <576825820@qq.com>
# Dun Liang <randonlang@gmail.com>.
# All Rights Reserved.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
# This model is generated by pytorch converter.
import jittor as jt
from jittor import nn
__all__ = ['SqueezeNet', 'squeezenet1_0', 'squeezenet1_1']
class Fire(nn.Module):
def __init__(self, inplanes, squeeze_planes, expand1x1_planes, expand3x3_planes):
super(Fire, self).__init__()
self.inplanes = inplanes
self.squeeze = nn.Conv(inplanes, squeeze_planes, kernel_size=1)
self.squeeze_activation = nn.Relu()
self.expand1x1 = nn.Conv(squeeze_planes, expand1x1_planes, kernel_size=1)
self.expand1x1_activation = nn.Relu()
self.expand3x3 = nn.Conv(squeeze_planes, expand3x3_planes, kernel_size=3, padding=1)
self.expand3x3_activation = nn.Relu()
def execute(self, x):
x = self.squeeze_activation(self.squeeze(x))
return jt.contrib.concat([self.expand1x1_activation(self.expand1x1(x)), self.expand3x3_activation(self.expand3x3(x))], dim=1)
class SqueezeNet(nn.Module):
def __init__(self, version='1_0', num_classes=1000):
super(SqueezeNet, self).__init__()
self.num_classes = num_classes
if (version == '1_0'):
self.features = nn.Sequential(
nn.Conv(3, 96, kernel_size=7, stride=2),
nn.Relu(),
nn.Pool(kernel_size=3, stride=2, ceil_mode=True, op='maximum'),
Fire(96, 16, 64, 64),
Fire(128, 16, 64, 64),
Fire(128, 32, 128, 128),
nn.Pool(kernel_size=3, stride=2, ceil_mode=True, op='maximum'),
Fire(256, 32, 128, 128),
Fire(256, 48, 192, 192),
Fire(384, 48, 192, 192),
Fire(384, 64, 256, 256),
nn.Pool(kernel_size=3, stride=2, ceil_mode=True, op='maximum'),
Fire(512, 64, 256, 256)
)
elif (version == '1_1'):
self.features = nn.Sequential(
nn.Conv(3, 64, kernel_size=3, stride=2),
nn.Relu(),
nn.Pool(kernel_size=3, stride=2, ceil_mode=True, op='maximum'),
Fire(64, 16, 64, 64),
Fire(128, 16, 64, 64),
nn.Pool(kernel_size=3, stride=2, ceil_mode=True, op='maximum'),
Fire(128, 32, 128, 128),
Fire(256, 32, 128, 128),
nn.Pool(kernel_size=3, stride=2, ceil_mode=True, op='maximum'),
Fire(256, 48, 192, 192),
Fire(384, 48, 192, 192),
Fire(384, 64, 256, 256),
Fire(512, 64, 256, 256)
)
else:
raise ValueError('Unsupported SqueezeNet version {version}: 1_0 or 1_1 expected'.format(version=version))
final_conv = nn.Conv(512, self.num_classes, kernel_size=1)
self.classifier = nn.Sequential(
nn.Dropout(p=0.5),
final_conv,
nn.Relu(),
nn.AdaptiveAvgPool2d((1, 1))
)
def execute(self, x):
x = self.features(x)
x = self.classifier(x)
return jt.reshape(x, (x.shape[0], (- 1)))
def _squeezenet(version, **kwargs):
model = SqueezeNet(version, **kwargs)
return model
def squeezenet1_0(**kwargs):
return _squeezenet('1_0', **kwargs)
def squeezenet1_1(**kwargs):
return _squeezenet('1_1', **kwargs)
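Each Fire module squeezes to squeeze_planes channels and then expands into expand1x1_planes + expand3x3_planes channels concatenated along dim 1, so Fire(96, 16, 64, 64) turns 96 input channels into 128 outputs, which is why the next block in version '1_0' starts from 128. A small sketch:
import numpy as np
import jittor as jt
from jittor.models.squeezenet import Fire
fire = Fire(96, 16, 64, 64)
y = fire(jt.array(np.random.random((1, 96, 32, 32)).astype('float32')))
# y has 64 + 64 = 128 channels; spatial size is unchanged (1x1 and padded 3x3 convs)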

View File

@ -6,21 +6,21 @@
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
# This model is generated by pytorch converter.
import jittor as jt
from jittor import nn
__all__ = [
'VGG', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn',
'vgg19_bn', 'vgg19',
]
class VGG(nn.Module):
def __init__(self, features, num_classes=1000, init_weights=True):
super(VGG, self).__init__()
self.features = features
self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
self.classifier = nn.Sequential(
nn.Linear(512 * 7 * 7, 4096),
nn.ReLU(),
@ -33,6 +33,7 @@ class VGG(nn.Module):
def execute(self, x):
x = self.features(x)
x = self.avgpool(x)
x = jt.reshape(x, [x.shape[0],-1])
x = self.classifier(x)
return x
@ -66,57 +67,33 @@ def _vgg(arch, cfg, batch_norm, **kwargs):
return model
def VGG11(**kwargs):
r"""VGG 11-layer model (configuration "A") from
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_
"""
def vgg11(**kwargs):
return _vgg('vgg11', 'A', False, **kwargs)
def VGG11_bn(**kwargs):
r"""VGG 11-layer model (configuration "A") with batch normalization
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_
"""
def vgg11_bn(**kwargs):
return _vgg('vgg11_bn', 'A', True, **kwargs)
def VGG13(**kwargs):
r"""VGG 13-layer model (configuration "B")
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_
"""
def vgg13(**kwargs):
return _vgg('vgg13', 'B', False, **kwargs)
def VGG13_bn(**kwargs):
r"""VGG 13-layer model (configuration "B") with batch normalization
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_
"""
def vgg13_bn(**kwargs):
return _vgg('vgg13_bn', 'B', True, **kwargs)
def VGG16(**kwargs):
r"""VGG 16-layer model (configuration "D")
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_
"""
def vgg16(**kwargs):
return _vgg('vgg16', 'D', False, **kwargs)
def VGG16_bn(**kwargs):
r"""VGG 16-layer model (configuration "D") with batch normalization
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_
"""
def vgg16_bn(**kwargs):
return _vgg('vgg16_bn', 'D', True, **kwargs)
def VGG19(**kwargs):
r"""VGG 19-layer model (configuration "E")
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_
"""
def vgg19(**kwargs):
return _vgg('vgg19', 'E', False, **kwargs)
def VGG19_bn(**kwargs):
r"""VGG 19-layer model (configuration 'E') with batch normalization
`"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_
"""
def vgg19_bn(**kwargs):
return _vgg('vgg19_bn', 'E', True, **kwargs)

View File

@ -14,7 +14,7 @@ import jittor as jt
from jittor import init, Module
import numpy as np
import math
from jittor.pool import Pool, pool
from jittor.pool import Pool, pool, AdaptiveAvgPool2d
def matmul_transpose(a, b):
'''
@ -107,6 +107,7 @@ def linear(x, n):
def relu(x): return jt.maximum(x, 0)
def leaky_relu(x, scale): return jt.ternary(x>0, x, x*scale)
def relu6(x): return jt.minimum(jt.maximum(x, 0), 6)
# TODO: a 4-dim input causes slow execution
def cross_entropy_loss(output, target, ignore_index=None):
@ -233,10 +234,11 @@ class Dropout(Module):
if self.p > 0 and self.is_train:
if self.p == 1:
noise = jt.zeros(input.shape)
output = output * noise
else:
noise = jt.random(input.shape)
noise = (noise > self.p).int()
output = output * noise
output = output * noise / (1.0 - self.p) # divide by keep probability to preserve expectation
return output
class Linear(Module):
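The added division by (1 - p) is the standard inverted-dropout rescaling: kept activations are scaled up during training so the expected output matches evaluation mode, where the masking branch is skipped entirely. A short worked example:
from jittor import nn
# with p = 0.5 each kept unit is multiplied by 1 / (1 - 0.5) = 2,
# so E[output] = 0.5 * 2 * x = x, matching the untouched input at eval time
drop = nn.Dropout(p=0.5)   # during training roughly half the units are zeroed, survivors doubled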
@ -292,22 +294,24 @@ class BatchNorm(Module):
Relu = jt.make_module(relu)
ReLU = Relu
Leaky_relu = jt.make_module(leaky_relu, 2)
Leaky_relu = jt.make_module(leaky_relu, 0.01)
LeakyReLU = Leaky_relu
ReLU6 = jt.make_module(relu6)
Softmax = jt.make_module(softmax, 2)
class Conv(Module):
def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True):
assert groups == 1
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
self.stride = stride if isinstance(stride, tuple) else (stride, stride)
self.padding = padding if isinstance(padding, tuple) else (padding, padding)
self.dilation = dilation if isinstance(dilation, tuple) else (dilation, dilation)
self.groups = groups
assert in_channels % groups == 0, 'in_channels must be divisible by groups'
assert out_channels % groups == 0, 'out_channels must be divisible by groups'
Kh, Kw = self.kernel_size
assert groups==1, "Group conv not supported yet."
self.weight = init.relu_invariant_gauss([out_channels, in_channels, Kh, Kw], dtype="float", mode="fan_out")
self.weight = init.relu_invariant_gauss([out_channels, in_channels // groups, Kh, Kw], dtype="float", mode="fan_out")
if bias:
self.bias = init.uniform([out_channels], dtype="float", low=-1, high=1)
else:
@ -317,17 +321,36 @@ class Conv(Module):
N,C,H,W = x.shape
Kh, Kw = self.kernel_size
assert C==self.in_channels
oh = (H+self.padding[0]*2-Kh*self.dilation[0]+self.dilation[0]-1)//self.stride[0]+1
ow = (W+self.padding[1]*2-Kw*self.dilation[1]+self.dilation[1]-1)//self.stride[1]+1
xx = x.reindex([N,self.out_channels,C,oh,ow,Kh,Kw], [
'i0', # Nid
'i2', # Cid
f'i3*{self.stride[0]}-{self.padding[0]}+i5*{self.dilation[0]}', # Hid+Khid
f'i4*{self.stride[1]}-{self.padding[1]}+i6*{self.dilation[1]}', # Wid+KWid
])
ww = self.weight.broadcast(xx.shape, [0,3,4])
yy = xx*ww
y = yy.sum([2,5,6]) # Kc, Kh, Kw
if self.groups == 1:
oh = (H+self.padding[0]*2-Kh*self.dilation[0]+self.dilation[0]-1)//self.stride[0]+1
ow = (W+self.padding[1]*2-Kw*self.dilation[1]+self.dilation[1]-1)//self.stride[1]+1
xx = x.reindex([N,self.out_channels,C,oh,ow,Kh,Kw], [
'i0', # Nid
'i2', # Cid
f'i3*{self.stride[0]}-{self.padding[0]}+i5*{self.dilation[0]}', # Hid+Khid
f'i4*{self.stride[1]}-{self.padding[1]}+i6*{self.dilation[1]}', # Wid+KWid
])
ww = self.weight.broadcast(xx.shape, [0,3,4])
yy = xx*ww
y = yy.sum([2,5,6]) # Kc, Kh, Kw
else:
G = self.groups
oc = self.out_channels
oh = (H+self.padding[0]*2-Kh*self.dilation[0]+self.dilation[0]-1)//self.stride[0]+1
ow = (W+self.padding[1]*2-Kw*self.dilation[1]+self.dilation[1]-1)//self.stride[1]+1
xx = x.reshape((N, G, C//G, H, W))
xx = xx.reindex([N,G,oc//G,C//G,oh,ow,Kh,Kw], [
'i0', # Nid
'i1', # Gid
'i3', # C//G id
f'i4*{self.stride[0]}-{self.padding[0]}+i6*{self.dilation[0]}', # Hid+Khid
f'i5*{self.stride[1]}-{self.padding[1]}+i7*{self.dilation[1]}', # Wid+KWid
])
ww = self.weight.reshape((G, oc//G, C//G, Kh, Kw))
ww = ww.broadcast(xx.shape, [0,4,5])
yy = xx*ww
yy = yy.sum([3,6,7]) # oc//G, Kh, Kw
y = yy.reshape((N, oc, oh, ow))
if self.bias is not None:
b = self.bias.broadcast(y.shape, [0,2,3])
y = y + b
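With groups > 1 the weight shrinks to [out_channels, in_channels // groups, Kh, Kw] and the reindex runs one group at a time; setting groups equal to the channel count gives the depthwise convolutions used by MNASNet and MobileNetV2 above. A minimal sketch:
import numpy as np
import jittor as jt
from jittor import nn
dw = nn.Conv(32, 32, 3, padding=1, groups=32, bias=False)   # depthwise: weight shape [32, 1, 3, 3]
y = dw(jt.array(np.random.random((1, 32, 16, 16)).astype('float32')))
# output keeps shape [1, 32, 16, 16]; each channel sees only its own 3x3 filter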
@ -389,7 +412,7 @@ class Tanh(Module):
def __init__(self):
super().__init__()
def execute(self, x) :
return ((jt.exp (x) - jt.exp(-x)) / (jt.exp(x) + jt.exp (-x)))
return x.tanh()
class Sigmoid(Module):
def __init__(self):

View File

@ -161,5 +161,34 @@ class Pool(Module):
])
return xx.reduce(self.op, [4,5])
def pool(x, size, op, padding, stride = 1):
return Pool(size, stride, padding, op=op)(x)
class AdaptiveAvgPool2d(Module):
def __init__(self, output_size):
self.output_size = output_size
def execute(self, x):
if isinstance(self.output_size, int):
oh = self.output_size
ow = self.output_size
elif isinstance(self.output_size, tuple) or isinstance(self.output_size, list):
oh = x.shape[2] if self.output_size[0] is None else self.output_size[0]
ow = x.shape[3] if self.output_size[1] is None else self.output_size[1]
else:
raise TypeError(f"AdaptiveAvgPool2d only support int, typle or list input. Not support {type(self.output_size)} yet.")
N,C,H,W = x.shape
self.sh = math.floor(H / oh)
self.sw = math.floor(W / ow)
self.ksh = H - (oh - 1) * self.sh
self.ksw = W - (ow - 1) * self.sw
h = (H-self.ksh)//self.sh+1
w = (W-self.ksw)//self.sw+1
xx = x.reindex([N,C,h,w,self.ksh,self.ksw], [
"i0", # Nid
"i1", # Cid
f"i2*{self.sh}+i4", # Hid
f"i3*{self.sw}+i5", # Wid
])
return xx.reduce("mean", [4,5])
def pool(x, kernel_size, op, padding=0, stride = 1):
return Pool(kernel_size, stride, padding, op=op)(x)
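AdaptiveAvgPool2d derives the stride and window from the input and requested output sizes, so output_size=(1, 1) degenerates to global average pooling. Worked numbers for a 7x7 feature map:
# H = W = 7, output_size = (1, 1):
#   sh = sw = floor(7 / 1) = 7,  ksh = ksw = 7 - (1 - 1) * 7 = 7,  h = w = (7 - 7) // 7 + 1 = 1
# i.e. a single 7x7 mean window -- the global average used by the converted models above
import numpy as np
import jittor as jt
from jittor import nn
gap = nn.AdaptiveAvgPool2d((1, 1))
y = gap(jt.array(np.random.random((2, 512, 7, 7)).astype('float32')))   # -> shape [2, 512, 1, 1]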

View File

@ -136,6 +136,11 @@ jt.mkl_ops.mkl_conv(x, w, 1, 2).sync()
da = jt.grad(a**2, a)
assert np.isnan(da.data).sum()==0, da.data
def test_tanh_nan(self):
m=jt.nn.Tanh()
a = m(jt.array([1000]))
assert np.isnan(a.data).sum()==0, a
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,103 @@
# ***************************************************************
# Copyright (c) 2020 Jittor. Authors:
# Wenyang Zhou <576825820@qq.com>
# Dun Liang <randonlang@gmail.com>.
# All Rights Reserved.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import unittest
import jittor as jt
import numpy as np
import jittor.models as jtmodels
try:
jt.dirty_fix_pytorch_runtime_error()
import torch
import torchvision.models as tcmodels
from torch import nn
except:
torch = None
skip_this_test = False
@unittest.skipIf(skip_this_test, "skip_this_test")
class test_models(unittest.TestCase):
@classmethod
def setUpClass(self):
self.models = [
'inception_v3',
'squeezenet1_0',
'squeezenet1_1',
'alexnet',
'resnet18',
'resnet34',
'resnet50',
'resnet101',
'resnet152',
'resnext50_32x4d',
'resnext101_32x8d',
'vgg11',
'vgg11_bn',
'vgg13',
'vgg13_bn',
'vgg16',
'vgg16_bn',
'vgg19',
'vgg19_bn',
'wide_resnet50_2',
'wide_resnet101_2',
'googlenet',
'mobilenet_v2',
'mnasnet0_5',
'mnasnet0_75',
'mnasnet1_0',
'mnasnet1_3',
'shufflenet_v2_x0_5',
'shufflenet_v2_x1_0',
'shufflenet_v2_x1_5',
'shufflenet_v2_x2_0',
]
@unittest.skipIf(not jt.has_cuda, "Cuda not found")
@jt.flag_scope(use_cuda=1)
def test_models(self):
def to_cuda(x):
if jt.has_cuda:
return x.cuda()
return x
threshold = 1e-2
# Define numpy input image
bs = 1
test_img = np.random.random((bs,3,224,224)).astype('float32')
# Define pytorch & jittor input image
pytorch_test_img = to_cuda(torch.Tensor(test_img))
jittor_test_img = jt.array(test_img)
for test_model in self.models:
if test_model == "inception_v3":
test_img = np.random.random((bs,3,300,300)).astype('float32')
pytorch_test_img = to_cuda(torch.Tensor(test_img))
jittor_test_img = jt.array(test_img)
# Define pytorch & jittor model
pytorch_model = to_cuda(tcmodels.__dict__[test_model]())
jittor_model = jtmodels.__dict__[test_model]()
# Set eval to avoid dropout layer
pytorch_model.eval()
jittor_model.eval()
# Jittor loads pytorch parameters to ensure forward alignment
jittor_model.load_parameters(pytorch_model.state_dict())
# Compare pytorch & jittor forward outputs. If the relative difference is lower than threshold, this test passes.
pytorch_result = pytorch_model(pytorch_test_img)
jittor_result = jittor_model(jittor_test_img)
x = pytorch_result.detach().cpu().numpy() + 1
y = jittor_result.data + 1
relative_error = abs(x - y) / abs(y)
diff = relative_error.mean()
assert diff < threshold, f"[*] {test_model} forward fails..., Relative Error: {diff}"
print(f"[*] {test_model} forword passes with Relative Error {diff}")
print('all models pass test.')
if __name__ == "__main__":
unittest.main()
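The relative-error check in `test_models` above adds 1 to both outputs before dividing, which keeps the denominator away from zero when logits are near 0. A tiny numpy sketch of the same metric (hypothetical values, not taken from any real run):

```python
import numpy as np

pytorch_out = np.array([0.001, 2.0, -3.0], dtype="float32")
jittor_out  = np.array([0.0011, 2.01, -3.0], dtype="float32")

x = pytorch_out + 1
y = jittor_out + 1
relative_error = np.abs(x - y) / np.abs(y)
print(relative_error.mean())   # the test passes when this mean is below the 1e-2 threshold
```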

View File

@ -5,325 +5,233 @@
# ***************************************************************
import unittest
import jittor as jt
import math
import numpy as np
from jittor.utils.pytorch_converter import convert
import os
try:
jt.dirty_fix_pytorch_runtime_error()
import torch
from torch import nn
from jittor.utils import pytorch_converter
except:
torch = None
code="""
import torch.nn as nn
import torch.utils.model_zoo as model_zoo
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152']
model_urls = {
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes, out_planes, stride=1):
'''3x3 convolution with padding'''
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU()
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.relu = nn.ReLU()
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000):
self.inplanes = 64
super(ResNet, self).__init__()
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU()
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.avgpool = nn.AvgPool2d(7, stride=1)
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out')
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def resnet18(pretrained=False, **kwargs):
'''Constructs a ResNet-18 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
'''
model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
return model
def resnet34(pretrained=False, **kwargs):
'''Constructs a ResNet-34 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
'''
model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
return model
def resnet50(pretrained=False, **kwargs):
'''Constructs a ResNet-50 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
'''
model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
return model
def resnet101(pretrained=False, **kwargs):
'''Constructs a ResNet-101 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
'''
model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
return model
def resnet152(pretrained=False, **kwargs):
'''Constructs a ResNet-152 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
'''
model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
return model
"""
@unittest.skipIf(torch is None, "pytorch not found.")
class TestPytorchConverter(unittest.TestCase):
def test_simple(self):
def model(c):
a = torch.Tensor([1,2,3,4,0])
b = a+a
b = b*2
b = b[:2]
a = a[1<a]
return a[0]+b[0]+c[0]
c = torch.Tensor([1,2,3])
r1 = model(c)
with pytorch_converter.trace_scope(["model"]):
r2 = model(c)
assert r1.numpy()==r2.numpy()
r3 = model(c)
assert r1.numpy() == r2.numpy() and r2.numpy() == r3.numpy(), (r1,r2,r3)
ans = """root in:[] out:[]
model in:[input_0] out:[out_11]
inj_torch_Tensor___init__ in:[array_1] out:[] args:[array_1, [1, 2, 3, 4, 0]]
inj_torch_Tensor___add__ in:[array_1, array_1] out:[out_2] args:[array_1, array_1]
inj_torch_Tensor___mul__ in:[out_2] out:[out_3] args:[out_2, 2]
inj_torch_Tensor___getitem__ in:[out_3] out:[out_4] args:[out_3, slice(None, 2, None)]
inj_torch_Tensor___gt__ in:[array_1] out:[out_5] args:[array_1, 1]
inj_torch_Tensor___getitem__ in:[array_1, out_5] out:[out_6] args:[array_1, out_5]
inj_torch_Tensor___getitem__ in:[out_6] out:[out_7] args:[out_6, 0]
inj_torch_Tensor___getitem__ in:[out_4] out:[out_8] args:[out_4, 0]
inj_torch_Tensor___add__ in:[out_7, out_8] out:[out_9] args:[out_7, out_8]
inj_torch_Tensor___getitem__ in:[input_0] out:[out_10] args:[input_0, 0]
inj_torch_Tensor___add__ in:[out_9, out_10] out:[out_11] args:[out_9, out_10]
model in:[input_0] out:[out_11] end
root in:[] out:[] end"""
ct = pytorch_converter.call_tree
assert str(ct) == ans
code = ct.to_jt()
lc = {}
exec(code, globals(), lc)
print(code)
jt_model = lc["model"]
assert jt_model(jt.array([1,2,3])).data == r1.numpy()
def test_resnet(self):
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None, dilation=1):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, dilation=dilation,
padding=dilation, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNet(nn.Module):
def __init__(self, block, layers=(3, 4, 23, 3)):
self.inplanes = 64
super(ResNet, self).__init__()
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, stride=1, dilation=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = [block(self.inplanes, planes, stride, downsample)]
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes, dilation=dilation))
return nn.Sequential(*layers)
def forward(self, x):
x1 = self.conv1(x)
x2 = self.bn1(x1)
x2 = self.relu(x2)
x2 = self.maxpool(x2)
x2 = self.layer1(x2)
x3 = self.layer2(x2)
x3 = self.layer3(x3)
x3 = self.layer4(x3)
return x1, x2, x3
return
resnet50 = ResNet(Bottleneck, [3, 4, 6, 3])
x = torch.Tensor(np.random.rand(2, 3, 224, 224))
with pytorch_converter.trace_scope():
y = resnet50(x)
ct = pytorch_converter.call_tree
code = ct.to_jt()
print(code)
def test_convert_batchnorm(self):
class TestModel(nn.Module):
def __init__(self):
super(TestModel, self).__init__()
self.bn1 = nn.BatchNorm2d(64)
self.bn2 = nn.BatchNorm2d(64)
self.bn3 = nn.BatchNorm2d(64)
def forward(self, x):
y = self.bn1(x)
z = self.bn2(x*x)
x = self.bn3(y+z)
return x
model = TestModel()
""" test_code:
x = torch.Tensor(np.random.rand(16, 64, 15, 15).astype("float32"))
jt_array = jt.array(x.numpy())
jt_result = jt.nn.batch_norm(jt_array, is_train=False, eps=1e-05, momentum=0.1)
torch_result = nn.BatchNorm2d(64)(x)
"""
x = torch.Tensor(np.random.rand(16, 64, 15, 15).astype("float32"))
with pytorch_converter.trace_scope():
y = model(x)
ct = pytorch_converter.call_tree
ans = """root in:[] out:[]
TestModel.forward in:[input_0] out:[out_27] args:{'self': TestModel(
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
), 'x': input_0}
BatchNorm2d.forward in:[input_0] out:[out_7] args:{'self': BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), 'input': input_0}
functional.batch_norm in:[input_0, out_3, out_4, out_5, out_6] out:[out_7]
BatchNorm2d.forward in:[input_0] out:[out_7] args:{'self': BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), 'input': input_0} end
inj_torch_Tensor___mul__ in:[input_0, input_0] out:[out_10] args:[input_0, input_0]
BatchNorm2d.forward in:[out_10] out:[out_17] args:{'self': BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), 'input': out_10}
functional.batch_norm in:[out_10, out_13, out_14, out_15, out_16] out:[out_17]
BatchNorm2d.forward in:[out_10] out:[out_17] args:{'self': BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), 'input': out_10} end
inj_torch_Tensor___add__ in:[out_7, out_17] out:[out_20] args:[out_7, out_17]
BatchNorm2d.forward in:[out_20] out:[out_27] args:{'self': BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), 'input': out_20}
functional.batch_norm in:[out_20, out_23, out_24, out_25, out_26] out:[out_27]
BatchNorm2d.forward in:[out_20] out:[out_27] args:{'self': BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), 'input': out_20} end
TestModel.forward in:[input_0] out:[out_27] args:{'self': TestModel(
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
), 'x': input_0} end
root in:[] out:[] end"""
assert str(ct) == ans
code = ct.to_jt()
lc = {}
exec(code, globals(), lc)
print(code)
def test_pytorch_converter(self):
name1 = os.path.join(jt.flags.cache_path, 'test_pytorch_converter_1.py')
print(f"save source code into {name1}")
with open(name1, 'w') as f:
f.write(code)
jt_model = lc["TestModel"]
assert (jt_model(jt.array(x.numpy())).data - y.detach().numpy()).mean() < 1e-5
ret = convert(code)
def test_convert_relu(self):
class TestModel(nn.Module):
def __init__(self):
super(TestModel, self).__init__()
self.rl1 = nn.ReLU(inplace=True)
self.rl2 = nn.ReLU(inplace=True)
self.rl3 = nn.ReLU(inplace=True)
def forward(self, x):
y = self.rl1(x)
z = self.rl2(x*x)
x = self.rl3(y+z)
return x
model = TestModel()
""" test_code:
x = torch.Tensor(np.random.rand(16, 64, 15, 15).astype("float32"))
jt_array = jt.array(x.numpy())
jt_result = jt.nn.batch_norm(jt_array, is_train=False, eps=1e-05, momentum=0.1)
torch_result = nn.BatchNorm2d(64)(x)
"""
x = torch.Tensor(np.random.rand(16, 3, 15, 15).astype("float32"))
with pytorch_converter.trace_scope():
y = model(x)
ct = pytorch_converter.call_tree
ans = """root in:[] out:[]
TestModel.forward in:[input_0] out:[out_8] args:{'self': TestModel(
(rl1): ReLU(inplace=True)
(rl2): ReLU(inplace=True)
(rl3): ReLU(inplace=True)
), 'x': input_0}
ReLU.forward in:[input_0] out:[input_0] args:{'self': ReLU(inplace=True), 'input': input_0}
functional.relu in:[input_0] out:[input_0]
ReLU.forward in:[input_0] out:[input_0] args:{'self': ReLU(inplace=True), 'input': input_0} end
inj_torch_Tensor___mul__ in:[input_0, input_0] out:[out_4] args:[input_0, input_0]
ReLU.forward in:[out_4] out:[out_4] args:{'self': ReLU(inplace=True), 'input': out_4}
functional.relu in:[out_4] out:[out_4]
ReLU.forward in:[out_4] out:[out_4] args:{'self': ReLU(inplace=True), 'input': out_4} end
inj_torch_Tensor___add__ in:[input_0, out_4] out:[out_8] args:[input_0, out_4]
ReLU.forward in:[out_8] out:[out_8] args:{'self': ReLU(inplace=True), 'input': out_8}
functional.relu in:[out_8] out:[out_8]
ReLU.forward in:[out_8] out:[out_8] args:{'self': ReLU(inplace=True), 'input': out_8} end
TestModel.forward in:[input_0] out:[out_8] args:{'self': TestModel(
(rl1): ReLU(inplace=True)
(rl2): ReLU(inplace=True)
(rl3): ReLU(inplace=True)
), 'x': input_0} end
root in:[] out:[] end"""
assert str(ct) == ans
code = ct.to_jt()
lc = {}
exec(code, globals(), lc)
print(code)
name2 = os.path.join(jt.flags.cache_path, 'test_pytorch_converter_2.py')
print(f"save destination code into {name2}")
with open(name2, 'w') as f:
f.write(ret)
jt_model = lc["TestModel"]
assert (jt_model(jt.array(x.numpy())).data == y.detach().numpy()).all()
from test_pytorch_converter_1 import resnet18 as torch_resnet18
from test_pytorch_converter_2 import resnet18 as jittor_resnet18
model_torch = torch_resnet18(False)
model_jittor = jittor_resnet18(False)
model_jittor.load_parameters(model_torch.state_dict())
def test_convert_pool(self):
class TestModel(nn.Module):
def __init__(self):
super(TestModel, self).__init__()
self.mp1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1, ceil_mode=False)
self.mp2 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1, ceil_mode=False)
self.mp3 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1, ceil_mode=False)
def forward(self, x):
y = self.mp1(x)
z = self.mp2(x*x)
x = self.mp3(y+z)
return x
model = TestModel()
x = torch.Tensor(np.random.rand(16, 3, 15, 15).astype("float32"))
with pytorch_converter.trace_scope():
y = model(x)
ct = pytorch_converter.call_tree
ans = """root in:[] out:[]
TestModel.forward in:[input_0] out:[out_11] args:{'self': TestModel(
(mp1): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
(mp2): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
(mp3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
), 'x': input_0}
MaxPool2d.forward in:[input_0] out:[out_1] args:{'self': MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False), 'input': input_0}
functional._max_pool2d in:[input_0] out:[out_1]
MaxPool2d.forward in:[input_0] out:[out_1] args:{'self': MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False), 'input': input_0} end
inj_torch_Tensor___mul__ in:[input_0, input_0] out:[out_5] args:[input_0, input_0]
MaxPool2d.forward in:[out_5] out:[out_6] args:{'self': MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False), 'input': out_5}
functional._max_pool2d in:[out_5] out:[out_6]
MaxPool2d.forward in:[out_5] out:[out_6] args:{'self': MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False), 'input': out_5} end
inj_torch_Tensor___add__ in:[out_1, out_6] out:[out_10] args:[out_1, out_6]
MaxPool2d.forward in:[out_10] out:[out_11] args:{'self': MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False), 'input': out_10}
functional._max_pool2d in:[out_10] out:[out_11]
MaxPool2d.forward in:[out_10] out:[out_11] args:{'self': MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False), 'input': out_10} end
TestModel.forward in:[input_0] out:[out_11] args:{'self': TestModel(
(mp1): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
(mp2): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
(mp3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
), 'x': input_0} end
root in:[] out:[] end"""
assert str(ct) == ans
code = ct.to_jt()
lc = {}
exec(code, globals(), lc)
print(code)
jt_model = lc["TestModel"]
assert (jt_model(jt.array(x.numpy())).data == y.detach().numpy()).all()
img = np.random.randn(1,3,224,224).astype("float32")
img_torch = torch.Tensor(img)
img_jittor = jt.array(img)
out_torch = model_torch(img_torch)
out_jittor = model_jittor(img_jittor)
assert abs((out_torch.cpu().detach().numpy() - out_jittor.data)).mean() < 1e-4
if __name__ == "__main__":
unittest.main()

View File

@ -1,264 +0,0 @@
# ***************************************************************
# Copyright (c) 2020 Jittor. Authors: Dun Liang <randonlang@gmail.com>. All Rights Reserved.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import unittest
import jittor as jt
import numpy as np
from jittor.utils.pytorch_converter2 import convert
import os
try:
jt.dirty_fix_pytorch_runtime_error()
import torch
from torch import nn
except:
torch = None
code="""
import torch.nn as nn
import torch.utils.model_zoo as model_zoo
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152']
model_urls = {
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
def conv3x3(in_planes, out_planes, stride=1):
'''3x3 convolution with padding'''
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000):
self.inplanes = 64
super(ResNet, self).__init__()
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.avgpool = nn.AvgPool2d(7, stride=1)
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def resnet18(pretrained=False, **kwargs):
'''Constructs a ResNet-18 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
'''
model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
return model
def resnet34(pretrained=False, **kwargs):
'''Constructs a ResNet-34 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
'''
model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
return model
def resnet50(pretrained=False, **kwargs):
'''Constructs a ResNet-50 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
'''
model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
return model
def resnet101(pretrained=False, **kwargs):
'''Constructs a ResNet-101 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
'''
model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
return model
def resnet152(pretrained=False, **kwargs):
'''Constructs a ResNet-152 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
'''
model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
return model
import numpy as np
import torch
import random
# setup random seed
def setup_seed(seed):
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True
"""
@unittest.skipIf(torch is None, "pytorch not found.")
class TestPytorchConverter2(unittest.TestCase):
def test_pytorch_converter2(self):
name1 = os.path.join(jt.flags.cache_path, 'test_pytorch_converter2_1.py')
print(f"save source code into {name1}")
with open(name1, 'w') as f:
f.write(code)
ret = convert(code)
name2 = os.path.join(jt.flags.cache_path, 'test_pytorch_converter2_2.py')
print(f"save destination code into {name2}")
with open(name2, 'w') as f:
f.write(ret)
from test_pytorch_converter2_1 import resnet18 as torch_resnet18
from test_pytorch_converter2_2 import resnet18 as jittor_resnet18
model_torch = torch_resnet18(False)
model_jittor = jittor_resnet18(False)
model_jittor.load_parameters(model_torch.state_dict())
img = np.random.randn(1,3,224,224).astype("float32")
img_torch = torch.Tensor(img)
img_jittor = jt.array(img)
out_torch = model_torch(img_torch)
out_jittor = model_jittor(img_jittor)
assert abs((out_torch.cpu().detach().numpy() - out_jittor.data)).mean() < 1e-4
if __name__ == "__main__":
unittest.main()

View File

@ -60,5 +60,16 @@ class TestReshapeOp(unittest.TestCase):
assert node_dict['a'] == node_dict['d']
assert node_dict['a'] == node_dict['e']
def test_view(self):
a = jt.ones([2,3,4])
assert a.view(2,-1).shape == [2,12]
def test_flatten(self):
a = jt.ones([2,3,4])
assert a.flatten().shape == [24]
assert a.flatten(1).shape == [2,12]
assert a.flatten(0,-2).shape == [6,4]
if __name__ == "__main__":
unittest.main()
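As a quick illustration of the semantics exercised by `test_view` and `test_flatten` above (a sketch, not part of the test file): `view`/`reshape` accept either a tuple or unpacked ints, and `flatten(start_dim, end_dim)` is a reshape that multiplies together the sizes of the dims in `[start_dim, end_dim]`.

```python
import jittor as jt

a = jt.ones([2, 3, 4])
assert a.view(2, -1).shape == [2, 12]
assert a.reshape((2, -1)).shape == [2, 12]     # tuple form is equivalent
assert a.flatten(1).shape == [2, 12]           # same as reshape(2, 3*4)
assert a.flatten(0, -2).shape == [6, 4]        # merges dims 0..1
```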

View File

@ -61,5 +61,10 @@ class TestTransposeOp(unittest.TestCase):
assert ((da-jda.data)<1e-5).all(), (da, jda.data, da-jda.data)
assert ((db-jdb.data)<1e-5).all(), (db-jdb.data)
def test_permute(self):
a = jt.ones([2,3,4])
assert a.permute().shape == [4,3,2]
assert a.permute(0,2,1).shape == [2,4,3]
if __name__ == "__main__":
unittest.main()

View File

@ -34,11 +34,19 @@ class TestUnaryOp(unittest.TestCase):
check("sqrt", a)
def test_grad(self):
ops = ["abs", "negative", "log", "exp", "sqrt"]
ops = ["abs", "negative", "log", "exp", "sqrt",
"sin", "arcsin", "sinh", "arcsinh",
"tan", "arctan", "tanh", "arctanh",
"cos", "arccos", "cosh", "arccosh",
]
a = [1.1, 2.2, 3.3, 4.4]
for op in ops:
if op == "abs":
b = np.array(a+[-1,])
elif op == "arccosh":
b = np.array(a)
elif "sin" in op or "cos" in op or "tan" in op:
b = np.array(a) / 5
else:
b = np.array(a)
func = lambda x: eval(f"np.{op}(x[0]).sum()")
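Besides comparing against numpy's analytic derivative as above, a simple numerical check for one of the newly added trigonometric ops could look like this (a sketch; it mirrors the `jt.grad(a**2, a)` usage elsewhere in the tests):

```python
import jittor as jt
import numpy as np

x = jt.array(np.array([0.2, 0.4, 0.6], dtype="float32"))
dx = jt.grad(jt.sin(x), x)                      # analytic gradient: cos(x)
eps = 1e-3
num = (np.sin(x.data + eps) - np.sin(x.data - eps)) / (2 * eps)
assert np.allclose(dx.data, num, atol=1e-3)
```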

View File

@ -24,7 +24,7 @@ skip_model_test = not model_test
class MnistNet(Module):
def __init__(self):
self.model = vgg.VGG16_bn()
self.model = vgg.vgg16_bn()
self.layer = nn.Linear(1000,10)
def execute(self, x):
x = self.model(x)

View File

@ -1,288 +1,461 @@
# ***************************************************************
# Copyright (c) 2020 Jittor. Authors: Dun Liang <randonlang@gmail.com>. All Rights Reserved.
# Copyright (c) 2020 Jittor. Authors:
# Wenyang Zhou <576825820@qq.com>
# Dun Liang <randonlang@gmail.com>.
# All Rights Reserved.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import sys
import contextlib
import os
import signal
import jittor as jt
jt.dirty_fix_pytorch_runtime_error()
import torch
import ast, astunparse
import numpy as np
class CallTree:
def __init__(self, parent, name):
self.parent = parent
self.name = name
self.children = []
self.input = []
self.output = []
self.args = None
if parent is not None:
parent.children.append(self)
pjmap = {
# ***************************************************************
# Module
# ***************************************************************
'Conv2d': {
'pytorch': {
'args': "in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros'"
},
'jittor': {
'module': 'nn',
'name': 'Conv',
'args': 'in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True'
},
'links': {},
'extras': {},
},
'ConvTranspose2d': {
'pytorch': {
'args': "in_channels, out_channels, kernel_size, stride=1, padding=0, output_padding=0, groups=1, bias=True, dilation=1, padding_mode='zeros'"
},
'jittor': {
'module': 'nn',
'name': 'ConvTranspose',
'args': 'in_channels, out_channels, kernel_size, stride=1, padding=0, output_padding=0, groups=1, bias=True, dilation=1'
},
'links': {},
'extras': {},
},
'MaxPool2d': {
'pytorch': {
'args': 'kernel_size, stride=None, padding=0, dilation=1, return_indices=False',
},
'jittor': {
'module': 'nn',
'name': 'Pool',
'args': 'kernel_size, stride=None, padding=0, dilation=None, return_indices=None, ceil_mode=False, op="maximum"'
},
'links': {},
'extras': {
"op": "'maximum'",
},
},
'AvgPool2d': {
'pytorch': {
'args': 'kernel_size, stride=None, padding=0, dilation=1, return_indices=False',
},
'jittor': {
'module': 'nn',
'name': 'Pool',
'args': 'kernel_size, stride=None, padding=0, dilation=None, return_indices=None, ceil_mode=False, op="maximum"'
},
'links': {},
'extras': {
"op": "'mean'",
},
},
'ReLU': {
'pytorch': {
'args': 'inplace=False',
},
'jittor': {
'module': 'nn',
'name': 'ReLU',
'args': ''
},
'links': {},
'extras': {},
},
'ReLU6': {
'pytorch': {
'args': 'inplace=False',
},
'jittor': {
'module': 'nn',
'name': 'ReLU6',
'args': ''
},
'links': {},
'extras': {},
},
'LeakyReLU': {
'pytorch': {
'args': 'negative_slope=0.01, inplace=False',
},
'jittor': {
'module': 'nn',
'name': 'LeakyReLU',
'args': 'scale'
},
'links': {'negative_slope': 'scale'},
'extras': {},
},
'BatchNorm2d': {
'pytorch': {
'args': 'num_features, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True',
},
'jittor': {
'module': 'nn',
'name': 'BatchNorm',
'args': 'num_features, eps=1e-5, momentum=0.1, affine=None, is_train=True'
},
'links': {},
'extras': {},
},
'kaiming_normal_': {
'pytorch': {
'args': "tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'",
},
'jittor': {
'module': 'init',
'name': 'relu_invariant_gauss_',
'args': 'var, mode="fan_in"'
},
'links': {'tensor': 'var'},
'extras': {},
},
'constant_': {
'pytorch': {
'args': "tensor, val",
},
'jittor': {
'module': 'init',
'name': 'constant_',
'args': 'var, value=0.0'
},
'links': {'tensor': 'var', 'val': 'value'},
'extras': {},
},
'normal_': {
'pytorch': {
'args': "tensor, mean=0.0, std=1.0",
},
'jittor': {
'module': 'init',
'name': 'gauss_',
'args': 'var, mean=0.0, std=1.0'
},
'links': {'tensor': 'var'},
'extras': {},
},
'cat': {
'pytorch': {
'args': "tensors, dim=0, out=None",
},
'jittor': {
'module': 'jt.contrib',
'name': 'concat',
'args': 'vars, dim=0'
},
'links': {'tensors': 'vars'},
'extras': {},
},
# ***************************************************************
# Convert format for functions which can be written as either torch.Tensor.xxx(...) or torch.xxx(torch.Tensor, ...)
# Example: x.reshape([2,3]) and torch.reshape(x, [2,3])
# ***************************************************************
'flatten': {
'pytorch': {
'prefix': ['torch'],
'args_prefix': 'input, start_dim=0, end_dim=-1',
'args': 'start_dim=0, end_dim=-1',
},
'jittor': {
'prefix': 'jt',
'module': '',
'name': 'flatten',
'args_prefix': 'input, start_dim=0, end_dim=-1',
'args': 'start_dim=0, end_dim=-1'
},
'links': {},
'extras': {},
},
'reshape': {
'pytorch': {
'prefix': ['torch'],
'args_prefix': 'input, shape',
'args': 'shape',
},
'jittor': {
'prefix': 'jt',
'module': '',
'name': 'reshape',
'args_prefix': 'input, shape',
'args': 'shape'
},
'links': {},
'extras': {},
},
'permute': {
'pytorch': {
'prefix': [],
'args_prefix': '',
'args': '*dim',
},
'jittor': {
'prefix': '',
'module': '',
'name': 'permute',
'args_prefix': '',
'args': '*dim'
},
'links': {},
'extras': {},
},
# Probably unnecessary if it is exactly the same
'view': {
'pytorch': {
'prefix': [],
'args_prefix': '',
'args': '*shape',
},
'jittor': {
'prefix': '',
'module': '',
'name': 'view',
'args_prefix': '',
'args': '*shape'
},
'links': {},
'extras': {},
}
}
def __str__(self):
ss = []
def dfs(v, depth):
s = " "*depth+f"{v.name} in:{v.input} out:{v.output}"
if v.args is not None:
s += f" args:{v.args}"
ss.append(s)
if len(v.children):
for c in v.children:
dfs(c, depth+1)
ss.append(s + " end")
dfs(self, 0)
return "\n".join(ss)
unsupport_ops = [
# ***************************************************************
# torch.nn
# ***************************************************************
'Parameter', 'ModuleList', 'ModuleDict', 'ParameterList', 'ParameterDict',
'Conv1d', 'Conv3d', 'ConvTranspose1d', 'ConvTranspose3d', 'Unfold', 'Fold',
'MaxPool1d', 'MaxPool3d', 'MaxUnpool1d', 'MaxUnpool2d', 'MaxUnpool3d', 'AvgPool1d', 'AvgPool3d', 'FractionalMaxPool2d', 'LPPool1d', 'LPPool2d', 'AdaptiveMaxPool1d', 'AdaptiveMaxPool2d', 'AdaptiveMaxPool3d', 'AdaptiveAvgPool1d', 'AdaptiveAvgPool3d',
'ReflectionPad1d', 'ReflectionPad2d', 'ReplicationPad1d', 'ReplicationPad2d', 'ReplicationPad3d', 'ZeroPad2d', 'ConstantPad1d', 'ConstantPad2d', 'ConstantPad3d', 'ELU', 'Hardshrink', 'Hardtanh', 'LogSigmoid', 'MultiheadAttention',
'PReLU', 'RReLU', 'SELU', 'CELU', 'GELU', 'Softplus', 'Softshrink', 'Softsign', 'Tanhshrink', 'Threshold', 'Softmin', 'Softmax2d', 'LogSoftmax', 'AdaptiveLogSoftmaxWithLoss', 'BatchNorm1d', 'BatchNorm3d', 'GroupNorm', 'SyncBatchNorm', 'InstanceNorm1d', 'InstanceNorm2d', 'InstanceNorm3d', 'LayerNorm', 'LocalResponseNorm', 'RNNBase', 'RNN', 'LSTM', 'GRU', 'RNNCell', 'LSTMCell', 'GRUCell', 'Transformer', 'TransformerEncoder', 'TransformerDecoder', 'TransformerEncoderLayer', 'TransformerDecoderLayer', 'Identity', 'Bilinear', 'Dropout2d', 'Dropout3d', 'AlphaDropout', 'Embedding', 'EmbeddingBag', 'CosineSimilarity', 'PairwiseDistance', 'L1Loss', 'MSELoss', 'CTCLoss', 'NLLLoss', 'PoissonNLLLoss', 'KLDivLoss', 'BCELoss', 'BCEWithLogitsLoss', 'MarginRankingLoss', 'HingeEmbeddingLoss', 'MultiLabelMarginLoss', 'SmoothL1Loss', 'SoftMarginLoss', 'MultiLabelSoftMarginLoss', 'CosineEmbeddingLoss', 'MultiMarginLoss', 'TripletMarginLoss', 'PixelShuffle', 'Upsample', 'UpsamplingNearest2d', 'UpsamplingBilinear2d', 'DataParallel', 'DistributedDataParallel', 'clip_grad_norm_', 'clip_grad_value_', 'parameters_to_vector', 'vector_to_parameters', 'BasePruningMethod', 'PruningContainer', 'Identity', 'RandomUnstructured', 'L1Unstructured', 'RandomStructured', 'LnStructured', 'CustomFromMask', 'identity', 'random_unstructured', 'l1_unstructured', 'random_structured', 'ln_structured', 'global_unstructured', 'custom_from_mask', 'remove', 'is_pruned', 'weight_norm', 'remove_weight_norm', 'spectral_norm', 'remove_spectral_norm', 'PackedSequence', 'pack_padded_sequence', 'pad_packed_sequence', 'pad_sequence', 'pack_sequence'
]
def to_jt(self):
defs = []
template = {
"add": "{0} + {1}",
"mul": "{0} * {1}",
"getitem": "{0}[{1}]",
"gt": "{0} > {1}",
}
def dfs(v):
if len(v.children)==0:
return
code = []
code.append(f"def {v.name.split('.')[0]}({','.join(map(str,v.input))}):")
for c in v.children:
# parse the argument into jittor code
# code.append(f" # {c.args}")
if c.name == "BatchNorm2d.forward":
bn = c.args["self"]
code.append(f" {c.output[0]} = jt.nn.batch_norm({c.input[0]}, is_train={bn.training}, eps={bn.eps}, momentum={bn.momentum})")
continue
if c.name == "ReLU.forward":
code.append(f" {c.output[0]} = jt.nn.relu({c.input[0]})")
continue
if c.name == "MaxPool2d.forward":
po = c.args["self"]
code.append(f" {c.output[0]} = jt.nn.pool({c.input[0]}, size={po.kernel_size}, op='maximum', padding={po.padding}, stride={po.stride})")
continue
if c.name == "Conv2d.forward":
mod = c.args["self"]
code.append(f" # {mod}")
assert mod.kernel_size[0] == mod.kernel_size[1]
assert mod.padding[0] == mod.padding[1]
assert mod.stride[0] == mod.stride[1]
assert mod.bias == False
code.append(f" {c.output[0]} = nn.conv({c.output[0]}, {mod.in_channels}, {mod.out_channels}, {mod.kernel_size[0]}, {mod.padding[0]}, {mod.stride[0]})")
continue
if c.name.startswith("inj"):
if c.name.endswith("__init__"):
code.append(f" {c.args[0]} = jt.array({c.args[1]})")
else:
assert c.name.startswith("inj_torch_Tensor___") and \
c.name.endswith("__")
name = c.name[19:-2]
if name in template:
code.append(f" {c.output[0]} = {template[name].format(*c.args)}")
else:
code.append(f" {c.output[0]} = __{name}__({', '.join(map(str,c.args))})")
else:
dfs(c)
out = ""
if len(c.output):
out = f"{','.join(map(str, c.output))} = "
code.append(f" {out}{c.name.split('.')[0]}({','.join(map(str,c.input))})")
if len(v.output):
code.append(f" return {','.join(map(str, v.output))}")
defs.extend(code)
dfs(self)
return "\n".join(defs)
support_ops = {}
for key in pjmap.keys():
module = pjmap[key]['jittor']['module']
name = pjmap[key]['jittor']['name']
if module == 'nn':
support_ops[key] = name
class TNode:
def __init__(self, s, v):
self.s = s
self.v = v
def __str__(self):
return self.s
def __repr__(self):
return self.s
def raise_unsupport(name):
raise RuntimeError(f'{name} is not supported in Jittor yet. We would appreciate it if you could provide an implementation of {name} and make a pull request at https://github.com/Jittor/jittor.')
trace_depth = 0
stack = []
g_vars = {}
g_var_id = 0
g_func_names = []
call_tree = CallTree(None, "root")
def push_stack(name=None, input=[]):
global trace_depth, call_tree
trace_depth += 1
if name is not None:
# Do not re record functional
if len(stack) and (
stack[-1][1].startswith("functional.") or
stack[-1][1].startswith("inj_")
):
return
call_tree = CallTree(call_tree, name)
call_tree.input = input
stack.append((trace_depth, name))
return call_tree
return None
def replace(a):
if hasattr(a, "attr") and a.attr in unsupport_ops:
raise_unsupport(a.attr)
def pop_stack(output=[]):
global trace_depth, call_tree
if len(stack) and stack[-1][0] == trace_depth:
stack.pop()
call_tree.output = output
call_tree = call_tree.parent
trace_depth -= 1
if hasattr(a, "id") and a.id in unsupport_ops:
raise_unsupport(a.id)
def trace_calls(frame, event, arg):
def dfs(obj, func):
if isinstance(obj, list):
for i,v in enumerate(obj):
dfs(v, func)
if isinstance(v, torch.Tensor):
obj[i] = g_vars[id(v)]
elif isinstance(obj, dict):
for k,v in obj.items():
if isinstance(v, tuple):
v = list(v)
obj[k] = v
dfs(v, func)
if isinstance(v, torch.Tensor):
obj[k] = g_vars[id(v)]
elif isinstance(obj, torch.Tensor):
func(obj)
global g_var_id
if event.endswith('call'):
co = frame.f_code
func_name = co.co_name
func_line_no = frame.f_lineno
func_filename = co.co_filename
args = "???"
t_values = []
if event == "c_call":
func_name = arg.__name__
else:
args = list(frame.f_locals.keys())
if "self" in frame.f_locals:
func_name = type(frame.f_locals["self"]).__name__ + "." + func_name
val = {k:frame.f_locals[k] for k in args}
def func(v):
global g_var_id
if id(v) not in g_vars:
if func_name.endswith("__init__"):
g_vars[id(v)] = TNode("array_"+str(g_var_id), v)
else:
g_vars[id(v)] = TNode("input_"+str(g_var_id), v)
g_var_id += 1
t_values.append(g_vars[id(v)])
dfs(val, func)
if hasattr(a, "attr"):
if a.attr in support_ops.keys(): a.attr = support_ops[a.attr]
# get arguments you want
if func_name.endswith(".forward"):
ct = push_stack(func_name, t_values)
ct.args = val
elif func_filename.endswith("functional.py"): # TODO: not stable
push_stack("functional."+func_name, t_values)
elif func_name.startswith("inj_"):
ct = push_stack(func_name, t_values)
ct.args = val["a"]
elif func_name in g_func_names:
push_stack(func_name, t_values)
if hasattr(a, "id"):
if a.id in support_ops.keys(): a.id = support_ops[a.id]
import_flag = []
def convert(code):
a = ast.parse(code)
dfs(a)
a.body.insert(0, ast.parse('import jittor as jt').body[0])
if 'init' not in import_flag:
a.body.insert(1, ast.parse('from jittor import init').body[0])
if 'nn' not in import_flag:
a.body.insert(2, ast.parse('from jittor import nn').body[0])
return astunparse.unparse(a)
def convert_(prefix, func_name, ags, kws):
info = pjmap[func_name]
p_prefix = info['pytorch']['prefix'] if 'prefix' in info['pytorch'].keys() else None
if p_prefix is not None and prefix in p_prefix:
p_ags = info['pytorch']['args_prefix']
j_ags = info['jittor']['args_prefix']
else:
p_ags = info['pytorch']['args']
j_ags = info['jittor']['args']
j_prefix = info['jittor']['prefix'] if 'prefix' in info['jittor'].keys() else None
j_module = info['jittor']['module']
j_name = info['jittor']['name']
links = info['links']
extras = info['extras']
jj_ags = []
jj_kws = {}
pp_ags = []
pp_kws = {}
if j_ags == '' and p_ags == '':
# no args in Pytorch and Jittor.
if p_prefix is None:
return f"{j_module}.{j_name}()"
else:
push_stack()
jt.LOG.vvvv("----"*trace_depth+f"call: {func_name}({args}){t_values} # {func_filename}:{func_line_no}")
elif event.endswith('return'):
ret = []
if event == "c_return":
jt.LOG.vvvv("----"*trace_depth+f"return {arg.__name__}: ???")
else:
co = frame.f_code
func_name = co.co_name
def func(arg):
global g_var_id
if id(arg) not in g_vars:
node = TNode(f"out_{g_var_id}", arg)
g_vars[id(arg)] = node
else:
node = g_vars[id(arg)]
ret.append(node)
g_var_id += 1
dfs(arg, func)
if "self" in frame.f_locals:
func_name = type(frame.f_locals["self"]).__name__ + "." + func_name
jt.LOG.vvvv("----"*trace_depth+f"return {func_name}: {ret}")
pop_stack(ret)
return trace_calls
@contextlib.contextmanager
def trace_scope(func_names=[]):
global g_func_names
g_func_names = func_names
with func_injection():
try:
global trace_depth, g_var_id
sys.settrace(trace_calls)
trace_depth = 1
stack.clear()
g_vars.clear()
call_tree.children.clear()
g_var_id = 0
yield
finally:
sys.settrace(None)
jt.LOG.v("="*20)
jt.LOG.v(call_tree)
@contextlib.contextmanager
def func_injection():
names = [
"torch.Tensor.__init__",
"torch.Tensor.__add__",
"torch.Tensor.__mul__",
"torch.Tensor.__sub__",
"torch.Tensor.__truediv__",
"torch.Tensor.__floordiv__",
"torch.Tensor.__getitem__",
# "torch.Tensor.__setitem__",
"torch.Tensor.__pow__",
"torch.Tensor.__mod__",
"torch.Tensor.__lt__",
"torch.Tensor.__le__",
"torch.Tensor.__gt__",
"torch.Tensor.__ge__",
"torch.Tensor.__eq__",
"torch.Tensor.__ne__",
"torch.Tensor.__lshift__",
"torch.Tensor.__rshift__",
"torch.Tensor.__and__",
"torch.Tensor.__or__",
"torch.Tensor.__xor__",
"torch.Tensor.__abs__",
"torch.Tensor.__neg__",
]
try:
global inject_prevs
inject_prevs = []
for name in names:
inject_prevs.append(eval(name))
for i, name in enumerate(names):
new_name = "inj_" + name.replace(".", "_")
if name.endswith("__getitem__"):
exec(f"def {new_name}(*a): return torch._C._TensorBase.__getitem__(a[0], a[1] if isinstance(a[1], tuple) else (a[1],))")
elif name.endswith("__init__"):
exec(f"def {new_name}(*a, **b): return None")
if prefix in p_prefix:
return f"{j_prefix}.{j_name}()"
else:
exec(f"def {new_name}(*a, **b): return inject_prevs[{i}](*a, **b)")
jt.LOG.v("inject", new_name)
exec(f"{name} = {new_name}")
yield
finally:
for i, name in enumerate(names):
prev = inject_prevs[i]
exec(f"{name} = prev")
torch.Tensor.__getitem__ = \
lambda s, a: torch._C._TensorBase.__getitem__(s, a if isinstance(a, tuple) else (a,))
return f"{prefix}.{j_name}()"
else:
j_ags = j_ags.replace(' ','').split(',')
for j_ag in j_ags:
if '=' in j_ag:
k,v = j_ag.split('=')
jj_kws[k] = v
else:
jj_ags.append(j_ag)
p_ags = p_ags.replace(' ','').split(',')
for p_ag in p_ags:
if '=' in p_ag:
k,v = p_ag.split('=')
pp_kws[k] = v
else:
pp_ags.append(p_ag)
if len(jj_ags) == 0 and len(pp_ags) != 0:
raise AttributeError(f"{func_name} in Jittor has no Attribute {pp_ags[0]}")
if len(pp_ags) > len(ags) + len(kws):
raise RuntimeError(f'The Pytorch {func_name} function requires {len(pp_ags) + len(list(pp_kws.keys()))} args, but you only provided {len(ags) + len(kws)}')
ags_ = []
for i in range(len(pp_ags)):
if i < len(ags):
if '*' in pp_ags[i]:
ags_.append('(' + ', '.join(ags[i:]) + ')')
ags = ags_
break
else:
ags_.append(ags[i])
else:
break
if len(pp_ags) + len(list(pp_kws.keys())) < len(ags) + len(kws):
raise RuntimeError(f'There are only {len(pp_ags) + len(list(pp_kws.keys()))} args in the Pytorch {func_name} function, but you provided {len(ags) + len(kws)}')
j_ags_flag = np.zeros(len(jj_ags))
j_ags_values = {}
j_kws_values = {}
for i,ag in enumerate(ags):
if len(pp_ags) == 0:
ag_name = list(pp_kws.keys())[i]
elif i < len(pp_ags):
ag_name = pp_ags[i]
elif i >= len(pp_ags) and (i-len(pp_ags)) <= len(list(pp_kws.keys())):
ag_name = list(pp_kws.keys())[i-len(pp_ags)]
else:
raise RuntimeError(f'The number of args does not match for {func_name}')
if ag_name in links.keys():
ag_name = links[ag_name]
if ag_name in jj_ags:
j_ags_flag[jj_ags.index(ag_name)] = 1
j_ags_values[str(jj_ags.index(ag_name))] = ag
elif ag_name in jj_kws.keys():
j_kws_values[ag_name] = ag
else:
raise AttributeError(f'{func_name} in Jittor has no Attribute {ag_name}')
for i,kw in enumerate(kws):
kw_name, kw_value = kw.split('=')
if kw_name in links.keys():
kw_name = links[kw_name]
if kw_name in jj_ags:
j_ags_flag[jj_ags.index(kw_name)] = 1
j_ags_values[str(jj_ags.index(kw_name))] = kw_value
elif kw_name in jj_kws.keys():
j_kws_values[kw_name] = kw_value
else:
raise AttributeError(f'{func_name} in Jittor has no Attribute {kw_name}')
len_jj_ags = len(jj_ags) if len(jj_ags) == 0 or jj_ags[0] != '' else 0
if j_ags_flag.sum() < len_jj_ags:
missing_args = []
for i in range(len(jj_ags)):
if j_ags_flag[i] == 0:
missing_args.append(jj_ags[i])
raise AttributeError(f"the needed args of {func_name} in Jittor is {', '.join(jj_ags)}, so you need to give value of {', '.join(missing_args)}.")
if extras:
for k in extras.keys():
if k in jj_ags:
j_ags_values[str(jj_ags.index(k))] = extras[k]
elif k in jj_kws.keys():
j_kws_values[k] = extras[k]
else:
raise AttributeError(f"there is not attribute named {k} in Jittor {func_name}, you should delete it in {func_name} extras.")
j_ags_ = [j_ags_values[str(i)] for i in range(len(list(j_ags_values.keys())))]
j_kws_ = [key + "=" + j_kws_values[key] for key in j_kws_values.keys()]
j_func = f"{j_module}.{j_name}({', '.join(j_ags_+j_kws_)})"
if p_prefix is None:
return f"{j_module}.{j_name}({', '.join(j_ags_+j_kws_)})"
else:
if prefix in p_prefix:
return f"{j_prefix}.{j_name}({', '.join(j_ags_+j_kws_)})"
else:
return f"{prefix}.{j_name}({', '.join(j_ags_+j_kws_)})"
return j_func
def dfs(a):
if isinstance(a, ast.Import):
if 'torch' in astunparse.unparse(a) and 'init' in astunparse.unparse(a):
import_flag.append('init')
return ast.parse('from jittor import init').body[0]
if 'torch' in astunparse.unparse(a) and 'nn' in astunparse.unparse(a):
import_flag.append('nn')
return ast.parse('from jittor import nn').body[0]
if a.names[0].name == 'torch':
return 'delete'
elif isinstance(a, ast.ImportFrom):
if 'torch' in a.module:
return 'delete'
elif isinstance(a, ast.Call):
for idx, ag in enumerate(a.args):
ret = dfs(ag)
if ret is not None:
a.args[idx] = ret
for idx, kw in enumerate(a.keywords):
ret = dfs(kw)
if ret is not None:
a.keywords[idx] = ret
func = astunparse.unparse(a.func).strip('\n').split('.')
prefix = '.'.join(func[0:-1])
func_name = func[-1]
if func_name in unsupport_ops:
raise_unsupport(func_name)
if func_name in pjmap.keys():
ags = [astunparse.unparse(ag).strip('\n') for ag in a.args]
kws = [astunparse.unparse(kw).strip('\n') for kw in a.keywords]
ret = convert_(prefix, func_name, ags, kws)
return ast.parse(ret).body[0].value
if ".load_state_dict" in astunparse.unparse(a.func):
a.func.attr = 'load_parameters'
if astunparse.unparse(a.func).strip('\n').endswith(".size"):
ags = [astunparse.unparse(ag).strip('\n') for ag in a.args]
if len(ags) != 0:
con = astunparse.unparse(a.func).split('.size')[0] + '.shape[' + ','.join(ags) + ']'
else:
con = astunparse.unparse(a.func).replace('size', 'shape')
return ast.parse(con).body[0].value
elif isinstance(a, ast.Expr): pass
elif isinstance(a, ast.Attribute) or isinstance(a, ast.Name): replace(a)
elif isinstance(a, ast.FunctionDef):
if a.name == 'forward': a.name = 'execute'
if hasattr(a, '__dict__'):
for k in a.__dict__.keys():
if isinstance(a.__dict__[k], list):
delete_flag = []
for i,a_ in enumerate(a.__dict__[k]):
ret = dfs(a_)
if ret == 'delete':
delete_flag.append(True)
del a.__dict__[k][i]
continue
if ret is not None:
a.__dict__[k][i] = ret
delete_flag.append(False)
tmp = [a_ for i,a_ in enumerate(a.__dict__[k]) if delete_flag[i] == False]
a.__dict__[k] = tmp
else:
ret = dfs(a.__dict__[k])
if ret is not None:
a.__dict__[k] = ret
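A small end-to-end sketch of how the `convert` entry point above is typically driven (the module code below is a hypothetical example, not taken from the tests):

```python
from jittor.utils.pytorch_converter import convert

pytorch_code = """
import torch.nn as nn

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        return self.relu(self.conv(x))
"""

jittor_code = convert(pytorch_code)   # rewrites nn.Conv2d -> nn.Conv, forward -> execute, ...
print(jittor_code)
```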

View File

@ -1,117 +0,0 @@
# ***************************************************************
# Copyright (c) 2020 Jittor. Authors: Dun Liang <randonlang@gmail.com>. All Rights Reserved.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import ast, astunparse
def convert(code):
a = ast.parse(code)
a.body.insert(0, ast.parse('import jittor as jt').body[0])
a.body.insert(1, ast.parse('from jittor import init').body[0])
dfs(a)
return astunparse.unparse(a)
def replace(a):
if hasattr(a, "attr"):
if a.attr == "Conv2d": a.attr = "Conv"
if a.attr == "BatchNorm2d": a.attr = "BatchNorm"
if a.attr == "ReLU": a.attr = "Relu"
if a.attr == "AvgPool2d": a.attr = "Pool"
if a.attr == "MaxPool2d": a.attr = "Pool"
if hasattr(a, "id"):
if a.id == "Conv2d": a.id = "Conv"
if a.id == "BatchNorm2d": a.id = "BatchNorm"
if a.id == "ReLU": a.id = "Relu"
if a.id == "AvgPool2d": a.id = "Pool"
if a.id == "MaxPool2d": a.id = "Pool"
def dfs(a):
if isinstance(a, ast.Import):
if a.names[0].name == 'torch.nn' and a.names[0].asname == 'nn':
a.names[0].name = 'jittor.nn'
a.names[0].asname = 'nn'
elif isinstance(a, ast.ImportFrom):
if a.module == 'torch':
a.module = a.module.replace('torch', 'jittor')
return a
elif isinstance(a, ast.Call):
for idx, ag in enumerate(a.args):
ret = dfs(ag)
if ret is not None:
a.args[idx] = ret
for idx, kw in enumerate(a.keywords):
ret = dfs(kw)
if ret is not None:
a.keywords[idx] = ret
if ".load_state_dict" in astunparse.unparse(a.func):
a.func.attr = 'load_parameters'
if astunparse.unparse(a.func).startswith("torch.Tensor"):
a.func.value.id = 'jt'
a.func.attr = 'array'
if ".cat" in astunparse.unparse(a.func):
if len(a.args) == 1:
dim = a.keywords[0].value.n
else:
dim = a.args[1].n
if isinstance(a.args[0], ast.List):
objs = [elt.id for elt in a.args[0].elts]
con = 'jt.contrib.concat([' + ','.join(objs) + '], dim=' + str(dim) + ')'
else:
con = 'jt.contrib.concat(' + a.args[0].id + ', dim=' + str(dim) + ')'
return ast.parse(con).body[0].value
if "view" in astunparse.unparse(a.func):
ags = [astunparse.unparse(ag).strip('\n') for ag in a.args]
con = 'jt.reshape(' + a.func.value.id + ', [' + ','.join(ags) + '])'
return ast.parse(con).body[0].value
if "permute" in astunparse.unparse(a.func):
ags = [astunparse.unparse(ag).strip('\n') for ag in a.func.value.args]
con = 'jt.transpose(' + a.func.value.func.value.id + ', [' + ','.join(ags) + '])'
return ast.parse(con).body[0].value
if astunparse.unparse(a.func).strip('\n').endswith(".size"):
ags = [astunparse.unparse(ag).strip('\n') for ag in a.args]
con = astunparse.unparse(a.func).split('.size')[0] + '.shape[' + ','.join(ags) + ']'
return ast.parse(con).body[0].value
if astunparse.unparse(a.func).startswith("F."):
a.func.value.id = "nn"
return a
if "kaiming_normal_" in astunparse.unparse(a.func):
ag = astunparse.unparse(a.args[0]).strip('\n')
kws = {}
for kw in a.keywords:
tmp = astunparse.unparse(kw).split('=')
kws[tmp[0]] = tmp[1].strip('\n')
con = 'init.relu_invariant_gauss_(' + ag + ', mode=' + kws['mode'] + ')'
return ast.parse(con).body[0].value
if "constant_" in astunparse.unparse(a.func):
ags = [astunparse.unparse(ag).strip('\n') for ag in a.args]
con = 'init.constant_(' + ','.join(ags) + ')'
return ast.parse(con).body[0].value
if "ReLU" in astunparse.unparse(a.func):
a.args.clear()
a.keywords.clear()
elif "Conv2d" in astunparse.unparse(a.func):
pass
elif "AvgPool2d" in astunparse.unparse(a.func):
a.keywords.append(ast.keyword(arg='op', value=ast.Str(s='mean')))
elif "MaxPool2d" in astunparse.unparse(a.func):
a.keywords.append(ast.keyword(arg='op', value=ast.Str(s='maximum')))
for kw in a.keywords:
if kw.arg in ['return_indices', 'groups']:
kw.value = ast.NameConstant(value=None)
elif isinstance(a, ast.Expr): pass
elif isinstance(a, ast.Attribute) or isinstance(a, ast.Name): replace(a)
elif isinstance(a, ast.FunctionDef):
if a.name == 'forward': a.name = 'execute'
if hasattr(a, '__dict__'):
for k in a.__dict__.keys():
if isinstance(a.__dict__[k], list):
for i,a_ in enumerate(a.__dict__[k]):
ret = dfs(a_)
if ret is not None:
a.__dict__[k][i] = ret
else:
ret = dfs(a.__dict__[k])
if ret is not None:
a.__dict__[k] = ret

View File

@ -61,6 +61,18 @@ static unordered_set<string> unary_ops = {
"floor",
"ceil",
"cast",
"sin",
"asin",
"sinh",
"asinh",
"tan",
"atan",
"tanh",
"atanh",
"cos",
"acos",
"cosh",
"acosh",
};
static unordered_set<string> unary_float_ops = {

View File

@ -63,6 +63,19 @@ namespace jittor {
m(floor) \
m(ceil) \
m(cast) \
\
m(sin) \
m(asin) \
m(sinh) \
m(asinh) \
m(tan) \
m(atan) \
m(tanh) \
m(atanh) \
m(cos) \
m(acos) \
m(cosh) \
m(acosh) \
struct NanoString;
#define DECLEAR_NS(T) extern NanoString ns_##T;

View File

@ -7,6 +7,7 @@
#include "common.h"
#include "misc/nano_string.h"
#include "misc/nano_vector.h"
#include "pybind/py_var_tracer.h"
namespace jittor {
@ -105,8 +106,15 @@ struct Node {
list<output_t> _outputs;
#ifdef NODE_MEMCHECK
Node();
virtual ~Node();
inline Node() {
lived_nodes[(void*)this] = lived_nodes.size()+1;
registe_node_trace(this);
}
inline virtual ~Node() {
lived_nodes.erase((void*)this);
unregiste_node_trace(this);
}
#else
inline Node() {};
inline virtual ~Node() {};

View File

@ -49,6 +49,24 @@ static unordered_set<string> unary_ops = {
"round",
"floor",
"ceil",
"sin",
// @pybind(asin, arcsin)
"asin",
"sinh",
// @pybind(asinh, arcsinh)
"asinh",
"tan",
// @pybind(atan, arctan)
"atan",
"tanh",
// @pybind(atanh, arctanh)
"atanh",
"cos",
// @pybind(acos, arccos)
"acos",
"cosh",
// @pybind(acosh, arccosh)
"acosh",
};
UnaryOp::UnaryOp(Var* x, NanoString op) : x(x) {
@ -92,6 +110,79 @@ VarPtr UnaryOp::grad(Var* out, Var* dout, Var* v, int v_index) {
auto twoy = make_binary(two, y, ns_multiply);
return make_binary(dout, twoy, ns_divide);
}
// dsin(x) = cos(x)
if (ns == ns_sin)
return make_binary(dout, make_unary(x, ns_cos), ns_multiply);
// dcos(x) = -sin(x)
if (ns == ns_cos)
return make_binary(dout, make_unary(make_unary(x, ns_sin), ns_negative), ns_multiply);
// dtan(x) = 1/cos^2(x)
if (ns == ns_tan) {
auto one = make_number(1, x);
auto cosx = make_unary(x, ns_cos);
auto cos2x = make_binary(cosx, cosx, ns_multiply);
return make_binary(dout, cos2x, ns_divide);
}
// dasin(x) = 1/sqrt(1-x^2)
if (ns == ns_asin) {
auto one = make_number(1, x);
auto x2 = make_binary(x, x, ns_multiply);
x2 = make_binary(one, x2, ns_subtract);
x2 = make_unary(x2, ns_sqrt);
return make_binary(dout, x2, ns_divide);
}
// dacos(x) = -1/sqrt(1-x^2)
if (ns == ns_acos) {
auto one = make_number(1, x);
auto x2 = make_binary(x, x, ns_multiply);
x2 = make_binary(one, x2, ns_subtract);
x2 = make_unary(x2, ns_sqrt);
return make_unary(make_binary(dout, x2, ns_divide), ns_negative);
}
// datan(x) = 1/(x^2+1)
if (ns == ns_atan) {
auto one = make_number(1, x);
auto x2 = make_binary(x, x, ns_multiply);
x2 = make_binary(one, x2, ns_add);
return make_binary(dout, x2, ns_divide);
}
// dsinh(x) = cosh(x)
if (ns == ns_sinh)
return make_binary(dout, make_unary(x, ns_cosh), ns_multiply);
// dcosh(x) = sinh(x)
if (ns == ns_cosh)
return make_binary(dout, make_unary(x, ns_sinh), ns_multiply);
// dtanh(x) = 1/cosh^2(x)
if (ns == ns_tanh) {
auto cosx = make_unary(x, ns_cosh);
auto cos2x = make_binary(cosx, cosx, ns_multiply);
return make_binary(dout, cos2x, ns_divide);
}
// dasinh(x) = 1/sqrt(x^2+1)
if (ns == ns_asinh) {
auto one = make_number(1, x);
auto x2 = make_binary(x, x, ns_multiply);
x2 = make_binary(x2, one, ns_add);
x2 = make_unary(x2, ns_sqrt);
return make_binary(dout, x2, ns_divide);
}
// dacosh(x) = 1/sqrt(x^2-1)
if (ns == ns_acosh) {
auto one = make_number(1, x);
auto x2 = make_binary(x, x, ns_multiply);
x2 = make_binary(x2, one, ns_subtract);
x2 = make_unary(x2, ns_sqrt);
return make_binary(dout, x2, ns_divide);
}
// datanh(x) = 1/(1-x^2)
if (ns == ns_atanh) {
auto one = make_number(1, x);
auto x2 = make_binary(x, x, ns_multiply);
x2 = make_binary(one, x2, ns_subtract);
return make_binary(dout, x2, ns_divide);
}
return nullptr;
}
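A quick Python-side sanity check for two of the gradient rules implemented above (a sketch; it relies on the `sin`/`atan` bindings registered earlier in this diff):

```python
import jittor as jt
import numpy as np

x = jt.array(np.linspace(-0.9, 0.9, 7).astype("float32"))

# dsin(x)/dx == cos(x)
assert np.allclose(jt.grad(jt.sin(x), x).data, np.cos(x.data), atol=1e-4)

# datan(x)/dx == 1/(x^2+1)
assert np.allclose(jt.grad(jt.atan(x), x).data, 1 / (x.data ** 2 + 1), atol=1e-4)
```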

View File

@ -11,13 +11,30 @@ namespace jittor {
#define bitwise_not(T,x) (~(x))
#define negative(T,x) (-(x))
#ifdef JIT_cuda
// TODO: add float64 version
#define abs(T,x) ::abs(x)
#define log(T,x) ::log((T)(x))
#define exp(T,x) ::exp((T)(x))
#define sqrt(T,x) ::sqrt((T)(x))
#define log(T,x) ::logf((T)(x))
#define exp(T,x) ::expf((T)(x))
#define sqrt(T,x) ::sqrtf((T)(x))
#define round(T,x) ((T) ::roundf((x)))
#define floor(T,x) ((T) ::floorf((x)))
#define ceil(T,x) ((T) ::ceilf((x)))
#define sin(T,x) ((T) ::sinf((x)))
#define asin(T,x) ((T) ::asinf((x)))
#define sinh(T,x) ((T) ::sinhf((x)))
#define asinh(T,x) ((T) ::asinhf((x)))
#define cos(T,x) ((T) ::cosf((x)))
#define acos(T,x) ((T) ::acosf((x)))
#define cosh(T,x) ((T) ::coshf((x)))
#define acosh(T,x) ((T) ::acoshf((x)))
#define tan(T,x) ((T) ::tanf((x)))
#define atan(T,x) ((T) ::atanf((x)))
#define tanh(T,x) ((T) ::tanhf((x)))
#define atanh(T,x) ((T) ::atanhf((x)))
#else
#define abs(T,x) std::abs(x)
#define log(T,x) std::log((T)(x))
@ -26,7 +43,24 @@ namespace jittor {
#define round(T,x) ((T)std::round((x)))
#define floor(T,x) ((T)std::floor((x)))
#define ceil(T,x) ((T)std::ceil((x)))
#define sin(T,x) ((T) std::sin((x)))
#define asin(T,x) ((T) std::asin((x)))
#define sinh(T,x) ((T) std::sinh((x)))
#define asinh(T,x) ((T) std::asinh((x)))
#define cos(T,x) ((T) std::cos((x)))
#define acos(T,x) ((T) std::acos((x)))
#define cosh(T,x) ((T) std::cosh((x)))
#define acosh(T,x) ((T) std::acosh((x)))
#define tan(T,x) ((T) std::tan((x)))
#define atan(T,x) ((T) std::atan((x)))
#define tanh(T,x) ((T) std::tanh((x)))
#define atanh(T,x) ((T) std::atanh((x)))
#endif
#define cast(T,x) ((T)(x))
} // jittor

View File

@ -8,6 +8,7 @@
#include "pybind/py_var_tracer.h"
#include "misc/str_utils.h"
#include "op.h"
#include "var.h"
namespace py = pybind11;
using namespace pybind11::literals;

View File

@ -4,7 +4,7 @@
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#pragma once
#include "var.h"
#include "common.h"
namespace jittor {

View File

@ -62,6 +62,19 @@ struct VarHolder {
return var->name.c_str();
}
// @pyjt(size)
inline NanoVector size() {
if (var->num<0) sync();
return var->shape;
}
// @pyjt(size)
inline int64 size(int64 dim) {
if (var->num<0) sync();
ASSERT(dim>=0 && dim<var->shape.size()) << "dim is out of index";
return var->shape[dim];
}
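These two overloads back a PyTorch-style `size()` API on the Python side; a minimal usage sketch (assuming the `@pyjt(size)` bindings are exposed on `jt.Var` as written):

```python
import jittor as jt

x = jt.ones([2, 3, 4])
print(x.size())    # full shape, e.g. [2,3,4]
print(x.size(0))   # 2, the extent of dim 0
```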
// @pyjt(stop_grad)
// @attrs(return_self)
inline VarHolder* stop_grad() {