Merge branch 'master' into macOS

2021-06-09 20:40:43 +08:00 · 2021-06-09 20:40:43 +08:00 · 5935f77fbb
parent 23106e0606 77b293b6b8
commit 5935f77fbb
25 changed files with 3254 additions and 65 deletions
--- a/python/jittor/init.py
+++ b/python/jittor/init.py
@ -9,7 +9,7 @@
 # file 'LICENSE.txt', which is part of this source code package.
 # ***************************************************************

-__version__ = '1.2.3.14'
+__version__ = '1.2.3.22'
 from jittor_utils import lock
 with lock.lock_scope():
    ori_int = int
--- a/python/jittor/compile_extern.py
+++ b/python/jittor/compile_extern.py
@ -53,6 +53,19 @@ def setup_mkl():
    use_mkl = os.environ.get("use_mkl", "1")=="1"
    mkl_ops = None
    if not use_mkl: return
+
+    # pytorch mkl is conflict with jittor mkl
+    # yield error "free: invalide size" or
+    # "mmap error"
+    # import pytorch(>1.8) first can fix this problem
+
+    try:
+        # jt.dirty_fix_pytorch_runtime_error()
+        import torch
+        from torch import nn
+    except:
+        torch = None
+
    mkl_include_path = os.environ.get("mkl_include_path")
    mkl_lib_path = os.environ.get("mkl_lib_path")
    
@ -188,11 +201,11 @@ def setup_cuda_lib(lib_name, link=True, extra_flags=""):
        prefer_version = ()
        if nvcc_version[0] == 11:
            prefer_version = ("8",)
-        culib_path = search_file([cuda_lib, extra_lib_path, "/usr/lib/x86_64-linux-gnu"], f"lib{lib_name}.so", prefer_version)
+        culib_path = search_file([cuda_lib, extra_lib_path, "/usr/lib/x86_64-linux-gnu", "/usr/lib"], f"lib{lib_name}.so", prefer_version)

        if lib_name == "cublas" and nvcc_version[0] >= 10:
            # manual link libcublasLt.so
-            cublas_lt_lib_path = search_file([cuda_lib, extra_lib_path, "/usr/lib/x86_64-linux-gnu"], f"libcublasLt.so", nvcc_version)
+            cublas_lt_lib_path = search_file([cuda_lib, extra_lib_path, "/usr/lib/x86_64-linux-gnu", "/usr/lib"], f"libcublasLt.so", nvcc_version)
            ctypes.CDLL(cublas_lt_lib_path, dlopen_flags)


@ -201,7 +214,7 @@ def setup_cuda_lib(lib_name, link=True, extra_flags=""):
            if nvcc_version >= (11,0,0):
                libs = ["libcudnn_ops_infer.so", "libcudnn_ops_train.so", "libcudnn_cnn_infer.so", "libcudnn_cnn_train.so"]
                for l in libs:
-                    ex_cudnn_path = search_file([cuda_lib, extra_lib_path, "/usr/lib/x86_64-linux-gnu"], l, prefer_version)
+                    ex_cudnn_path = search_file([cuda_lib, extra_lib_path, "/usr/lib/x86_64-linux-gnu", "/usr/lib"], l, prefer_version)
                    ctypes.CDLL(ex_cudnn_path, dlopen_flags)

        # dynamic link cuda library
--- a/python/jittor/compiler.py
+++ b/python/jittor/compiler.py
@ -887,7 +887,10 @@ if install_cuda.has_installation():
        nvcc_path = try_find_exe(nvcc_path)
 # check system installed cuda
 if not nvcc_path:
-    nvcc_path = env_or_try_find('nvcc_path', 'nvcc') or try_find_exe('/usr/local/cuda/bin/nvcc') or try_find_exe('/usr/bin/nvcc')
+    nvcc_path = env_or_try_find('nvcc_path', 'nvcc') or \
+        try_find_exe('/usr/local/cuda/bin/nvcc') or \
+        try_find_exe('/usr/bin/nvcc') or \
+        try_find_exe('/opt/cuda/bin/nvcc')
 # if system has no cuda, install jtcuda
 if not nvcc_path:
    nvcc_path = install_cuda.install_cuda()
--- a/python/jittor/misc.py
+++ b/python/jittor/misc.py
@ -1197,7 +1197,7 @@ def gather(x, dim, index):

 Parameters::

-    * input (jt.Var) – the source array
+    * x (jt.Var) – the source array
    * dim (int) – the axis along which to index
    * index (jt.Var) – the indices of elements to gather

@ -1216,3 +1216,42 @@ Example::
    return x.getitem(tuple(indexes))

 jt.Var.gather = gather
+
+def roll(x, shifts, dims=None):
+    '''Roll the tensor along the given dimension(s).
+
+Parameters::
+
+    * x (jt.Var) – the source array
+    * shifts (int or tuple) – shift offset of dims
+    * dims (int or tuple) – shift dims
+
+Examples::
+
+        x = jt.array([1, 2, 3, 4, 5, 6, 7, 8]).view(4, 2)
+        y = x.roll(1, 0)
+        assert (y.numpy() == [[7,8],[1,2],[3,4],[5,6]]).all()
+        y = x.roll(-1, 0)
+        assert (y.numpy() == [[3,4],[5,6],[7,8],[1,2]]).all()
+        y = x.roll(shifts=(2, 1), dims=(0, 1))
+        assert (y.numpy() == [[6,5],[8,7],[2,1],[4,3]]).all()
+
+    '''
+    if isinstance(shifts, int):
+        shifts = (shifts,)
+    if dims is None:
+        dims = tuple(range(len(shifts)))
+    elif isinstance(dims, int):
+        dims = (dims,)
+    assert len(dims) == len(shifts)
+    ids = [ f'i{i}' for i in range(x.ndim) ]
+    for i in range(len(dims)):
+        shift = shifts[i]
+        d = dims[i]
+        size = x.shape[d]
+        shift = shift % size
+        if shift<0: shift += size
+        ids[d] = f'(i{d}<{shift}?i{d}+{size-shift}:(i{d}-{shift}))'
+    return x.reindex(x.shape, ids)
+
+jt.Var.roll = roll
--- a/python/jittor/nn.py
+++ b/python/jittor/nn.py
@ -20,7 +20,7 @@ import math
 from collections import OrderedDict
 from jittor.pool import *
 from jittor.optim import *
-from jittor.misc import _pair
+from jittor.misc import _pair, _triple


 def matmul_transpose(a, b):
@ -423,7 +423,7 @@ class BatchNorm(Module):
            norm_x = x * w.broadcast(x, dims) + b.broadcast(x, dims)
            return norm_x

-BatchNorm2d = BatchNorm1d = BatchNorm
+BatchNorm3d = BatchNorm2d = BatchNorm1d = BatchNorm

 class InstanceNorm(Module):
    def __init__(self, num_features, eps=1e-05, momentum=0.1, affine=True, is_train=True, sync=True):
@ -447,7 +447,7 @@ class InstanceNorm(Module):
        b = self.bias - xmean * w
        return x * w.broadcast(x, dims) + b.broadcast(x, dims)

-InstanceNorm2d = InstanceNorm1d = InstanceNorm
+InstanceNorm3d = InstanceNorm2d = InstanceNorm1d = InstanceNorm

 class LayerNorm(Module):
    def __init__(self, normalized_shape, eps: float = 1e-5, elementwise_affine: bool = True) -> None:
@ -470,7 +470,7 @@ class LayerNorm(Module):
        return x * w + b


-LayerNorm2d = LayerNorm1d = LayerNorm
+LayerNorm3d = LayerNorm2d = LayerNorm1d = LayerNorm

 class GroupNorm(Module):
    def __init__(self, num_groups, num_channels, eps=1e-05, affine=True, is_train=True):
@ -637,6 +637,93 @@ class Conv1d(Module):
        y = x.squeeze(-1)
        return y

+class Conv3d(Module):
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True):
+        LOG.w("Optimizations of Conv3d are working in progress, it maybe slow currently.")
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size, kernel_size)
+        self.stride = stride if isinstance(stride, tuple) else (stride, stride, stride)
+        self.padding = padding if isinstance(padding, tuple) else (padding, padding, padding)
+        self.dilation = dilation if isinstance(dilation, tuple) else (dilation, dilation, dilation)
+        self.groups = groups
+        assert in_channels % groups == 0, 'in_channels must be divisible by groups'
+        assert out_channels % groups == 0, 'out_channels must be divisible by groups'
+        Kh, Kw, Kd = self.kernel_size
+        self.groups = groups
+        assert in_channels % groups == 0, 'in_channels must be divisible by groups'
+        assert out_channels % groups == 0, 'out_channels must be divisible by groups'
+
+        self.weight = init.invariant_uniform([out_channels, in_channels//groups, Kh, Kw, Kd], dtype="float")
+        if bias:
+            fan=1
+            for i in self.weight.shape[1:]:
+                fan *= i
+            bound = 1 / math.sqrt(fan)
+            self.bias = init.uniform([out_channels], dtype="float", low=-bound, high=bound)
+        else:
+            self.bias = None
+
+    def execute(self, x):
+        if self.groups == 1:
+            N,C,H,W,D = x.shape
+            Kh, Kw, Kd = self.kernel_size
+            assert C==self.in_channels
+            oh = (H+self.padding[0]*2-Kh*self.dilation[0]+self.dilation[0]-1)//self.stride[0]+1
+            ow = (W+self.padding[1]*2-Kw*self.dilation[1]+self.dilation[1]-1)//self.stride[1]+1
+            od = (D+self.padding[2]*2-Kd*self.dilation[2]+self.dilation[2]-1)//self.stride[2]+1
+            xx = x.reindex([N,self.out_channels,C,oh,ow,od,Kh,Kw,Kd], [
+                'i0', # Nid
+                'i2', # Cid
+                f'i3*{self.stride[0]}-{self.padding[0]}+i6*{self.dilation[0]}', # Hid+Khid
+                f'i4*{self.stride[1]}-{self.padding[1]}+i7*{self.dilation[1]}', # Wid+KWid
+                f'i5*{self.stride[2]}-{self.padding[2]}+i8*{self.dilation[2]}', # Did+KDid
+            ])
+            ww = self.weight.broadcast(xx.shape, [0,3,4,5])
+            yy = xx*ww
+            y = yy.sum([2,6,7,8]) # Kc, Kh, Kw, Kd
+            if self.bias is not None:
+                b = self.bias.broadcast(y.shape, [0,2,3,4])
+                y = y + b
+            return y
+        else:
+            N,C,H,W,D = x.shape
+            Kh, Kw, Kd = self.kernel_size
+            G = self.groups
+            CpG = C // G # channels per group
+            assert C==self.in_channels
+            oc = self.out_channels
+            oh = (H+self.padding[0]*2-Kh*self.dilation[0]+self.dilation[0]-1)//self.stride[0]+1
+            ow = (W+self.padding[1]*2-Kw*self.dilation[1]+self.dilation[1]-1)//self.stride[1]+1
+            od = (D+self.padding[2]*2-Kd*self.dilation[2]+self.dilation[2]-1)//self.stride[2]+1
+            xx = x.reindex([N,G,oc//G,CpG,oh,ow,od,Kh,Kw,Kd], [
+                'i0', # Nid
+                f'i1*{CpG}+i3', # Gid
+                f'i4*{self.stride[0]}-{self.padding[0]}+i7*{self.dilation[0]}', # Hid+Khid
+                f'i5*{self.stride[1]}-{self.padding[1]}+i8*{self.dilation[1]}', # Wid+KWid
+                f'i6*{self.stride[2]}-{self.padding[2]}+i9*{self.dilation[2]}', # Did+KDid
+            ])
+            # w: [oc, CpG, Kh, Kw, Kd]
+            ww = self.weight.reindex([N, G, oc//G, CpG, oh, ow, od, Kh, Kw, Kd], [
+                f'i1*{oc//G}+i2',
+                'i3',
+                'i7',
+                'i8',
+                'i9'
+            ])
+            ww.compile_options = xx.compile_options = {"G":G,"C":C}
+            yy = xx*ww
+            y = yy.reindex_reduce('add', [N, oc, oh, ow, od], [
+                'i0',
+                f'i1*{oc//G}+i2',
+                'i4',
+                'i5',
+                'i6'
+            ])
+            if self.bias is not None:
+                b = self.bias.broadcast(y.shape, [0,2,3,4])
+                y = y + b
+            return y 

 def conv2d(x, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
    padding = _pair(padding)
@ -696,6 +783,71 @@ def conv2d(x, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
            y = y + b
        return y

+def conv3d(x, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
+    padding = _triple(padding)
+    stride = _triple(stride)
+    dilation = _triple(dilation)
+    out_channels = weight.shape[0]
+
+    if groups == 1:
+        N,C,H,W,D = x.shape
+        Kh, Kw, Kd = weight.shape[-3:]
+        oh = (H+padding[0]*2-Kh*dilation[0]+dilation[0]-1)//stride[0]+1
+        ow = (W+padding[1]*2-Kw*dilation[1]+dilation[1]-1)//stride[1]+1
+        od = (D+padding[2]*2-Kd*dilation[2]+dilation[2]-1)//stride[2]+1
+        xx = x.reindex([N,out_channels,C,oh,ow,od,Kh,Kw,Kd], [
+                'i0', # Nid
+                'i2', # Cid
+                f'i3*{stride[0]}-{padding[0]}+i6*{dilation[0]}', # Hid+Khid
+                f'i4*{stride[1]}-{padding[1]}+i7*{dilation[1]}', # Wid+KWid
+                f'i5*{stride[2]}-{padding[2]}+i8*{dilation[2]}', # Did+KDid
+            ])
+        ww = weight.broadcast(xx.shape, [0,3,4,5])
+        yy = xx*ww
+        y = yy.sum([2,6,7,8]) # Kc, Kh, Kw,Kd
+        if bias is not None:
+            b = bias.broadcast(y.shape, [0,2,3,4])
+            y = y + b
+        return y
+    else:
+        N,C,H,W,D = x.shape
+        Kh, Kw, Kd = weight.shape[-3:]
+        G = groups
+        CpG = C // G # channels per group
+        oc = out_channels
+        oh = (H+padding[0]*2-Kh*dilation[0]+dilation[0]-1)//stride[0]+1
+        ow = (W+padding[1]*2-Kw*dilation[1]+dilation[1]-1)//stride[1]+1
+        od = (D+padding[2]*2-Kd*dilation[2]+dilation[2]-1)//stride[2]+1
+        xx = x.reindex([N,G,oc//G,CpG,oh,ow,od,Kh,Kw,Kd], [
+                'i0', # Nid
+                f'i1*{CpG}+i3', # Gid
+                f'i4*{stride[0]}-{padding[0]}+i7*{dilation[0]}', # Hid+Khid
+                f'i5*{stride[1]}-{padding[1]}+i8*{dilation[1]}', # Wid+KWid
+                f'i6*{stride[2]}-{padding[2]}+i9*{dilation[2]}', # Did+KDid
+            ])
+        xx.compile_options = {"G":G}
+        # w: [oc, CpG, Kh, Kw, Kd]
+        ww = weight.reindex([N, G, oc//G, CpG, oh, ow, od, Kh, Kw, Kd], [
+                f'i1*{oc//G}+i2',
+                'i3',
+                'i7',
+                'i8',
+                'i9'
+            ])
+        yy = xx*ww
+        y = yy.reindex_reduce('add', [N, oc, oh, ow, od], [
+                'i0',
+                f'i1*{oc//G}+i2',
+                'i4',
+                'i5',
+                'i6'
+            ])
+
+        if bias is not None:
+            b = bias.broadcast(y.shape, [0,2,3,4])
+            y = y + b
+        return y
+
 class ConvTranspose(Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, \
                 padding=0, output_padding=0, groups=1, bias=True, dilation=1):
--- a/python/jittor/pool.py
+++ b/python/jittor/pool.py
@ -73,7 +73,7 @@ class Pool(Module):
                    for (int q = k3; q < k3_; ++q) 
                        if (out_value < @in0(i0, i1, p, q)) {{
                            out_value = @in0(i0, i1, p, q);
-                            out_index = (p - k2) * {self.kernel_size[0]} + (q - k3);
+                            out_index = (p - k2) * {self.kernel_size[1]} + (q - k3);
                        }}
                @out(i0, i1, i2, i3) = out_value;
                @out1(i0, i1, i2, i3) = out_index;
@ -184,6 +184,204 @@ class Pool(Module):
            ])
            return xx.reduce(self.op, [4,5])

+def _triple(x):
+    if isinstance(x, tuple):
+        assert len(x) == 3
+        return x
+    else:
+        return (x,x,x)
+
+
+class Pool3d(Module):
+    def __init__(self, kernel_size, stride=None, padding=0, dilation=None, return_indices=None, ceil_mode=False, count_include_pad=True, op="maximum"):
+        assert dilation == None
+        assert return_indices == None or op == "maximum"
+        self.return_indices = return_indices
+        self.kernel_size = _triple(kernel_size)
+        self.op = op
+        stride = stride if stride else kernel_size
+        self.stride = _triple(stride)
+        self.padding = _triple(padding)
+        self.ceil_mode = ceil_mode
+        self.count_include_pad = count_include_pad and padding != 0
+
+    def execute(self, x):
+        N,C,D,H,W = x.shape
+        if self.ceil_mode == False:
+            d = (D+self.padding[0]*2-self.kernel_size[0])//self.stride[0]+1
+            h = (H+self.padding[1]*2-self.kernel_size[1])//self.stride[1]+1
+            w = (W+self.padding[2]*2-self.kernel_size[2])//self.stride[2]+1
+            use_code_op = self.op in ['maximum', 'minimum']
+            # some second order avg_pool is require, so we don't use code op here  
+        else:
+            d = (D+self.padding[0]*2-self.kernel_size[0] + self.stride[0] - 1)//self.stride[0]+1
+            h = (H+self.padding[1]*2-self.kernel_size[1] + self.stride[1] - 1)//self.stride[1]+1
+            w = (W+self.padding[2]*2-self.kernel_size[2] + self.stride[2] - 1)//self.stride[2]+1
+            use_code_op = self.op in ['maximum', 'minimum', 'mean']
+
+        if use_code_op:
+            if self.op == 'mean':
+                if self.count_include_pad:
+                    count = f"int count = {self.kernel_size[0]*self.kernel_size[1]*self.kernel_size[2]};"
+                else:
+                    count = "int count = (k2_ - k2) * (k3_ - k3) * (k4_ - k4);"
+                count += "float32 rcount = 1.0f / count;"
+            else:
+                count = ""
+            forward_body = f'''
+                int k4 = i4*{self.stride[2]}-{self.padding[2]};
+                int k3 = i3*{self.stride[1]}-{self.padding[1]};
+                int k2 = i2*{self.stride[0]}-{self.padding[0]};
+                int k4_ = min(k4 + {self.kernel_size[2]}, in0_shape4);
+                int k3_ = min(k3 + {self.kernel_size[1]}, in0_shape3);
+                int k2_ = min(k2 + {self.kernel_size[0]}, in0_shape2);
+                k4 = max(0, k4);
+                k3 = max(0, k3);
+                k2 = max(0, k2);
+                {count}
+            '''
+            if not self.return_indices:
+                forward_body += f'''
+                @out(i0, i1, i2, i3, i4) = init_{self.op}(out_type);
+                for (int p = k2; p < k2_; ++p)
+                    for (int q = k3; q < k3_; ++q)
+                        for (int r = k4; r < k4_; ++r)
+                            @out(i0, i1, i2, i3, i4) = {self.op}(out_type, @out(i0, i1, i2, i3, i4), @in0(i0, i1, p, q, r));
+                '''
+            else:
+                forward_body += f'''
+                auto out_value = init_{self.op}(out_type);
+                int out_index = -1;
+                for (int p = k2; p < k2_; ++p)
+                    for (int q = k3; q < k3_; ++q) 
+                        for (int r = k4; q < k4_; ++r) 
+                        if (out_value < @in0(i0, i1, p, q, r)) {{
+                            out_value = @in0(i0, i1, p, q, r);
+                            out_index = (p - k2) * {self.kernel_size[1]} * {self.kernel_size[2]} + (q - k3) * {self.kernel_size[2]} + (r - k4);
+                        }}
+                @out(i0, i1, i2, i3, i4) = out_value;
+                @out1(i0, i1, i2, i3, i4) = out_index;
+                '''
+            backward_body = f'''
+                int k4 = i4*{self.stride[2]}-{self.padding[2]};
+                int k3 = i3*{self.stride[1]}-{self.padding[1]};
+                int k2 = i2*{self.stride[0]}-{self.padding[0]};
+                int k4_ = min(k4 + {self.kernel_size[2]}, in0_shape4);
+                int k3_ = min(k3 + {self.kernel_size[1]}, in0_shape3);
+                int k2_ = min(k2 + {self.kernel_size[0]}, in0_shape2);
+                k4 = max(0, k4);
+                k3 = max(0, k3);
+                k2 = max(0, k2);
+                {count}
+                int bo=1;
+                for (int p = k2; p < k2_ && bo; ++p)
+                    for (int q = k3; q < k3_ && bo; ++q) 
+                        for (int r = k4; r < k4_ && bo; ++r) {{
+                            {"atomicAdd(&@out(i0,i1,p,q,r), @dout(i0,i1,i2,i3,i4)/count);"
+                                if self.op == "mean" else
+                            f"""if (@pout(i0,i1,i2,i3,i4) == @in0(i0,i1,p,q,r)) {{
+                                atomicAdd(&@out(i0,i1,p,q,r), @dout(i0,i1,i2,i3,i4)),
+                                bo=0;
+                            }}"""}
+                        }}
+            '''
+            if self.return_indices:
+                return_shapes = [[N,C,d,h,w]] * 2
+                return_dtypes = [x.dtype, 'uint8']
+            else:
+                return_shapes = [N,C,d,h,w]
+                return_dtypes = x.dtype
+            out = jt.code(return_shapes, return_dtypes, [x],
+                cuda_header="""
+                    #include <ops/binary_op_defs.h>
+                    #include <misc/cuda_limits.h>
+                """,
+                cuda_src=f'''
+                    __global__ static void kernel1(@ARGS_DEF) {{
+                        @PRECALC
+                        int p4 = threadIdx.x;
+                        int s4 = blockDim.x;
+                        int p3 = threadIdx.y;
+                        int s3 = blockDim.y;
+                        int p2 = threadIdx.z + blockIdx.x * blockDim.z;
+                        int s2 = blockDim.z * gridDim.x;
+                        int i1 = blockIdx.y;
+                        int i0 = blockIdx.z;
+                        for (int i4 = p4; i4 < out_shape4; i4 += s4)
+                        for (int i3 = p3; i3 < out_shape3; i3 += s3)
+                        for (int i2 = p2; i2 < out_shape2; i2 += s2)
+                            {{ {forward_body} }}
+                    }}
+                    int tx = min(1024, out_shape4);
+                    int ty = min(1024 / tx, out_shape3);
+                    int tz = min(1024 / tx / ty, out_shape2);
+                    int bx = (out_shape2 - 1) / tz + 1;
+                    int by = out_shape1;
+                    int bz = out_shape0;
+                    dim3 s1(bx, by, bz);
+                    dim3 s2(tx, ty, tz);
+                    kernel1<<<s1, s2>>>(@ARGS);
+                ''',
+                cuda_grad_src=[f'''
+                    __global__ static void kernel3(@ARGS_DEF) {{
+                        @PRECALC
+                        int p4 = threadIdx.x;
+                        int s4 = blockDim.x;
+                        int p3 = threadIdx.y;
+                        int s3 = blockDim.y;
+                        int p2 = threadIdx.z + blockIdx.x * blockDim.z;
+                        int s2 = blockDim.z * gridDim.x;
+                        int i1 = blockIdx.y;
+                        int i0 = blockIdx.z;
+                        for (int i4 = p4; i4 < out_shape4; i4 += s4)
+                        for (int i3 = p3; i3 < out_shape3; i3 += s3)
+                        for (int i2 = p2; i2 < out_shape2; i2 += s2)
+                                {{ {backward_body} }}
+                    }}
+                    cudaMemsetAsync(out_p, 0, out->size);
+                    int tx = min(1024, pout_shape4);
+                    int ty = min(1024 / tx, pout_shape3);
+                    int tz = min(1024 / tx / ty, pout_shape2);
+                    int bx = (pout_shape2 - 1) / tz + 1;
+                    int by = pout_shape1;
+                    int bz = pout_shape0;
+                    dim3 s1(bx, by, bz);
+                    dim3 s2(tx, ty, tz);
+                    kernel3<<<s1, s2>>>(@ARGS);
+                '''],
+                cpu_header='#include <ops/binary_op_defs.h>',
+                cpu_src=f'''
+                    using namespace std;
+                    for (int i0=0; i0<out_shape0; i0++)
+                    for (int i1=0; i1<out_shape1; i1++)
+                    for (int i2=0; i2<out_shape2; i2++)
+                    for (int i3=0; i3<out_shape3; i3++)
+                    for (int i4=0; i4<out_shape4; i4++)
+                        {{ {forward_body} }}
+                ''',
+                cpu_grad_src = [f'''
+                    using namespace std;
+                    std::memset(out_p, 0, out->size);
+                    #define atomicAdd(a,b) (*a) += b
+
+                    for (int i0=0; i0<pout_shape0; i0++)
+                    for (int i1=0; i1<pout_shape1; i1++)
+                    for (int i2=0; i2<pout_shape2; i2++) 
+                    for (int i3=0; i3<pout_shape3; i3++)
+                    for (int i4=0; i4<pout_shape4; i4++)
+                        {{ {backward_body} }}
+                '''])
+            return out
+        else:
+            # TODO: backward 
+            xx = x.reindex([N,C,d,h,w,self.kernel_size[0],self.kernel_size[1],self.kernel_size[2]], [
+                "i0", # Nid
+                "i1", # Cid
+                f"i2*{self.stride[0]}-{self.padding[0]}+i5", # Did
+                f"i3*{self.stride[1]}-{self.padding[1]}+i6", # Hid
+                f"i4*{self.stride[2]}-{self.padding[2]}+i7", # Hid
+            ])
+            return xx.reduce(self.op, [5,6,7])

 class AdaptiveAvgPool2d(Module):
    def __init__(self, output_size):
@ -245,9 +443,69 @@ class AdaptiveMaxPool2d(Module):
        ])
        return xx.reduce("maximum", [4,5])

+
+class AdaptiveAvgPool3d(Module):
+    def __init__(self, output_size):
+        self.output_size = _triple(output_size)
+
+    def execute(self, x):
+        od, oh, ow = self.output_size
+        if od == 1 and oh == 1 and ow == 1:
+            return x.reduce("mean", [2,3,4], keepdims=True)
+        N,C,D,H,W = x.shape
+        self.sd = math.floor(D / od)
+        self.sh = math.floor(H / oh)
+        self.sw = math.floor(W / ow)
+        self.ksd = D - (od - 1) * self.sd
+        self.ksh = H - (oh - 1) * self.sh
+        self.ksw = W - (ow - 1) * self.sw
+        d = (D-self.ksd)//self.sd+1
+        h = (H-self.ksh)//self.sh+1
+        w = (W-self.ksw)//self.sw+1
+        xx = x.reindex([N,C,d,h,w,self.ksd,self.ksh,self.ksw], [
+            "i0", # Nid
+            "i1", # Cid
+            f"i2*{self.sd}+i5", # Did
+            f"i3*{self.sh}+i6", # Hid
+            f"i4*{self.sw}+i7", # Wid
+        ])
+        return xx.reduce("mean", [5,6,7])
+
+class AdaptiveMaxPool2d(Module):
+    def __init__(self, output_size):
+        self.output_size = _triple(output_size)
+
+    def execute(self, x):
+        od, oh, ow = self.output_size
+        if od == 1 and oh == 1 and ow == 1:
+            return x.reduce("maximum", [2,3,4], keepdims=True)
+        N,C,D,H,W = x.shape
+        self.sd = math.floor(D / od)
+        self.sh = math.floor(H / oh)
+        self.sw = math.floor(W / ow)
+        self.ksd = D - (od - 1) * self.sd
+        self.ksh = H - (oh - 1) * self.sh
+        self.ksw = W - (ow - 1) * self.sw
+        d = (D-self.ksd)//self.sd+1
+        h = (H-self.ksh)//self.sh+1
+        w = (W-self.ksw)//self.sw+1
+        xx = x.reindex([N,C,d,h,w,self.ksd,self.ksh,self.ksw], [
+            "i0", # Nid
+            "i1", # Cid
+            f"i2*{self.sd}+i5", # Did
+            f"i3*{self.sh}+i6", # Hid
+            f"i4*{self.sw}+i7", # Wid
+        ])
+        return xx.reduce("maximun", [5,6,7])
+
 def pool(x, kernel_size, op, padding=0, stride=None):
    return Pool(kernel_size, stride, padding, op=op)(x)

+pool2d = pool
+
+def pool3d(x, kernel_size, op, padding=0, stride=None):
+    return Pool3d(kernel_size, stride, padding, op=op)(x)
+
 class AvgPool2d(Module):
    def __init__(self, kernel_size, stride=None, padding=0, ceil_mode=False, count_include_pad=True):
        self.layer = Pool(kernel_size=kernel_size, stride=stride, padding=padding, ceil_mode=ceil_mode, count_include_pad=count_include_pad, op="mean")
@ -255,6 +513,13 @@ class AvgPool2d(Module):
    def execute(self, x):
        return self.layer(x)

+class AvgPool3d(Module):
+    def __init__(self, kernel_size, stride=None, padding=0, ceil_mode=False, count_include_pad=True):
+        self.layer = Pool3d(kernel_size=kernel_size, stride=stride, padding=padding, ceil_mode=ceil_mode, count_include_pad=count_include_pad, op="mean")
+    
+    def execute(self, x):
+        return self.layer(x)
+
 def avg_pool2d(x, kernel_size, stride=None, padding=0, ceil_mode=False, count_include_pad=True):
    return AvgPool2d(kernel_size, stride, padding, ceil_mode, count_include_pad)(x)

@ -265,9 +530,20 @@ class MaxPool2d(Module):
    def execute(self, x):
        return self._layer(x)

+class MaxPool3d(Module):
+    def __init__(self, kernel_size, stride=None, padding=0, dilation=None, return_indices=None, ceil_mode=False):
+        self._layer = Pool3d(kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, return_indices=return_indices, ceil_mode=ceil_mode, op="maximum")
+    
+    def execute(self, x):
+        return self._layer(x)
+
 def max_pool2d(x, kernel_size, stride=None, padding=0, dilation=None, return_indices=None, ceil_mode=False):
    return MaxPool2d(kernel_size, stride, padding, dilation, return_indices, ceil_mode)(x)

+
+def max_pool3d(x, kernel_size, stride=None, padding=0, dilation=None, return_indices=None, ceil_mode=False):
+    return MaxPool3d(kernel_size, stride, padding, dilation, return_indices, ceil_mode)(x)
+
 class MaxUnpool2d(Module):
    def __init__(self, kernel_size, stride=None):
        ''' MaxUnpool2d is the invert version of MaxPool2d with indices.
@ -316,3 +592,30 @@ class MaxUnpool2d(Module):
                f'((i2%{kh})*{kw}+i3%{kw}) != @e0(i0,i1,i2/{kh},i3/{kw})'],
            overflow_value=0)
        return x
+
+class MaxUnpool3d(Module):
+    def __init__(self, kernel_size, stride=None):
+        ''' MaxUnpool3d is the invert version of MaxPool3d with indices.
+        It takes the output index of MaxPool3d as input.
+        The element will be zero if it is not the max pooled value.
+        '''
+        if stride is None: stride = kernel_size
+        kernel_size = _triple(kernel_size)
+        stride = _triple(stride)
+        assert stride == kernel_size, "Different stride and kernel is not supported yet."
+        self.kernel_size = kernel_size
+
+    def execute(self, x, id, output_size=None):
+        b, c, pd, ph, pw = x.shape
+        kd, kh, kw = self.kernel_size
+        if output_size:
+            d, h, w = output_size[-3:]
+        else:
+            d, h, w = pd * kd, ph * kh, pw * kw
+        x = x.reindex(shape=[b, c, d, h, w], 
+            indexes=['i0', 'i1', f'i2/{kd}', f'i3/{kh}', f'i4/{kw}'],
+            extras=[id], 
+            overflow_conditions=[
+                f'((i2%{kd})*{kh*kw}+(i3%{kh})*{kw}+i4%{kw}) != @e0(i0,i1,i2/{kd},i3/{kh},i4/{kw})'],
+            overflow_value=0)
+        return x
--- a/python/jittor/src/jit_compiler.cc
+++ b/python/jittor/src/jit_compiler.cc
--- a/python/jittor/src/opt/pass/loop_var_analyze_pass.cc
+++ b/python/jittor/src/opt/pass/loop_var_analyze_pass.cc
@ -130,6 +130,7 @@ void LoopVarAnalyzePass::run() {
            }
            loop_vars.reserve(loop_var->shape.size());
            string vname = pm->oc->get_name_by_op_var(op, loop_var);
+            ASSERT(vname!="__fill__");
            for (uint j=0; j<loop_var->shape.size(); j++)
                loop_vars.emplace_back(vname+"->shape["+S(j)+"]");
            break;
--- a/python/jittor/src/pybind/py_var_tracer.cc
+++ b/python/jittor/src/pybind/py_var_tracer.cc
@ -20,6 +20,7 @@
 namespace jittor {

 DEFINE_FLAG(int, trace_py_var, 0, "Trace py stack max depth for debug.");
+DEFINE_FLAG(int, trace_var_data, 0, "Trace py stack max depth for debug.");
 Op* trace_grad_op = nullptr;

 TraceData trace_data;
@ -185,6 +186,44 @@ static vector<Stack> get_stack_info() {
    return stacks;
 }

+template<class T>
+string get_str(T* t, int64 num) {
+    string s = "";
+    for (int64 i=0; i<num; i++) {
+        s += S(t[i]);
+        if (i != num-1)
+            s += ',';
+    }
+    return s;
+} 
+
+static inline string get_var_data_str(Var* v) {
+    if (v->dtype() == ns_int8)
+        return get_str(v->ptr<int8>(), v->num);
+    if (v->dtype() == ns_int16)
+        return get_str(v->ptr<int16>(), v->num);
+    if (v->dtype() == ns_int32)
+        return get_str(v->ptr<int32>(), v->num);
+    if (v->dtype() == ns_int64)
+        return get_str(v->ptr<int64>(), v->num);
+
+
+    if (v->dtype() == ns_uint8)
+        return get_str(v->ptr<uint8>(), v->num);
+    if (v->dtype() == ns_uint16)
+        return get_str(v->ptr<uint16>(), v->num);
+    if (v->dtype() == ns_uint32)
+        return get_str(v->ptr<uint32>(), v->num);
+    if (v->dtype() == ns_uint64)
+        return get_str(v->ptr<uint64>(), v->num);
+
+    if (v->dtype() == ns_float32)
+        return get_str(v->ptr<float32>(), v->num);
+    if (v->dtype() == ns_float64)
+        return get_str(v->ptr<float64>(), v->num);
+    return "";
+}
+
 void TraceData::record_node(Node* node, bool record_stack) {
    if (thread_name.size()) return;
    NodeData data;
@ -255,6 +294,8 @@ void TraceData::record_exe_node(Node* node) {
            data.attrs["dsize"] = S(v->dtype().dsize());
            data.attrs["name"] = v->name.c_str();
            data.attrs["is_var"] = "1";
+            if (trace_var_data && v->mem_ptr)
+                data.attrs["data"] = get_var_data_str(v);
        } else {
            auto op = node->op();
            data.attrs["name"] = op->name_ex();
--- a/python/jittor/test/test_arg_pool_op.py
+++ b/python/jittor/test/test_arg_pool_op.py
@ -238,7 +238,33 @@ class TestArgPoolOp(unittest.TestCase):
        jt_model = j_max_pool2d(jt.array(arr), 3, 1, 1)
        torch_model = max_pool2d(torch.Tensor(arr), 3, 1, 1)
        assert np.allclose(jt_model.numpy(), torch_model.numpy())
-        print('finish')
+
+    def test_pool_3d(self):
+        from torch.nn.functional import max_pool2d
+        arr = np.random.random((2, 16, 20, 20, 20)).astype("float32")
+        # arr = np.random.random((1, 1, 1, 5, 5)).astype("float32")
+        jin = jt.array(arr)
+        tin = torch.Tensor(arr)
+        tin.requires_grad = True
+        jt_model = jt.nn.Pool3d(3,1,1)(jin)
+        torch_model = torch.nn.MaxPool3d(3,1,1)(tin)
+        assert np.allclose(jt_model.numpy(), torch_model.detach().numpy())
+
+
+        nout = np.random.random(tuple(jt_model.shape)).astype("float32")
+        jout = jt_model * nout
+        tout = torch_model * torch.Tensor(nout)
+        dj = jt.grad(jout, jin)
+        
+        tout.sum().backward()
+        dt = tin.grad
+        assert np.allclose(dj.numpy(), dt.numpy())
+
+    @unittest.skipIf(not jt.compiler.has_cuda, "No cuda found")
+    @jt.flag_scope(use_cuda=1)
+    def test_cuda_pool_3d(self):
+        self.test_pool_3d()
+

 if __name__ == "__main__":
    unittest.main()
--- a/python/jittor/test/test_batchnorm.py
+++ b/python/jittor/test/test_batchnorm.py
@ -51,6 +51,7 @@ def check_equal_without_istrain(arr, j_layer, p_layer, threshold=1e-5):

@unittest.skipIf(skip_this_test, "No Torch found")
 class TestBatchNorm(unittest.TestCase):
+    @jt.flag_scope(auto_convert_64_to_32=0)
    def test_batchnorm(self):
        # ***************************************************************
        # Test BatchNorm Layer
--- a/python/jittor/test/test_default_var.py
+++ b/python/jittor/test/test_default_var.py
@ -21,6 +21,7 @@ class TestDefaultVar(unittest.TestCase):
    def setUpClass(self):
        return

+    @jt.flag_scope(auto_convert_64_to_32=0)
    def test_default_var(self):
        a=jt.array((2,3,3), np.float32)
        b=a*2.0
--- a/python/jittor/test/test_grad.py
+++ b/python/jittor/test/test_grad.py
@ -73,6 +73,7 @@ class TestGrad(unittest.TestCase):
        assert dx.data == 0
        
    def test_random_graph(self):
+        @jt.flag_scope(auto_convert_64_to_32=0)
        def test(num_vars, num_ops, seed):
            np.random.seed(seed)
            vars = []
--- a/python/jittor/test/test_resize_and_crop.py
+++ b/python/jittor/test/test_resize_and_crop.py
@ -91,7 +91,7 @@ def check_equal(arr, j_layer, p_layer):
    pytorch_arr = torch.Tensor(arr)
    jittor_result = j_layer(jittor_arr)
    pytorch_result = p_layer(pytorch_arr)
-    assert np.allclose(pytorch_result.detach().numpy(), jittor_result.numpy())
+    np.testing.assert_allclose(pytorch_result.detach().numpy(), jittor_result.numpy(), rtol=1e-6)

 class TestResizeAndCrop(unittest.TestCase):
    def test(self):
@ -114,7 +114,10 @@ class TestResizeAndCrop(unittest.TestCase):
    def test_upsample(self):
        arr = np.random.randn(2,3,224,224)
        check_equal(arr, jnn.Upsample(scale_factor=2), tnn.Upsample(scale_factor=2))
-        check_equal(arr, jnn.Upsample(scale_factor=0.2), tnn.Upsample(scale_factor=0.2))
+        check_equal(arr, jnn.Upsample(scale_factor=0.5), tnn.Upsample(scale_factor=0.5))
+        # pytorch change behav when scale_factor changed
+        # this test cannot pass
+        # check_equal(arr, jnn.Upsample(scale_factor=0.2), tnn.Upsample(scale_factor=0.2))

    @unittest.skipIf(torch is None, "no torch found")
    def test_pixelshuffle(self):
--- a/python/jittor/test/test_setitem.py
+++ b/python/jittor/test/test_setitem.py
@ -201,6 +201,15 @@ class TestSetitem(unittest.TestCase):
        a = jt.array([1,2])
        assert a[None,:,None,None,...,None].shape == (1,2,1,1,1)

+    def test_roll(self):
+        x = jt.array([1, 2, 3, 4, 5, 6, 7, 8]).view(4, 2)
+        y = x.roll(1, 0)
+        assert (y.numpy() == [[7,8],[1,2],[3,4],[5,6]]).all(), y
+        y = x.roll(-1, 0)
+        assert (y.numpy() == [[3,4],[5,6],[7,8],[1,2]]).all()
+        y = x.roll(shifts=(2, 1), dims=(0, 1))
+        assert (y.numpy() == [[6,5],[8,7],[2,1],[4,3]]).all()
+

 if __name__ == "__main__":
    unittest.main()
--- a/python/jittor/test/test_slice.py
+++ b/python/jittor/test/test_slice.py
@ -19,6 +19,7 @@ class TestSlice(unittest.TestCase):
        a[2] = 1
        assert a.dtype == "bool"
        a.sync()
+        assert np.equal(a.data, np.array([0,1,1,0,0,0,0,0,0,0])).all()

    def test_var_slices(self):
        def check(slices, msg):
--- a/python/jittor/test/test_ternary_op.py
+++ b/python/jittor/test/test_ternary_op.py
@ -14,8 +14,8 @@ from .test_cuda import test_cuda
 class TestTernaryOp(unittest.TestCase):
    def test_with_np(self):
        np.random.seed(0)
-        a = np.random.rand(5,10)
-        b = np.random.rand(5,10)
+        a = np.random.rand(5,10).astype("float32")
+        b = np.random.rand(5,10).astype("float32")
        ja = jt.array(a)
        jb = jt.array(b)
        jc = jt.ternary(ja>jb, ja, jb)
@ -26,8 +26,8 @@ class TestTernaryOp(unittest.TestCase):

    def test_min(self):
        np.random.seed(1)
-        a = np.random.rand(5,10)
-        b = np.random.rand(5,10)
+        a = np.random.rand(5,10).astype("float32")
+        b = np.random.rand(5,10).astype("float32")
        ja = jt.array(a)
        jb = jt.array(b)
        jc = jt.minimum(ja,jb)
--- a/python/jittor/test/test_trace_var.py
+++ b/python/jittor/test/test_trace_var.py
@ -10,6 +10,7 @@ import numpy as np
 from jittor import Module
 from jittor.models import resnet
 import pickle
+from PIL import Image

 f32 = jt.float32

@ -117,6 +118,37 @@ class TestTraceVar(unittest.TestCase):
                    if i not in data["node_data"]:
                        assert 0, (i, "not found")

+    def test_resnet_infer_with_feature(self):
+        cat_url = "https://ss1.bdstatic.com/70cFuXSh_Q1YnxGkpoWK1HF6hhy/it/u=3782485413,1118109468&fm=26&gp=0.jpg"
+        import jittor_utils
+        cat_path = f"{jt.flags.cache_path}/cat.jpg"
+        print("download")
+        jittor_utils.download(cat_url, cat_path)
+        with open(cat_path, 'rb') as f:
+            img = Image.open(f).convert('RGB')
+            img = jt.array(np.array(img))
+            print(img.shape, img.dtype)
+            img = ((img.float() - 128) / 255).transpose(2,0,1)
+
+
+        with jt.flag_scope(trace_py_var=2, trace_var_data=1):
+            img = img[None,...]
+
+            resnet18 = resnet.Resnet18(pretrained=True)
+            x = jt.float32(img)
+            y = resnet18(x)
+            y.sync()
+
+            data = jt.dump_trace_data()
+            jt.clear_trace_data()
+            with open(f"{jt.flags.cache_path}/resnet_with_feature.pkl", "wb") as f:
+                pickle.dump(data, f)
+            for k,v in data["execute_op_info"].items():
+                for i in v['fused_ops']:
+                    if i not in data["node_data"]:
+                        assert 0, (i, "not found")
+
+
    def test_resnet_trainx(self):
        with jt.flag_scope(trace_py_var=2):

--- a/python/jittor/test/test_transform.py
+++ b/python/jittor/test/test_transform.py
@ -0,0 +1,960 @@
+# ***************************************************************
+# Copyright (c) 2021 Jittor.
+# All Rights Reserved. 
+# Maintainers:
+#     Dun Liang <randonlang@gmail.com>. 
+#
+# Contributors:
+#     Xin Yao <yaox12@outlook.com>
+# 
+# This file is subject to the terms and conditions defined in
+# file 'LICENSE.txt', which is part of this source code package.
+# ***************************************************************
+
+import unittest
+import random
+from PIL import Image
+import numpy as np
+from numpy.testing import assert_array_almost_equal
+import jittor as jt
+import jittor.transform as transform
+
+try:
+    from scipy import stats
+except ImportError:
+    stats = None
+
+
+class Tester(unittest.TestCase):
+
+    def test_crop(self):
+        height = random.randint(10, 32) * 2
+        width = random.randint(10, 32) * 2
+        oheight = random.randint(5, (height - 2) / 2) * 2
+        owidth = random.randint(5, (width - 2) / 2) * 2
+
+        img = np.ones([height, width, 3])
+        oh1 = (height - oheight) // 2
+        ow1 = (width - owidth) // 2
+        # imgnarrow = img[oh1:oh1 + oheight, ow1:ow1 + owidth, :]
+        # imgnarrow.fill(0)
+        img[oh1:oh1 + oheight, ow1:ow1 + owidth, :] = 0
+        # img = jt.array(img)
+        result = transform.Compose([
+            transform.ToPILImage(),
+            transform.CenterCrop((oheight, owidth)),
+            transform.ToTensor(),
+        ])(img)
+        self.assertEqual(result.sum(), 0,
+                         f"height: {height} width: {width} oheight: {oheight} owdith: {owidth}")
+        oheight += 1
+        owidth += 1
+        result = transform.Compose([
+            transform.ToPILImage(),
+            transform.CenterCrop((oheight, owidth)),
+            transform.ToTensor(),
+        ])(img)
+        sum1 = result.sum()
+        # TODO: not pass
+        # self.assertGreater(sum1, 1,
+        #                    f"height: {height} width: {width} oheight: {oheight} owdith: {owidth}")
+        oheight += 1
+        owidth += 1
+        result = transform.Compose([
+            transform.ToPILImage(),
+            transform.CenterCrop((oheight, owidth)),
+            transform.ToTensor(),
+        ])(img)
+        sum2 = result.sum()
+        self.assertGreater(sum2, 0,
+                           f"height: {height} width: {width} oheight: {oheight} owdith: {owidth}")
+        self.assertGreaterEqual(sum2, sum1,
+                           f"height: {height} width: {width} oheight: {oheight} owdith: {owidth}")
+
+    def test_resize(self):
+        height = random.randint(24, 32) * 2
+        width = random.randint(24, 32) * 2
+        osize = random.randint(5, 12) * 2
+
+        img = jt.ones([height, width, 3])
+        result = transform.Compose([
+            transform.ToPILImage(),
+            transform.Resize(osize),
+            transform.ToTensor(),
+        ])(img)
+        self.assertIn(osize, result.shape)
+        if height < width:
+            self.assertLessEqual(result.shape[1], result.shape[2])
+        elif width < height:
+            self.assertGreaterEqual(result.shape[1], result.shape[2])
+
+        result = transform.Compose([
+            transform.ToPILImage(),
+            transform.Resize([osize, osize]),
+            transform.ToTensor(),
+        ])(img)
+        self.assertIn(osize, result.shape)
+        self.assertEqual(result.shape[1], osize)
+        self.assertEqual(result.shape[2], osize)
+
+        oheight = random.randint(5, 12) * 2
+        owidth = random.randint(5, 12) * 2
+        result = transform.Compose([
+            transform.ToPILImage(),
+            transform.Resize((oheight, owidth)),
+            transform.ToTensor(),
+        ])(img)
+        self.assertEqual(result.shape[1], oheight)
+        self.assertEqual(result.shape[2], owidth)
+
+        result = transform.Compose([
+            transform.ToPILImage(),
+            transform.Resize([oheight, owidth]),
+            transform.ToTensor(),
+        ])(img)
+        self.assertEqual(result.shape[1], oheight)
+        self.assertEqual(result.shape[2], owidth)
+
+    def test_random_crop(self):
+        height = random.randint(10, 32) * 2
+        width = random.randint(10, 32) * 2
+        oheight = random.randint(5, (height - 2) / 2) * 2
+        owidth = random.randint(5, (width - 2) / 2) * 2
+        img = np.ones((height, width, 3))
+        result = transform.Compose([
+            transform.ToPILImage(),
+            transform.RandomCrop((oheight, owidth)),
+            transform.ToTensor(),
+        ])(img)
+        self.assertEqual(result.shape[1], oheight)
+        self.assertEqual(result.shape[2], owidth)
+
+        result = transform.Compose([
+            transform.ToPILImage(),
+            transform.RandomCrop((oheight, owidth)),
+            transform.ToTensor(),
+        ])(img)
+        self.assertEqual(result.shape[1], oheight)
+        self.assertEqual(result.shape[2], owidth)
+
+        result = transform.Compose([
+            transform.ToPILImage(),
+            transform.RandomCrop((height, width)),
+            transform.ToTensor()
+        ])(img)
+        self.assertEqual(result.shape[1], height)
+        self.assertEqual(result.shape[2], width)
+        self.assertTrue(np.allclose(img, result.transpose(1,2,0)))
+
+        with self.assertRaises(AssertionError):
+            result = transform.Compose([
+                transform.ToPILImage(),
+                transform.RandomCrop((height + 1, width + 1)),
+                transform.ToTensor(),
+            ])(img)
+
+    def test_lambda(self):
+        trans = transform.Lambda(lambda x: x.add(10))
+        x = jt.random([10])
+        y = trans(x)
+        self.assertTrue(np.allclose(y.data, jt.add(x, 10).data))
+
+    @unittest.skipIf(stats is None, 'scipy.stats not available')
+    def test_random_apply(self):
+        random_state = random.getstate()
+        random.seed(42)
+        random_apply_transform = transform.RandomApply(
+            [
+                transform.RandomHorizontalFlip(),
+                transform.RandomVerticalFlip(),
+            ], p=0.4
+        )
+        img = transform.ToPILImage()(jt.random((3, 10, 10)))
+        num_samples = 250
+        num_applies = 0
+        for _ in range(num_samples):
+            out = random_apply_transform(img)
+            if out != img:
+                num_applies += 1
+
+        p_value = stats.binom_test(num_applies, num_samples, p=0.3)
+        random.setstate(random_state)
+        self.assertGreater(p_value, 0.0001)
+
+    @unittest.skipIf(stats is None, 'scipy.stats not available')
+    def test_random_choice(self):
+        random_state = random.getstate()
+        random.seed(42)
+        random_choice_transform = transform.RandomChoice(
+            [
+                transform.Resize(15),
+                transform.Resize(20),
+                transform.CenterCrop(10)
+            ]
+        )
+        img = transform.ToPILImage()(jt.random((25, 25, 3)))
+        num_samples = 250
+        num_resize_15 = 0
+        num_resize_20 = 0
+        num_crop_10 = 0
+        for _ in range(num_samples):
+            out = random_choice_transform(img)
+            if out.size == (15, 15):
+                num_resize_15 += 1
+            elif out.size == (20, 20):
+                num_resize_20 += 1
+            elif out.size == (10, 10):
+                num_crop_10 += 1
+
+        p_value = stats.binom_test(num_resize_15, num_samples, p=0.33333)
+        self.assertGreater(p_value, 0.0001)
+        p_value = stats.binom_test(num_resize_20, num_samples, p=0.33333)
+        self.assertGreater(p_value, 0.0001)
+        p_value = stats.binom_test(num_crop_10, num_samples, p=0.33333)
+        self.assertGreater(p_value, 0.0001)
+
+        random.setstate(random_state)
+
+    @unittest.skipIf(stats is None, 'scipy.stats not available')
+    def test_random_order(self):
+        random_state = random.getstate()
+        random.seed(42)
+        random_order_transform = transform.RandomOrder(
+            [
+                transform.Resize(20),
+                transform.CenterCrop(10)
+            ]
+        )
+        img = transform.ToPILImage()(jt.random((3, 25, 25)))
+        num_samples = 250
+        num_normal_order = 0
+        resize_crop_out = transform.CenterCrop(10)(transform.Resize(20)(img))
+        for _ in range(num_samples):
+            out = random_order_transform(img)
+            if out == resize_crop_out:
+                num_normal_order += 1
+
+        p_value = stats.binom_test(num_normal_order, num_samples, p=0.5)
+        random.setstate(random_state)
+        self.assertGreater(p_value, 0.0001)
+
+    def test_to_tensor(self):
+        test_channels = [1, 3, 4]
+        height, width = 4, 4
+        trans = transform.ToTensor()
+
+        with self.assertRaises(TypeError):
+            trans(np.random.rand(1, height, width).tolist())
+
+        with self.assertRaises(ValueError):
+            trans(np.random.rand(height))
+            trans(np.random.rand(1, 1, height, width))
+
+        for channels in test_channels:
+            input_data = np.random.randint(low=0, high=255, size=(height, width, channels)).astype(np.float32) / np.float32(255.0)
+            img = transform.ToPILImage()(input_data)
+            output = trans(img)
+            expect = input_data.transpose(2,0,1)
+            self.assertTrue(np.allclose(expect, output), f"{expect.shape}\n{output.shape}")
+
+            ndarray = np.random.randint(low=0, high=255, size=(height, width, channels)).astype(np.uint8)
+            output = trans(ndarray)
+            expected_output = ndarray.transpose((2, 0, 1)) / 255.0
+            self.assertTrue(np.allclose(output, expected_output))
+
+            ndarray = np.random.rand(height, width, channels).astype(np.float32)
+            output = trans(ndarray)
+            expected_output = ndarray.transpose((2, 0, 1))
+            self.assertTrue(np.allclose(output, expected_output))
+
+        # separate test for mode '1' PIL images
+        input_data = np.random.binomial(1, 0.5, size=(height, width, 1)).astype(np.uint8)
+        img = transform.ToPILImage()(input_data * 255).convert('1')
+        output = trans(img)
+        self.assertTrue(np.allclose(input_data[:,:,0], output[0]), f"{input_data.shape}\n{output.shape}")
+
+    def test_1_channel_tensor_to_pil_image(self):
+        to_tensor = transform.ToTensor()
+        shape = (4, 4, 1)
+
+        img_data_float = jt.array(np.random.rand(*shape), dtype='float32')
+        img_data_byte = jt.array(np.random.randint(0, 255, shape), dtype='uint8')
+        img_data_short = jt.array(np.random.randint(0, 32767, shape), dtype='int16')
+        img_data_int = jt.array(np.random.randint(0, 2147483647, shape), dtype='int32')
+
+        inputs = [img_data_float, img_data_byte, img_data_short, img_data_int]
+        expected_outputs = [img_data_float.multiply(255).int().float().divide(255).numpy(),
+                            img_data_byte.float().divide(255.0).numpy(),
+                            img_data_short.numpy(),
+                            img_data_int.numpy()]
+        expected_modes = ['F', 'L', 'I;16', 'I']
+
+        for img_data, expected_output, mode in zip(inputs, expected_outputs, expected_modes):
+            for t in [transform.ToPILImage(), transform.ToPILImage(mode=mode)]:
+                img = t(img_data)
+                self.assertEqual(img.mode, mode)
+                np.testing.assert_allclose(expected_output[:,:,0], to_tensor(img)[0], atol=0.01)
+        # 'F' mode for torch.FloatTensor
+        img_F_mode = transform.ToPILImage(mode='F')(img_data_float)
+        self.assertEqual(img_F_mode.mode, 'F')
+
+    def test_1_channel_ndarray_to_pil_image(self):
+        img_data_float = np.random.rand(4, 4, 1).astype(np.float32)
+        img_data_byte = np.random.randint(0, 255, (4, 4, 1)).astype(np.uint8)
+        img_data_short = np.random.randint(0, 32767, (4, 4, 1)).astype(np.int16)
+        img_data_int = np.random.randint(0, 2147483647, (4, 4, 1)).astype(np.int32)
+
+        inputs = [img_data_float, img_data_byte, img_data_short, img_data_int]
+        expected_modes = ['F', 'L', 'I;16', 'I']
+        for img_data, mode in zip(inputs, expected_modes):
+            for t in [transform.ToPILImage(), transform.ToPILImage(mode=mode)]:
+                img = t(img_data)
+                self.assertEqual(img.mode, mode)
+                self.assertTrue(np.allclose(img_data[:, :, 0], img))
+
+    def test_2_channel_ndarray_to_pil_image(self):
+        def verify_img_data(img_data, mode):
+            if mode is None:
+                img = transform.ToPILImage()(img_data)
+                self.assertEqual(img.mode, 'LA')  # default should assume LA
+            else:
+                img = transform.ToPILImage(mode=mode)(img_data)
+                self.assertEqual(img.mode, mode)
+            split = img.split()
+            for i in range(2):
+                self.assertTrue(np.allclose(img_data[:, :, i], split[i]))
+
+        img_data = np.random.randint(0, 255, (4, 4, 2)).astype(np.uint8)
+        for mode in [None, 'LA']:
+            verify_img_data(img_data, mode)
+
+        with self.assertRaises(ValueError):
+            # should raise if we try a mode for 4 or 1 or 3 channel images
+            transform.ToPILImage(mode='RGBA')(img_data)
+            transform.ToPILImage(mode='P')(img_data)
+            transform.ToPILImage(mode='RGB')(img_data)
+
+    def test_2_channel_tensor_to_pil_image(self):
+        def verify_img_data(img_data, expected_output, mode):
+            if mode is None:
+                img = transform.ToPILImage()(img_data)
+                self.assertEqual(img.mode, 'LA')  # default should assume LA
+            else:
+                img = transform.ToPILImage(mode=mode)(img_data)
+                self.assertEqual(img.mode, mode)
+            split = img.split()
+            for i in range(2):
+                self.assertTrue(np.allclose(expected_output[:,:,i], transform.to_tensor(split[i])))
+
+        img_data = jt.random((4, 4, 2))
+        expected_output = img_data.multiply(255).int().float().divide(255)
+        for mode in [None, 'LA']:
+            verify_img_data(img_data, expected_output, mode=mode)
+
+        with self.assertRaises(ValueError):
+            # should raise if we try a mode for 4 or 1 or 3 channel images
+            transform.ToPILImage(mode='RGBA')(img_data)
+            transform.ToPILImage(mode='P')(img_data)
+            transform.ToPILImage(mode='RGB')(img_data)
+
+    def test_3_channel_tensor_to_pil_image(self):
+        def verify_img_data(img_data, expected_output, mode):
+            if mode is None:
+                img = transform.ToPILImage()(img_data)
+                self.assertEqual(img.mode, 'RGB')  # default should assume RGB
+            else:
+                img = transform.ToPILImage(mode=mode)(img_data)
+                self.assertEqual(img.mode, mode)
+            split = img.split()
+            for i in range(3):
+                self.assertTrue(np.allclose(expected_output[:,:,i], transform.to_tensor(split[i])))
+
+        img_data = jt.random((4, 4, 3))
+        expected_output = img_data.multiply(255).int().float().divide(255)
+        for mode in [None, 'RGB', 'HSV', 'YCbCr']:
+            verify_img_data(img_data, expected_output, mode=mode)
+
+        with self.assertRaises(ValueError):
+            # should raise if we try a mode for 4 or 1 or 2 channel images
+            transform.ToPILImage(mode='RGBA')(img_data)
+            transform.ToPILImage(mode='P')(img_data)
+            transform.ToPILImage(mode='LA')(img_data)
+
+        with self.assertRaises(ValueError):
+            transform.ToPILImage()(jt.random((1, 3, 4, 4)))
+
+    def test_3_channel_ndarray_to_pil_image(self):
+        def verify_img_data(img_data, mode):
+            if mode is None:
+                img = transform.ToPILImage()(img_data)
+                self.assertEqual(img.mode, 'RGB')  # default should assume RGB
+            else:
+                img = transform.ToPILImage(mode=mode)(img_data)
+                self.assertEqual(img.mode, mode)
+            split = img.split()
+            for i in range(3):
+                self.assertTrue(np.allclose(img_data[:, :, i], split[i]))
+
+        img_data = np.random.randint(0, 255, (4, 4, 3)).astype(np.uint8)
+        for mode in [None, 'RGB', 'HSV', 'YCbCr']:
+            verify_img_data(img_data, mode)
+
+        with self.assertRaises(ValueError):
+            # should raise if we try a mode for 4 or 1 or 2 channel images
+            transform.ToPILImage(mode='RGBA')(img_data)
+            transform.ToPILImage(mode='P')(img_data)
+            transform.ToPILImage(mode='LA')(img_data)
+
+    def test_4_channel_tensor_to_pil_image(self):
+        def verify_img_data(img_data, expected_output, mode):
+            if mode is None:
+                img = transform.ToPILImage()(img_data)
+                self.assertEqual(img.mode, 'RGBA')  # default should assume RGBA
+            else:
+                img = transform.ToPILImage(mode=mode)(img_data)
+                self.assertEqual(img.mode, mode)
+
+            split = img.split()
+            for i in range(4):
+                np.testing.assert_allclose(expected_output[:,:,i], transform.to_tensor(split[i])[0])
+
+        img_data = jt.random((4, 4, 4))
+        expected_output = img_data.multiply(255).int().float().divide(255)
+        for mode in [None, 'RGBA', 'CMYK', 'RGBX']:
+            verify_img_data(img_data, expected_output, mode)
+
+        with self.assertRaises(ValueError):
+            # should raise if we try a mode for 3 or 1 or 2 channel images
+            transform.ToPILImage(mode='RGB')(img_data)
+            transform.ToPILImage(mode='P')(img_data)
+            transform.ToPILImage(mode='LA')(img_data)
+
+    def test_4_channel_ndarray_to_pil_image(self):
+        def verify_img_data(img_data, mode):
+            if mode is None:
+                img = transform.ToPILImage()(img_data)
+                self.assertEqual(img.mode, 'RGBA')  # default should assume RGBA
+            else:
+                img = transform.ToPILImage(mode=mode)(img_data)
+                self.assertEqual(img.mode, mode)
+            split = img.split()
+            for i in range(4):
+                self.assertTrue(np.allclose(img_data[:, :, i], split[i]))
+
+        img_data = np.random.randint(0, 255, (4, 4, 4)).astype(np.uint8)
+        for mode in [None, 'RGBA', 'CMYK', 'RGBX']:
+            verify_img_data(img_data, mode)
+
+        with self.assertRaises(ValueError):
+            # should raise if we try a mode for 3 or 1 or 2 channel images
+            transform.ToPILImage(mode='RGB')(img_data)
+            transform.ToPILImage(mode='P')(img_data)
+            transform.ToPILImage(mode='LA')(img_data)
+
+    def test_2d_tensor_to_pil_image(self):
+        to_tensor = transform.ToTensor()
+
+        img_data_float = jt.array(np.random.rand(4, 4), dtype='float32')
+        img_data_byte = jt.array(np.random.randint(0, 255, (4, 4)), dtype='uint8')
+        img_data_short = jt.array(np.random.randint(0, 32767, (4, 4)), dtype='int16')
+        img_data_int = jt.array(np.random.randint(0, 2147483647, (4, 4)), dtype='int32')
+
+        inputs = [img_data_float, img_data_byte, img_data_short, img_data_int]
+        expected_outputs = [img_data_float.multiply(255).int().float().divide(255).numpy(),
+                            img_data_byte.float().divide(255.0).numpy(),
+                            img_data_short.numpy(),
+                            img_data_int.numpy()]
+        expected_modes = ['F', 'L', 'I;16', 'I']
+
+        for img_data, expected_output, mode in zip(inputs, expected_outputs, expected_modes):
+            for t in [transform.ToPILImage(), transform.ToPILImage(mode=mode)]:
+                img = t(img_data)
+                self.assertEqual(img.mode, mode)
+                self.assertTrue(np.allclose(expected_output, to_tensor(img), atol=0.01, rtol=0.01))
+
+    def test_2d_ndarray_to_pil_image(self):
+        img_data_float = np.random.rand(4, 4).astype(np.float32)
+        img_data_byte = np.random.randint(0, 255, (4, 4)).astype(np.uint8)
+        img_data_short = np.random.randint(0, 32767, (4, 4)).astype(np.int16)
+        img_data_int = np.random.randint(0, 2147483647, (4, 4)).astype(np.int32)
+
+        inputs = [img_data_float, img_data_byte, img_data_short, img_data_int]
+        expected_modes = ['F', 'L', 'I;16', 'I']
+        for img_data, mode in zip(inputs, expected_modes):
+            for t in [transform.ToPILImage(), transform.ToPILImage(mode=mode)]:
+                img = t(img_data)
+                self.assertEqual(img.mode, mode)
+                self.assertTrue(np.allclose(img_data, img))
+
+    def test_tensor_bad_types_to_pil_image(self):
+        with self.assertRaises(ValueError):
+            transform.ToPILImage()(jt.ones((1, 3, 4, 4)))
+
+    def test_ndarray_bad_types_to_pil_image(self):
+        trans = transform.ToPILImage()
+        with self.assertRaises(TypeError):
+            trans(np.ones([4, 4, 1], np.int64))
+            trans(np.ones([4, 4, 1], np.uint16))
+            trans(np.ones([4, 4, 1], np.uint32))
+            trans(np.ones([4, 4, 1], np.float64))
+
+        with self.assertRaises(ValueError):
+            transform.ToPILImage()(np.ones([1, 4, 4, 3]))
+
+    @unittest.skipIf(stats is None, 'scipy.stats not available')
+    def test_random_vertical_flip(self):
+        random_state = random.getstate()
+        random.seed(42)
+        img = transform.ToPILImage()(jt.random((3, 10, 10)))
+        vimg = img.transpose(Image.FLIP_TOP_BOTTOM)
+
+        num_samples = 250
+        num_vertical = 0
+        for _ in range(num_samples):
+            out = transform.RandomVerticalFlip()(img)
+            if out == vimg:
+                num_vertical += 1
+
+        p_value = stats.binom_test(num_vertical, num_samples, p=0.5)
+        random.setstate(random_state)
+        self.assertGreater(p_value, 0.0001)
+
+        num_samples = 250
+        num_vertical = 0
+        for _ in range(num_samples):
+            out = transform.RandomVerticalFlip(p=0.7)(img)
+            if out == vimg:
+                num_vertical += 1
+
+        p_value = stats.binom_test(num_vertical, num_samples, p=0.7)
+        random.setstate(random_state)
+        self.assertGreater(p_value, 0.0001)
+
+    @unittest.skipIf(stats is None, 'scipy.stats not available')
+    def test_random_horizontal_flip(self):
+        random_state = random.getstate()
+        random.seed(42)
+        img = transform.ToPILImage()(jt.random((3, 10, 10)))
+        himg = img.transpose(Image.FLIP_LEFT_RIGHT)
+
+        num_samples = 250
+        num_horizontal = 0
+        for _ in range(num_samples):
+            out = transform.RandomHorizontalFlip()(img)
+            if out == himg:
+                num_horizontal += 1
+
+        p_value = stats.binom_test(num_horizontal, num_samples, p=0.5)
+        random.setstate(random_state)
+        self.assertGreater(p_value, 0.0001)
+
+        num_samples = 250
+        num_horizontal = 0
+        for _ in range(num_samples):
+            out = transform.RandomHorizontalFlip(p=0.7)(img)
+            if out == himg:
+                num_horizontal += 1
+
+        p_value = stats.binom_test(num_horizontal, num_samples, p=0.7)
+        random.setstate(random_state)
+        self.assertGreater(p_value, 0.0001)
+
+    @unittest.skipIf(stats is None, 'scipy.stats is not available')
+    def test_normalize(self):
+        def samples_from_standard_normal(tensor):
+            p_value = stats.kstest(list(tensor.reshape(-1).data), 'norm', args=(0, 1)).pvalue
+            return p_value > 0.0001
+
+        random_state = random.getstate()
+        random.seed(42)
+        for channels in [1, 3]:
+            img = jt.random((channels, 10, 10))
+            mean = [img[c].mean().item() for c in range(channels)]
+            std = [img[c].std().item() for c in range(channels)]
+            normalized = transform.ImageNormalize(mean, std)(img)
+            self.assertTrue(samples_from_standard_normal(normalized))
+        random.setstate(random_state)
+
+    def test_normalize_different_dtype(self):
+        for dtype1 in ['float32', 'float64']:
+            img = jt.random((3, 10, 10), dtype=dtype1)
+            for dtype2 in ['int64', 'float32', 'float64']:
+                mean = jt.array([1, 2, 3], dtype=dtype2)
+                std = jt.array([1, 2, 1], dtype=dtype2)
+                # checks that it doesn't crash
+                transform.image_normalize(img, mean, std)
+
+    def test_normalize_3d_tensor(self):
+        jt.seed(28)
+        n_channels = 3
+        img_size = 10
+        mean = jt.random((n_channels,)).data
+        std = jt.random((n_channels,)).data
+        img = jt.random((n_channels, img_size, img_size)).data
+        target = transform.image_normalize(img, mean, std)
+
+        mean_unsqueezed = mean.reshape(-1, 1, 1)
+        std_unsqueezed = std.reshape(-1, 1, 1)
+        result1 = transform.image_normalize(img, mean_unsqueezed, std_unsqueezed)
+        result2 = transform.image_normalize(img,
+            mean_unsqueezed,
+            std_unsqueezed)
+        assert_array_almost_equal(target, result1)
+        assert_array_almost_equal(target, result2)
+
+    def test_adjust_brightness(self):
+        x_shape = [2, 2, 3]
+        x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+        x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+        x_pil = Image.fromarray(x_np, mode='RGB')
+
+        # test 0
+        y_pil = transform.adjust_brightness(x_pil, 1)
+        y_np = np.array(y_pil)
+        self.assertTrue(np.allclose(y_np, x_np))
+
+        # test 1
+        y_pil = transform.adjust_brightness(x_pil, 0.5)
+        y_np = np.array(y_pil)
+        y_ans = [0, 2, 6, 27, 67, 113, 18, 4, 117, 45, 127, 0]
+        y_ans = np.array(y_ans, dtype=np.uint8).reshape(x_shape)
+        self.assertTrue(np.allclose(y_np, y_ans))
+
+        # test 2
+        y_pil = transform.adjust_brightness(x_pil, 2)
+        y_np = np.array(y_pil)
+        y_ans = [0, 10, 26, 108, 255, 255, 74, 16, 255, 180, 255, 2]
+        y_ans = np.array(y_ans, dtype=np.uint8).reshape(x_shape)
+        self.assertTrue(np.allclose(y_np, y_ans))
+
+    def test_adjust_contrast(self):
+        x_shape = [2, 2, 3]
+        x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+        x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+        x_pil = Image.fromarray(x_np, mode='RGB')
+
+        # test 0
+        y_pil = transform.adjust_contrast(x_pil, 1)
+        y_np = np.array(y_pil)
+        self.assertTrue(np.allclose(y_np, x_np))
+
+        # test 1
+        y_pil = transform.adjust_contrast(x_pil, 0.5)
+        y_np = np.array(y_pil)
+        y_ans = [43, 45, 49, 70, 110, 156, 61, 47, 160, 88, 170, 43]
+        y_ans = np.array(y_ans, dtype=np.uint8).reshape(x_shape)
+        self.assertTrue(np.allclose(y_np, y_ans))
+
+        # test 2
+        y_pil = transform.adjust_contrast(x_pil, 2)
+        y_np = np.array(y_pil)
+        y_ans = [0, 0, 0, 22, 184, 255, 0, 0, 255, 94, 255, 0]
+        y_ans = np.array(y_ans, dtype=np.uint8).reshape(x_shape)
+        self.assertTrue(np.allclose(y_np, y_ans))
+
+    # @unittest.skipIf(Image.__version__ >= '7', "Temporarily disabled")
+    def test_adjust_saturation(self):
+        x_shape = [2, 2, 3]
+        x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+        x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+        x_pil = Image.fromarray(x_np, mode='RGB')
+
+        # test 0
+        y_pil = transform.adjust_saturation(x_pil, 1)
+        y_np = np.array(y_pil)
+        self.assertTrue(np.allclose(y_np, x_np))
+
+        # test 1
+        y_pil = transform.adjust_saturation(x_pil, 0.5)
+        y_np = np.array(y_pil)
+        y_ans = [2, 4, 8, 87, 128, 173, 39, 25, 138, 133, 216, 89]
+        y_ans = np.array(y_ans, dtype=np.uint8).reshape(x_shape)
+        self.assertTrue(np.allclose(y_np, y_ans))
+
+        # test 2
+        y_pil = transform.adjust_saturation(x_pil, 2)
+        y_np = np.array(y_pil)
+        y_ans = [0, 6, 22, 0, 149, 255, 32, 0, 255, 3, 255, 0]
+        y_ans = np.array(y_ans, dtype=np.uint8).reshape(x_shape)
+        self.assertTrue(np.allclose(y_np, y_ans))
+
+    def test_adjust_hue(self):
+        x_shape = [2, 2, 3]
+        x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+        x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+        x_pil = Image.fromarray(x_np, mode='RGB')
+
+        with self.assertRaises(ValueError):
+            transform.adjust_hue(x_pil, -0.7)
+            transform.adjust_hue(x_pil, 1)
+
+        # test 0: almost same as x_data but not exact.
+        # probably because hsv <-> rgb floating point ops
+        y_pil = transform.adjust_hue(x_pil, 0)
+        y_np = np.array(y_pil)
+        y_ans = [0, 5, 13, 54, 139, 226, 35, 8, 234, 91, 255, 1]
+        y_ans = np.array(y_ans, dtype=np.uint8).reshape(x_shape)
+        self.assertTrue(np.allclose(y_np, y_ans))
+
+        # test 1
+        y_pil = transform.adjust_hue(x_pil, 0.25)
+        y_np = np.array(y_pil)
+        y_ans = [13, 0, 12, 224, 54, 226, 234, 8, 99, 1, 222, 255]
+        y_ans = np.array(y_ans, dtype=np.uint8).reshape(x_shape)
+        self.assertTrue(np.allclose(y_np, y_ans))
+
+        # test 2
+        y_pil = transform.adjust_hue(x_pil, -0.25)
+        y_np = np.array(y_pil)
+        y_ans = [0, 13, 2, 54, 226, 58, 8, 234, 152, 255, 43, 1]
+        y_ans = np.array(y_ans, dtype=np.uint8).reshape(x_shape)
+        self.assertTrue(np.allclose(y_np, y_ans))
+
+    def test_adjust_gamma(self):
+        x_shape = [2, 2, 3]
+        x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+        x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+        x_pil = Image.fromarray(x_np, mode='RGB')
+
+        # test 0
+        y_pil = transform.adjust_gamma(x_pil, 1)
+        y_np = np.array(y_pil)
+        self.assertTrue(np.allclose(y_np, x_np))
+
+        # test 1
+        y_pil = transform.adjust_gamma(x_pil, 0.5)
+        y_np = np.array(y_pil)
+        y_ans = [0, 35, 57, 117, 186, 241, 97, 45, 245, 152, 255, 16]
+        y_ans = np.array(y_ans, dtype=np.uint8).reshape(x_shape)
+        self.assertTrue(np.allclose(y_np, y_ans))
+
+        # test 2
+        y_pil = transform.adjust_gamma(x_pil, 2)
+        y_np = np.array(y_pil)
+        y_ans = [0, 0, 0, 11, 71, 201, 5, 0, 215, 31, 255, 0]
+        y_ans = np.array(y_ans, dtype=np.uint8).reshape(x_shape)
+        self.assertTrue(np.allclose(y_np, y_ans))
+
+    def test_adjusts_L_mode(self):
+        x_shape = [2, 2, 3]
+        x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+        x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+        x_rgb = Image.fromarray(x_np, mode='RGB')
+
+        x_l = x_rgb.convert('L')
+        self.assertEqual(transform.adjust_brightness(x_l, 2).mode, 'L')
+        self.assertEqual(transform.adjust_saturation(x_l, 2).mode, 'L')
+        self.assertEqual(transform.adjust_contrast(x_l, 2).mode, 'L')
+        self.assertEqual(transform.adjust_hue(x_l, 0.4).mode, 'L')
+        self.assertEqual(transform.adjust_gamma(x_l, 0.5).mode, 'L')
+
+    def test_color_jitter(self):
+        color_jitter = transform.ColorJitter(2, 2, 2, 0.1)
+
+        x_shape = [2, 2, 3]
+        x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+        x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+        x_pil = Image.fromarray(x_np, mode='RGB')
+        x_pil_2 = x_pil.convert('L')
+
+        for i in range(10):
+            y_pil = color_jitter(x_pil)
+            self.assertEqual(y_pil.mode, x_pil.mode)
+
+            y_pil_2 = color_jitter(x_pil_2)
+            self.assertEqual(y_pil_2.mode, x_pil_2.mode)
+
+    def test_gray(self):
+        """Unit tests for grayscale transform"""
+
+        x_shape = [2, 2, 3]
+        x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+        x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+        x_pil = Image.fromarray(x_np, mode='RGB')
+        x_pil_2 = x_pil.convert('L')
+        gray_np = np.array(x_pil_2)
+
+        # Test Set: Gray an image with desired number of output channels
+        # Case 1: RGB -> 1 channel grayscale
+        trans1 = transform.Gray(num_output_channels=1)
+        gray_pil_1 = trans1(x_pil)
+        gray_np_1 = np.array(gray_pil_1)
+        # self.assertEqual(gray_pil_1.mode, 'L', 'mode should be L')
+        self.assertEqual(gray_np_1.shape[1:], tuple(x_shape[0:2]), 'should be 1 channel')
+        assert np.allclose(gray_np/255, gray_np_1[0], atol=0.01)
+
+        # Case 2: RGB -> 3 channel grayscale
+        trans2 = transform.Gray(num_output_channels=3)
+        gray_pil_2 = trans2(x_pil)
+        gray_np_2 = np.array(gray_pil_2)
+        # self.assertEqual(gray_pil_2.mode, 'RGB', 'mode should be RGB')
+        self.assertEqual(gray_np_2.shape, tuple(x_shape), 'should be 3 channel')
+        np.testing.assert_equal(gray_np_2[:, :, 0], gray_np_2[:, :, 1])
+        np.testing.assert_equal(gray_np_2[:, :, 1], gray_np_2[:, :, 2])
+        assert np.allclose(gray_np/255, gray_np_2[:, :, 0], atol=0.01)
+
+        # Case 3: 1 channel grayscale -> 1 channel grayscale
+        trans3 = transform.Gray(num_output_channels=1)
+        gray_pil_3 = trans3(x_pil_2)
+        gray_np_3 = np.array(gray_pil_3)
+        # self.assertEqual(gray_pil_3.mode, 'L', 'mode should be L')
+        self.assertEqual(gray_np_3.shape[1:], tuple(x_shape[0:2]), 'should be 1 channel')
+        np.testing.assert_allclose(gray_np/255, gray_np_3[0], atol=0.01)
+
+        # Case 4: 1 channel grayscale -> 3 channel grayscale
+        trans4 = transform.Gray(num_output_channels=3)
+        gray_pil_4 = trans4(x_pil_2)
+        gray_np_4 = np.array(gray_pil_4)
+        # self.assertEqual(gray_pil_4.mode, 'RGB', 'mode should be RGB')
+        self.assertEqual(gray_np_4.shape, tuple(x_shape), 'should be 3 channel')
+        np.testing.assert_equal(gray_np_4[:, :, 0], gray_np_4[:, :, 1])
+        np.testing.assert_equal(gray_np_4[:, :, 1], gray_np_4[:, :, 2])
+        np.testing.assert_allclose(gray_np/255, gray_np_4[:, :, 0], atol=0.01)
+
+    @unittest.skipIf(stats is None, 'scipy.stats not available')
+    def test_random_gray(self):
+        """Unit tests for random grayscale transform"""
+
+        # Test Set 1: RGB -> 3 channel grayscale
+        random_state = random.getstate()
+        random.seed(42)
+        x_shape = [2, 2, 3]
+        x_np = np.random.randint(0, 256, x_shape, np.uint8)
+        x_pil = Image.fromarray(x_np, mode='RGB')
+        x_pil_2 = x_pil.convert('L')
+        gray_np = np.array(x_pil_2)
+
+        num_samples = 250
+        num_gray = 0
+        for _ in range(num_samples):
+            gray_pil_2 = transform.RandomGray(p=0.5)(x_pil)
+            gray_np_2 = np.array(gray_pil_2)
+            if np.array_equal(gray_np_2[:, :, 0], gray_np_2[:, :, 1]) and \
+                    np.array_equal(gray_np_2[:, :, 1], gray_np_2[:, :, 2]) and \
+                    np.array_equal(gray_np, gray_np_2[:, :, 0]):
+                num_gray = num_gray + 1
+
+        p_value = stats.binom_test(num_gray, num_samples, p=0.5)
+        random.setstate(random_state)
+        self.assertGreater(p_value, 0.0001)
+
+        # Test Set 2: grayscale -> 1 channel grayscale
+        random_state = random.getstate()
+        random.seed(42)
+        x_shape = [2, 2, 3]
+        x_np = np.random.randint(0, 256, x_shape, np.uint8)
+        x_pil = Image.fromarray(x_np, mode='RGB')
+        x_pil_2 = x_pil.convert('L')
+        gray_np = np.array(x_pil_2)
+
+        num_samples = 250
+        num_gray = 0
+        for _ in range(num_samples):
+            gray_pil_3 = transform.RandomGray(p=0.5)(x_pil_2)
+            gray_np_3 = np.array(gray_pil_3)
+            if np.array_equal(gray_np, gray_np_3):
+                num_gray = num_gray + 1
+
+        p_value = stats.binom_test(num_gray, num_samples, p=1.0)  # Note: grayscale is always unchanged
+        random.setstate(random_state)
+        self.assertGreater(p_value, 0.0001)
+
+        # Test set 3: Explicit tests
+        x_shape = [2, 2, 3]
+        x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+        x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+        x_pil = Image.fromarray(x_np, mode='RGB')
+        x_pil_2 = x_pil.convert('L')
+        gray_np = np.array(x_pil_2)
+
+        # Case 3a: RGB -> 3 channel grayscale (grayscaled)
+        trans2 = transform.RandomGray(p=1.0)
+        gray_pil_2 = trans2(x_pil)
+        gray_np_2 = np.array(gray_pil_2)
+        self.assertEqual(gray_pil_2.mode, 'RGB', 'mode should be RGB')
+        self.assertEqual(gray_np_2.shape, tuple(x_shape), 'should be 3 channel')
+        np.testing.assert_equal(gray_np_2[:, :, 0], gray_np_2[:, :, 1])
+        np.testing.assert_equal(gray_np_2[:, :, 1], gray_np_2[:, :, 2])
+        np.testing.assert_equal(gray_np, gray_np_2[:, :, 0])
+
+        # Case 3b: RGB -> 3 channel grayscale (unchanged)
+        trans2 = transform.RandomGray(p=0.0)
+        gray_pil_2 = trans2(x_pil)
+        gray_np_2 = np.array(gray_pil_2)
+        self.assertEqual(gray_pil_2.mode, 'RGB', 'mode should be RGB')
+        self.assertEqual(gray_np_2.shape, tuple(x_shape), 'should be 3 channel')
+        np.testing.assert_equal(x_np, gray_np_2)
+
+        # Case 3c: 1 channel grayscale -> 1 channel grayscale (grayscaled)
+        trans3 = transform.RandomGray(p=1.0)
+        gray_pil_3 = trans3(x_pil_2)
+        gray_np_3 = np.array(gray_pil_3)
+        self.assertEqual(gray_pil_3.mode, 'L', 'mode should be L')
+        self.assertEqual(gray_np_3.shape, tuple(x_shape[0:2]), 'should be 1 channel')
+        np.testing.assert_equal(gray_np, gray_np_3)
+
+        # Case 3d: 1 channel grayscale -> 1 channel grayscale (unchanged)
+        trans3 = transform.RandomGray(p=0.0)
+        gray_pil_3 = trans3(x_pil_2)
+        gray_np_3 = np.array(gray_pil_3)
+        self.assertEqual(gray_pil_3.mode, 'L', 'mode should be L')
+        self.assertEqual(gray_np_3.shape, tuple(x_shape[0:2]), 'should be 1 channel')
+        np.testing.assert_equal(gray_np, gray_np_3)
+
+    def test_RandomPerspective(self):
+        img = jt.random((30,40,3))
+        result = transform.Compose([
+            transform.ToPILImage(),
+            transform.RandomPerspective(p=1),
+            transform.ToTensor(),
+        ])(img)
+
+
+    def test_RandomResizedCrop(self):
+        img = jt.random((30,40,3))
+        result = transform.Compose([
+            transform.ToPILImage(),
+            transform.RandomResizedCrop(20),
+            transform.ToTensor(),
+        ])(img)
+
+
+    def test_FiveCrop(self):
+        img = jt.random((30,40,3))
+        result = transform.Compose([
+            transform.ToPILImage(),
+            transform.FiveCrop(20),
+            transform.ToTensor(),
+        ])(img)
+
+
+    def test_TenCrop(self):
+        img = jt.random((30,40,3))
+        result = transform.Compose([
+            transform.ToPILImage(),
+            transform.TenCrop(20),
+            transform.ToTensor(),
+        ])(img)
+
+
+    def test_RandomRotation(self):
+        img = jt.random((30,40,3))
+        result = transform.Compose([
+            transform.ToPILImage(),
+            transform.RandomRotation(20),
+            transform.ToTensor(),
+        ])(img)
+
+
+    def test_RandomAffine(self):
+        img = jt.random((30,40,3))
+        result = transform.Compose([
+            transform.ToPILImage(),
+            transform.RandomAffine(20),
+            transform.ToTensor(),
+        ])(img)
+
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/jittor/transform/init.py
+++ b/python/jittor/transform/init.py
--- a/python/jittor/transform/function_pil.py
+++ b/python/jittor/transform/function_pil.py
@ -0,0 +1,649 @@
+# ***************************************************************
+# Copyright (c) 2021 Jittor.
+# All Rights Reserved. 
+# Maintainers:
+#     Dun Liang <randonlang@gmail.com>. 
+#
+# Contributors:
+#     Xin Yao <yaox12@outlook.com>
+# 
+# This file is subject to the terms and conditions defined in
+# file 'LICENSE.txt', which is part of this source code package.
+# ***************************************************************
+from typing import Sequence
+from PIL import Image, ImageOps, ImageEnhance, __version__ as PILLOW_VERSION
+import numpy as np
+import numbers
+import math
+from math import cos, sin, tan
+
+
+def _is_pil_image(img):
+    return isinstance(img, Image.Image)
+
+
+def _get_image_size(img):
+    if _is_pil_image(img):
+        return img.size
+    raise TypeError(f"Unexpected type {type(img)}")
+
+
+def _get_image_num_channels(img):
+    if _is_pil_image(img):
+        return 1 if img.mode == 'L' else 3
+    raise TypeError(f"Unexpected type {type(img)}")
+
+
+def hflip(img):
+    """
+    Function for horizontally flipping the given image.
+
+    Args::
+
+        [in] img(PIL Image.Image): Input image.
+
+    Example::
+        
+        img = Image.open(...)
+        img_ = transform.hflip(img)
+    """
+    if not _is_pil_image(img):
+        raise TypeError(f'img should be PIL Image. Got {type(img)}')
+
+    return img.transpose(Image.FLIP_LEFT_RIGHT)
+
+
+def vflip(img):
+    """
+    Function for vertically flipping the given image.
+
+    Args::
+
+        [in] img(PIL Image.Image): Input image.
+
+    Example::
+        
+        img = Image.open(...)
+        img_ = transform.vflip(img)
+    """
+    if not _is_pil_image(img):
+        raise TypeError(f'img should be PIL Image. Got {type(img)}')
+
+    return img.transpose(Image.FLIP_TOP_BOTTOM)
+
+
+def adjust_brightness(img, brightness_factor):
+    """
+    Function for adjusting brightness of an RGB image.
+
+    Args::
+
+        [in] img (PIL Image.Image): Image to be adjusted.
+        [in] brightness_factor (float):  How much to adjust the brightness.
+             Can be any non negative number. 0 gives a black image, 1 gives the
+             original image while 2 increases the brightness by a factor of 2.
+
+    Returns::
+
+        [out] PIL Image.Image: Brightness adjusted image.
+
+    Example::
+        
+        img = Image.open(...)
+        img_ = transform.adjust_brightness(img, 0.5)
+    """
+    if not _is_pil_image(img):
+        raise TypeError(f'img should be PIL Image. Got {type(img)}')
+
+    enhancer = ImageEnhance.Brightness(img)
+    img = enhancer.enhance(brightness_factor)
+    return img
+
+
+def adjust_contrast(img, contrast_factor):
+    """
+    Function for adjusting contrast of an image.
+
+    Args::
+
+        [in] img (PIL Image.Image): Image to be adjusted.
+        [in] contrast_factor (float): How much to adjust the contrast.
+             Can be any non negative number. 0 gives a solid gray image,
+             1 gives the original image while 2 increases the contrast by a factor of 2.
+
+    Returns::
+
+        [out] PIL Image.Image: Contrast adjusted image.
+
+    Example::
+        
+        img = Image.open(...)
+        img_ = transform.adjust_contrast(img, 0.5)
+    """
+    if not _is_pil_image(img):
+        raise TypeError(f'img should be PIL Image. Got {type(img)}')
+
+    enhancer = ImageEnhance.Contrast(img)
+    img = enhancer.enhance(contrast_factor)
+    return img
+
+
+def adjust_saturation(img, saturation_factor):
+    """
+    Function for adjusting saturation of an image.
+
+    Args::
+
+        [in] img (PIL Image.Image): Image to be adjusted.
+        [in] saturation_factor (float):  How much to adjust the saturation.
+             0 will give a black and white image, 1 will give the original image
+             while 2 will enhance the saturation by a factor of 2.
+
+    Returns::
+
+        [out] PIL Image.Image: Saturation adjusted image.
+
+    Example::
+        
+        img = Image.open(...)
+        img_ = transform.adjust_saturation(img, 0.5)
+    """
+    if not _is_pil_image(img):
+        raise TypeError(f'img should be PIL Image. Got {type(img)}')
+
+    enhancer = ImageEnhance.Color(img)
+    img = enhancer.enhance(saturation_factor)
+    return img
+
+
+def adjust_hue(img, hue_factor):
+    """
+    Function for adjusting hue of an image.
+
+    The image hue is adjusted by converting the image to HSV and
+    cyclically shifting the intensities in the hue channel (H).
+    The image is then converted back to original image mode.
+
+    `hue_factor` is the amount of shift in H channel and must be in the
+    interval `[-0.5, 0.5]`.
+
+    See `Hue`_ for more details.
+
+    .. _Hue: https://en.wikipedia.org/wiki/Hue
+
+    Args::
+
+        [in] img (PIL Image.Image): Image to be adjusted.
+        [in] hue_factor (float):  How much to shift the hue channel.
+             Should be in [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of
+             hue channel in HSV space in positive and negative direction respectively.
+             0 means no shift. Therefore, both -0.5 and 0.5 will give an image
+             with complementary colors while 0 gives the original image.
+
+    Returns::
+
+        [out] PIL Image.Image: Saturation adjusted image.
+
+    Example::
+        
+        img = Image.open(...)
+        img_ = transform.adjust_hue(img, 0.1)
+    """
+    if not(-0.5 <= hue_factor <= 0.5):
+        raise ValueError(f'hue_factor ({hue_factor}) is not in [-0.5, 0.5].')
+
+    if not _is_pil_image(img):
+        raise TypeError(f'img should be PIL Image. Got {type(img)}')
+
+    input_mode = img.mode
+    if input_mode in {'L', '1', 'I', 'F'}:
+        return img
+
+    h, s, v = img.convert('HSV').split()
+
+    np_h = np.array(h, dtype=np.uint8)
+    # uint8 addition take cares of rotation across boundaries
+    with np.errstate(over='ignore'):
+        np_h += np.uint8(hue_factor * 255)
+    h = Image.fromarray(np_h, 'L')
+
+    img = Image.merge('HSV', (h, s, v)).convert(input_mode)
+    return img
+
+
+def adjust_gamma(img, gamma, gain=1):
+    """
+    Function for performing gamma correction on an image.
+
+    Also known as Power Law Transform. Intensities in RGB mode are adjusted
+    based on the following equation:
+
+    .. math::
+        I_{\text{out}} = 255 \times \text{gain} \times \left(\frac{I_{\text{in}}}{255}\right)^{\gamma}
+
+    See `Gamma Correction`_ for more details.
+
+    .. _Gamma Correction: https://en.wikipedia.org/wiki/Gamma_correction
+
+    Args::
+
+        [in] img (PIL Image.Image): Image to be adjusted.
+        [in] gamma (float): Non negative real number, same as :math:`\gamma` in the equation.
+             gamma larger than 1 make the shadows darker,
+             while gamma smaller than 1 make dark regions lighter.
+        [in] gain (float): The constant multiplier.
+
+    Returns::
+
+        [out] PIL Image.Image: Gamma adjusted image.
+    """
+    if not _is_pil_image(img):
+        raise TypeError(f'img should be PIL Image. Got {type(img)}')
+
+    if gamma < 0:
+        raise ValueError('Gamma should be a non-negative real number')
+
+    input_mode = img.mode
+    img = img.convert('RGB')
+    gamma_map = [(255 + 1 - 1e-3) * gain * pow(ele / 255., gamma) for ele in range(256)] * 3
+    img = img.point(gamma_map)  # use PIL's point-function to accelerate this part
+
+    img = img.convert(input_mode)
+    return img
+
+
+def crop(img, top, left, height, width):
+    """
+    Function for cropping image.
+
+    Args::
+
+        [in] img(PIL Image.Image): Input image.
+        [in] top(int): the top boundary of the cropping box.
+        [in] left(int): the left boundary of the cropping box.
+        [in] height(int): height of the cropping box.
+        [in] width(int): width of the cropping box.
+
+    Returns::
+
+        [out] PIL Image.Image: Cropped image.
+
+    Example::
+        
+        img = Image.open(...)
+        img_ = transform.crop(img, 10, 10, 100, 100)
+    """
+    if not _is_pil_image(img):
+        raise TypeError(f'img should be PIL Image. Got {type(img)}')
+
+    return img.crop((left, top, left + width, top + height))
+
+
+def resize(img, size, interpolation=Image.BILINEAR):
+    """
+    Function for resizing the input image to the given size.
+
+    Args::
+
+        [in] img(PIL Image.Image): Input image.
+        [in] size(sequence or int): Desired output size. If size is a sequence like
+             (h, w), the output size will be matched to this. If size is an int,
+             the smaller edge of the image will be matched to this number maintaining
+             the aspect ratio. If a tuple or list of length 1 is provided, it is
+             interpreted as a single int.
+        [in] interpolation(int, optional): type of interpolation. default: PIL.Image.BILINEAR
+
+    Returns::
+
+        [out] PIL Image.Image: Resized image.
+
+    Example::
+        
+        img = Image.open(...)
+        img_ = transform.resize(img, (100, 100))
+    """
+    if not _is_pil_image(img):
+        raise TypeError(f'img should be PIL Image. Got {type(img)}')
+    if not (isinstance(size, int) or (isinstance(size, Sequence) and len(size) in (1, 2))):
+        raise TypeError(f'Got inappropriate size arg: {size}')
+
+    if isinstance(size, int) or len(size) == 1:
+        if isinstance(size, Sequence):
+            size = size[0]
+        w, h = img.size
+        if (w <= h and w == size) or (h <= w and h == size):
+            return img
+        if w < h:
+            ow = size
+            oh = int(size * h / w)
+            return img.resize((ow, oh), interpolation)
+        else:
+            oh = size
+            ow = int(size * w / h)
+            return img.resize((ow, oh), interpolation)
+    else:
+        return img.resize(size[::-1], interpolation)
+
+
+def gray(img, num_output_channels):
+    """
+    Function for converting PIL image of any mode (RGB, HSV, LAB, etc) to grayscale version of image.
+
+    Args::
+
+        [in] img(PIL Image.Image): Input image.
+        [in] num_output_channels (int): number of channels of the output image. Value can be 1 or 3. Default, 1.
+
+    Returns::
+
+        [out] PIL Image: Grayscale version of the image.
+              if num_output_channels = 1 : returned image is single channel
+              if num_output_channels = 3 : returned image is 3 channel with r = g = b
+    """
+    if not _is_pil_image(img):
+        raise TypeError(f'img should be PIL Image. Got {type(img)}')
+
+    if num_output_channels == 1:
+        img = img.convert('L')
+    elif num_output_channels == 3:
+        img = img.convert('L')
+        np_img = np.array(img, dtype=np.uint8)
+        np_img = np.dstack([np_img, np_img, np_img])
+        img = Image.fromarray(np_img, 'RGB')
+    else:
+        raise ValueError('num_output_channels should be either 1 or 3')
+
+    return img
+
+def _get_perspective_coeffs(startpoints, endpoints):
+    """Helper function to get the coefficients (a, b, c, d, e, f, g, h) for the perspective transforms.
+
+    In Perspective Transform each pixel (x, y) in the orignal image gets transformed as,
+     (x, y) -> ( (ax + by + c) / (gx + hy + 1), (dx + ey + f) / (gx + hy + 1) )
+
+    Args:
+        List containing [top-left, top-right, bottom-right, bottom-left] of the orignal image,
+        List containing [top-left, top-right, bottom-right, bottom-left] of the transformed
+                   image
+    Returns:
+        octuple (a, b, c, d, e, f, g, h) for transforming each pixel.
+    """
+    matrix = []
+
+    for p1, p2 in zip(endpoints, startpoints):
+        matrix.append([p1[0], p1[1], 1, 0, 0, 0, -p2[0] * p1[0], -p2[0] * p1[1]])
+        matrix.append([0, 0, 0, p1[0], p1[1], 1, -p2[1] * p1[0], -p2[1] * p1[1]])
+
+    A = np.array(matrix, dtype="float")
+    B = np.array(startpoints, dtype="float").reshape(8)
+    res = np.linalg.lstsq(A, B, rcond=-1)[0]
+    return res.tolist()
+
+
+def perspective(img, startpoints, endpoints, interpolation=Image.BICUBIC):
+    """Perform perspective transform of the given PIL Image.
+
+    Args:
+        img (PIL Image): Image to be transformed.
+        startpoints: List containing [top-left, top-right, bottom-right, bottom-left] of the orignal image
+        endpoints: List containing [top-left, top-right, bottom-right, bottom-left] of the transformed image
+        interpolation: Default- Image.BICUBIC
+    Returns:
+        PIL Image:  Perspectively transformed Image.
+    """
+    if not _is_pil_image(img):
+        raise TypeError('img should be PIL Image. Got {}'.format(type(img)))
+
+    coeffs = _get_perspective_coeffs(startpoints, endpoints)
+    return img.transform(img.size, Image.PERSPECTIVE, coeffs, interpolation)
+
+
+def resized_crop(img, top, left, height, width, size, interpolation=Image.BILINEAR):
+    """Crop the given PIL Image and resize it to desired size.
+
+    Notably used in :class:`~torchvision.transforms.RandomResizedCrop`.
+
+    Args:
+        img (PIL Image): Image to be cropped. (0,0) denotes the top left corner of the image.
+        top (int): Vertical component of the top left corner of the crop box.
+        left (int): Horizontal component of the top left corner of the crop box.
+        height (int): Height of the crop box.
+        width (int): Width of the crop box.
+        size (sequence or int): Desired output size. Same semantics as ``resize``.
+        interpolation (int, optional): Desired interpolation. Default is
+            ``PIL.Image.BILINEAR``.
+    Returns:
+        PIL Image: Cropped image.
+    """
+    assert _is_pil_image(img), 'img should be PIL Image'
+    img = crop(img, top, left, height, width)
+    img = resize(img, size, interpolation)
+    return img
+
+def center_crop(img, output_size):
+    """Crop the given PIL Image and resize it to desired size.
+
+        Args:
+            img (PIL Image): Image to be cropped. (0,0) denotes the top left corner of the image.
+            output_size (sequence or int): (height, width) of the crop box. If int,
+                it is used for both directions
+        Returns:
+            PIL Image: Cropped image.
+        """
+    if isinstance(output_size, numbers.Number):
+        output_size = (int(output_size), int(output_size))
+    image_width, image_height = img.size
+    crop_height, crop_width = output_size
+    crop_top = int(round((image_height - crop_height) / 2.))
+    crop_left = int(round((image_width - crop_width) / 2.))
+    return crop(img, crop_top, crop_left, crop_height, crop_width)
+
+def five_crop(img, size):
+    """Crop the given PIL Image into four corners and the central crop.
+
+    .. Note::
+        This transform returns a tuple of images and there may be a
+        mismatch in the number of inputs and targets your ``Dataset`` returns.
+
+    Args:
+       size (sequence or int): Desired output size of the crop. If size is an
+           int instead of sequence like (h, w), a square crop (size, size) is
+           made.
+
+    Returns:
+       tuple: tuple (tl, tr, bl, br, center)
+                Corresponding top left, top right, bottom left, bottom right and center crop.
+    """
+    if isinstance(size, numbers.Number):
+        size = (int(size), int(size))
+    else:
+        assert len(size) == 2, "Please provide only two dimensions (h, w) for size."
+
+    image_width, image_height = img.size
+    crop_height, crop_width = size
+    if crop_width > image_width or crop_height > image_height:
+        msg = "Requested crop size {} is bigger than input size {}"
+        raise ValueError(msg.format(size, (image_height, image_width)))
+
+    tl = img.crop((0, 0, crop_width, crop_height))
+    tr = img.crop((image_width - crop_width, 0, image_width, crop_height))
+    bl = img.crop((0, image_height - crop_height, crop_width, image_height))
+    br = img.crop((image_width - crop_width, image_height - crop_height,
+                   image_width, image_height))
+    center = center_crop(img, (crop_height, crop_width))
+    return (tl, tr, bl, br, center)
+
+def ten_crop(img, size, vertical_flip=False):
+    r"""Crop the given PIL Image into four corners and the central crop plus the
+        flipped version of these (horizontal flipping is used by default).
+
+    .. Note::
+        This transform returns a tuple of images and there may be a
+        mismatch in the number of inputs and targets your ``Dataset`` returns.
+
+    Args:
+       size (sequence or int): Desired output size of the crop. If size is an
+            int instead of sequence like (h, w), a square crop (size, size) is
+            made.
+       vertical_flip (bool): Use vertical flipping instead of horizontal
+
+    Returns:
+       tuple: tuple (tl, tr, bl, br, center, tl_flip, tr_flip, bl_flip, br_flip, center_flip)
+                Corresponding top left, top right, bottom left, bottom right and center crop
+                and same for the flipped image.
+    """
+    if isinstance(size, numbers.Number):
+        size = (int(size), int(size))
+    else:
+        assert len(size) == 2, "Please provide only two dimensions (h, w) for size."
+
+    first_five = five_crop(img, size)
+
+    if vertical_flip:
+        img = vflip(img)
+    else:
+        img = hflip(img)
+
+    second_five = five_crop(img, size)
+    return first_five + second_five
+
+
+def rotate(img, angle, resample=False, expand=False, center=None, fill=None):
+    """Rotate the image by angle.
+
+
+    Args:
+        img (PIL Image): PIL Image to be rotated.
+        angle (float or int): In degrees degrees counter clockwise order.
+        resample (``PIL.Image.NEAREST`` or ``PIL.Image.BILINEAR`` or ``PIL.Image.BICUBIC``, optional):
+            An optional resampling filter. See `filters`_ for more information.
+            If omitted, or if the image has mode "1" or "P", it is set to ``PIL.Image.NEAREST``.
+        expand (bool, optional): Optional expansion flag.
+            If true, expands the output image to make it large enough to hold the entire rotated image.
+            If false or omitted, make the output image the same size as the input image.
+            Note that the expand flag assumes rotation around the center and no translation.
+        center (2-tuple, optional): Optional center of rotation.
+            Origin is the upper left corner.
+            Default is the center of the image.
+        fill (n-tuple or int or float): Pixel fill value for area outside the rotated
+            image. If int or float, the value is used for all bands respectively.
+            Defaults to 0 for all bands. This option is only available for ``pillow>=5.2.0``.
+
+    .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters
+
+    """
+    def parse_fill(fill, num_bands):
+        if PILLOW_VERSION < "5.2.0":
+            if fill is None:
+                return {}
+            else:
+                msg = ("The option to fill background area of the rotated image, "
+                       "requires pillow>=5.2.0")
+                raise RuntimeError(msg)
+
+        if fill is None:
+            fill = 0
+        if isinstance(fill, (int, float)) and num_bands > 1:
+            fill = tuple([fill] * num_bands)
+        if not isinstance(fill, (int, float)) and len(fill) != num_bands:
+            msg = ("The number of elements in 'fill' does not match the number of "
+                   "bands of the image ({} != {})")
+            raise ValueError(msg.format(len(fill), num_bands))
+
+        return {"fillcolor": fill}
+
+    if not _is_pil_image(img):
+        raise TypeError('img should be PIL Image. Got {}'.format(type(img)))
+
+    opts = parse_fill(fill, len(img.getbands()))
+
+    return img.rotate(angle, resample, expand, center, **opts)
+
+
+def _get_inverse_affine_matrix(center, angle, translate, scale, shear):
+    # Helper method to compute inverse matrix for affine transformation
+
+    # As it is explained in PIL.Image.rotate
+    # We need compute INVERSE of affine transformation matrix: M = T * C * RSS * C^-1
+    # where T is translation matrix: [1, 0, tx | 0, 1, ty | 0, 0, 1]
+    #       C is translation matrix to keep center: [1, 0, cx | 0, 1, cy | 0, 0, 1]
+    #       RSS is rotation with scale and shear matrix
+    #       RSS(a, s, (sx, sy)) =
+    #       = R(a) * S(s) * SHy(sy) * SHx(sx)
+    #       = [ s*cos(a - sy)/cos(sy), s*(-cos(a - sy)*tan(x)/cos(y) - sin(a)), 0 ]
+    #         [ s*sin(a + sy)/cos(sy), s*(-sin(a - sy)*tan(x)/cos(y) + cos(a)), 0 ]
+    #         [ 0                    , 0                                      , 1 ]
+    #
+    # where R is a rotation matrix, S is a scaling matrix, and SHx and SHy are the shears:
+    # SHx(s) = [1, -tan(s)] and SHy(s) = [1      , 0]
+    #          [0, 1      ]              [-tan(s), 1]
+    #
+    # Thus, the inverse is M^-1 = C * RSS^-1 * C^-1 * T^-1
+
+    if isinstance(shear, numbers.Number):
+        shear = [shear, 0]
+
+    if not isinstance(shear, (tuple, list)) and len(shear) == 2:
+        raise ValueError(
+            "Shear should be a single value or a tuple/list containing " +
+            "two values. Got {}".format(shear))
+
+    rot = math.radians(angle)
+    sx, sy = [math.radians(s) for s in shear]
+
+    cx, cy = center
+    tx, ty = translate
+
+    # RSS without scaling
+    a = cos(rot - sy) / cos(sy)
+    b = -cos(rot - sy) * tan(sx) / cos(sy) - sin(rot)
+    c = sin(rot - sy) / cos(sy)
+    d = -sin(rot - sy) * tan(sx) / cos(sy) + cos(rot)
+
+    # Inverted rotation matrix with scale and shear
+    # det([[a, b], [c, d]]) == 1, since det(rotation) = 1 and det(shear) = 1
+    M = [d, -b, 0,
+         -c, a, 0]
+    M = [x / scale for x in M]
+
+    # Apply inverse of translation and of center translation: RSS^-1 * C^-1 * T^-1
+    M[2] += M[0] * (-cx - tx) + M[1] * (-cy - ty)
+    M[5] += M[3] * (-cx - tx) + M[4] * (-cy - ty)
+
+    # Apply center translation: C * RSS^-1 * C^-1 * T^-1
+    M[2] += cx
+    M[5] += cy
+    return M
+
+
+def affine(img, angle, translate, scale, shear, resample=0, fillcolor=None):
+    """Apply affine transformation on the image keeping image center invariant
+
+    Args:
+        img (PIL Image): PIL Image to be rotated.
+        angle (float or int): rotation angle in degrees between -180 and 180, clockwise direction.
+        translate (list or tuple of integers): horizontal and vertical translations (post-rotation translation)
+        scale (float): overall scale
+        shear (float or tuple or list): shear angle value in degrees between -180 to 180, clockwise direction.
+        If a tuple of list is specified, the first value corresponds to a shear parallel to the x axis, while
+        the second value corresponds to a shear parallel to the y axis.
+        resample (``PIL.Image.NEAREST`` or ``PIL.Image.BILINEAR`` or ``PIL.Image.BICUBIC``, optional):
+            An optional resampling filter.
+            See `filters`_ for more information.
+            If omitted, or if the image has mode "1" or "P", it is set to ``PIL.Image.NEAREST``.
+        fillcolor (int): Optional fill color for the area outside the transform in the output image. (Pillow>=5.0.0)
+    """
+    if not _is_pil_image(img):
+        raise TypeError('img should be PIL Image. Got {}'.format(type(img)))
+
+    assert isinstance(translate, (tuple, list)) and len(translate) == 2, \
+        "Argument translate should be a list or tuple of length 2"
+
+    assert scale > 0.0, "Argument scale should be positive"
+
+    output_size = img.size
+    center = (img.size[0] * 0.5 + 0.5, img.size[1] * 0.5 + 0.5)
+    matrix = _get_inverse_affine_matrix(center, angle, translate, scale, shear)
+    kwargs = {"fillcolor": fillcolor} if PILLOW_VERSION[0] >= '5' else {}
+    return img.transform(output_size, Image.AFFINE, matrix, resample, **kwargs)
+
--- a/python/jittor/utils/polish_centos.py
+++ b/python/jittor/utils/polish_centos.py
@ -52,8 +52,8 @@ def run_in_centos(env):
    centos_path = os.path.join(home_path, ".cache", "centos")
    os.makedirs(centos_path+"/src/jittor", exist_ok=True)
    os.makedirs(centos_path+"/src/jittor_utils", exist_ok=True)
-    os.system(f"cp -rL {jt.flags.jittor_path} {centos_path+'/src/'}")
-    os.system(f"cp -rL {jt.flags.jittor_path}/../jittor_utils {centos_path+'/src/'}")
+    os.system(f"sudo cp -rL {jt.flags.jittor_path} {centos_path+'/src/'}")
+    os.system(f"sudo cp -rL {jt.flags.jittor_path}/../jittor_utils {centos_path+'/src/'}")

    run_cmd(f"sudo docker build --tag centos_build_env -f /tmp/centos_build_env .")
    run_cmd(f"sudo docker run --rm -v {centos_path}:/root/.cache/jittor centos_build_env scl enable devtoolset-7 'PYTHONPATH=/root/.cache/jittor/src {env} python3.8 -m jittor.test.test_core'")
--- a/python/jittor/utils/pytorch_converter.py
+++ b/python/jittor/utils/pytorch_converter.py
@ -358,9 +358,9 @@ unsupport_ops = [
    # ***************************************************************
    'ModuleDict', 'ParameterList', 'ParameterDict', 
    'Conv1d', 'Conv3d', 'ConvTranspose1d', 'ConvTranspose3d', 'Unfold', 'Fold', 
-    'MaxPool1d', 'MaxPool3d', 'MaxUnpool1d', 'MaxUnpool2d', 'MaxUnpool3d', 'AvgPool1d', 
-    'AvgPool3d', 'FractionalMaxPool2d', 'LPPool1d', 'LPPool2d', 'AdaptiveMaxPool1d', 
-    'AdaptiveMaxPool2d', 'AdaptiveMaxPool3d', 'AdaptiveAvgPool1d', 'AdaptiveAvgPool3d', 
+    'MaxPool1d', 'MaxUnpool1d', 'MaxUnpool2d', 'AvgPool1d', 
+    'FractionalMaxPool2d', 'LPPool1d', 'LPPool2d', 'AdaptiveMaxPool1d', 
+    'AdaptiveAvgPool1d',
    'ReflectionPad1d', 'ReplicationPad1d', 'ReplicationPad3d', 'ConstantPad1d', 'ConstantPad3d', 
    'ELU', 'Hardshrink', 'Hardtanh', 'LogSigmoid', 'MultiheadAttention', 
    'RReLU', 'SELU', 'CELU', 'GELU', 'Softshrink', 'Softsign', 'Tanhshrink', 
--- a/python/jittor/version
+++ b/python/jittor/version
@ -1 +1 @@
-5f0e1aa2f9891c12fc1e190d6cc6177fc6498302
+939b29514b2e5cc591053aab614efd569772585d
--- a/python/jittor_utils/install_cuda.py
+++ b/python/jittor_utils/install_cuda.py
@ -44,7 +44,7 @@ def install_cuda():
        md5 = "5dbdb43e35b4db8249027997720bf1ca"
    elif cuda_driver_version >= [10,2]:
        cuda_tgz = "cuda10.2_cudnn7_linux.tgz"
-        md5 = "a78f296746d97e9d76615289c2fe98ac"
+        md5 = "40f0563e8eb176f53e55943f6d212ad7"
    elif cuda_driver_version >= [10,]:
        cuda_tgz = "cuda10.0_cudnn7_linux.tgz"
        md5 = "f16d3ff63f081031d21faec3ec8b7dac"