diff --git a/python/jittor/__init__.py b/python/jittor/__init__.py index f8852bca..6be24cd7 100644 --- a/python/jittor/__init__.py +++ b/python/jittor/__init__.py @@ -9,7 +9,7 @@ # file 'LICENSE.txt', which is part of this source code package. # *************************************************************** -__version__ = '1.3.1.46' +__version__ = '1.3.1.55' from jittor_utils import lock with lock.lock_scope(): ori_int = int @@ -91,7 +91,7 @@ def safeunpickle(path): import torch except: raise RuntimeError("pytorch need to be installed when load pth format.") - model_dict = torch.load(path, map_location=torch.device('cpu')) + model_dict = torch.load(path, map_location='cpu') try: for k, v in model_dict.items(): try: @@ -231,19 +231,19 @@ class profile_scope(_call_no_record_scope): def __enter__(self): assert not flags.profiler_enable - profiler.start(self.warmup, self.rerun) self.report = [] try: self.fs.__enter__() + profiler.start(self.warmup, self.rerun) return self.report except: profiler.stop() raise def __exit__(self, *exc): - self.fs.__exit__(*exc) profiler.stop() self.report.extend(profiler.report()) + self.fs.__exit__(*exc) class __single_process_scope: def __init__(self, rank=0): @@ -304,15 +304,52 @@ Var.cast = Var.cast def array(data, dtype=None): if isinstance(data, core.Var): if dtype is None: - return data.clone() - return cast(data, dtype) - if dtype is not None: + ret = data.clone() + else: + ret = cast(data, dtype) + elif dtype is not None: if isinstance(dtype, NanoString): dtype = str(dtype) elif callable(dtype): dtype = dtype.__name__ - return ops.array(np.array(data, dtype)) - return ops.array(data) + ret = ops.array(np.array(data, dtype)) + else: + ret = ops.array(data) + # TODO: move those code to core + amp_reg = jt.flags.amp_reg + if amp_reg and ret.numel() != 1 and ret.dtype.is_float(): + if amp_reg & 16: + if amp_reg & 1: + if ret.dtype != "float32": + return ret.float32() + elif amp_reg & 2: + if ret.dtype != "float16": + return ret.float16() + return ret + +def random(shape, dtype="float32", type="uniform"): + # TODO: move those code to core + if dtype == "float16": + # TODO: make curand support fp16 + ret = ops.random(shape, "float32", type).float16() + else: + ret = ops.random(shape, dtype, type) + amp_reg = jt.flags.amp_reg + if amp_reg: + if amp_reg & 16: + if amp_reg & 1: + if ret.dtype != "float32": + return ret.float32() + elif amp_reg & 2: + if ret.dtype != "float16": + return ret.float16() + return ret + +def float_auto(x): + if jt.flags.amp_reg & 2: + return x.float16() + return x.float32() +Var.float_auto = float_auto def array64(data, dtype=None): with jt.flag_scope(auto_convert_64_to_32=0): @@ -920,6 +957,14 @@ class Module: self.dfs([], "", callback, callback_leave) return ms + @property + def _modules(self): + return { k:v for k,v in self.__dict__.items() if isinstance(v, Module) } + + @property + def _parameters(self): + return { k:v for k,v in self.__dict__.items() if isinstance(v, Var) } + def requires_grad_(self, requires_grad=True): self._requires_grad = requires_grad self._place_hooker() @@ -1187,6 +1232,33 @@ Arguments of hook are defined as:: def __getattr__(self, key): return object.__getattribute__(self, key) + def float64(self): + '''convert all parameters to float16''' + for p in self.parameters(): + if p.dtype.is_float(): + p.assign(p.float64()) + return self + + def float16(self): + '''convert all parameters to float16''' + for p in self.parameters(): + if p.dtype.is_float(): + p.assign(p.float16()) + return self + + def half(self): 
+ '''convert all parameters to float16''' + return self.float16() + + def float_auto(self): + '''convert all parameters to float16 or float32 automatically + by jt.flags.auto_mixed_precision_level and jt.flags.amp_reg''' + for p in self.parameters(): + if p.dtype.is_float(): + p.assign(p.float_auto()) + return self + + class Function(Module): ''' Function Module for customized backward operations @@ -1417,18 +1489,15 @@ Var.size = size def to_int(v): - dtype = str(v.dtype) - assert dtype.startswith("int") + assert v.dtype.is_int() return v.item() def to_float(v): - dtype = str(v.dtype) - assert dtype.startswith("float") + assert v.dtype.is_float() return v.item() def to_bool(v): - dtype = str(v.dtype) - assert dtype.startswith("int") or dtype=="bool" + assert v.dtype.is_int() or v.dtype.is_bool() return ori_bool(v.item()) Var.__int__ = to_int @@ -1450,6 +1519,8 @@ float = float32 Var.float = Var.float32 double = float64 Var.double = Var.float64 +half = float16 +Var.half = Var.float16 def is_var(v): return isinstance(v, Var) diff --git a/python/jittor/__init__.pyi b/python/jittor/__init__.pyi index f82c7a0a..823183d5 100644 --- a/python/jittor/__init__.pyi +++ b/python/jittor/__init__.pyi @@ -5,7 +5,7 @@ from . import attention as attention, contrib as contrib, dataset as dataset, in from .compile_extern import cublas as cublas, cudnn as cudnn, curand as curand, cufft as cufft, mkl_ops as mkl_ops, mpi_ops as mpi_ops, world_size as world_size from .compiler import compile_custom_op as compile_custom_op, compile_custom_ops as compile_custom_ops from .contrib import concat as concat -from .nn import matmul as matmul +from .nn import bmm as bmm, bmm_transpose as bmm_transpose, matmul as matmul from collections import OrderedDict as OrderedDict from collections.abc import Mapping as Mapping from typing import Any @@ -64,6 +64,8 @@ def clean() -> None: ... cast = unary def array(data, dtype: Any | None = ...): ... +def random(shape, dtype: str = ..., type: str = ...): ... +def float_auto(x): ... def array64(data, dtype: Any | None = ...): ... def grad(loss, targets): ... def liveness_info(): ... @@ -85,7 +87,6 @@ origin_transpose = transpose def transpose(x, *dim): ... permute = transpose def flatten(input, start_dim: int = ..., end_dim: int = ...): ... -def start_grad(x): ... def detach(x): ... def unsqueeze(x, dim): ... def squeeze(x, dim): ... @@ -149,6 +150,11 @@ class Module: is_train: bool def is_training(self) -> bool: ... def mpi_param_broadcast(self, root: int = ...) -> None: ... + def __setattr__(self, key, value) -> None: ... + def __getattr__(self, key): ... + def float16(self) -> None: ... + def half(self) -> None: ... + def float_auto(self) -> None: ... class Function(Module): input_mask: Any @@ -187,6 +193,7 @@ def to_float(v): ... def to_bool(v): ... def format(v, spec): ... def get_len(var): ... +half = float16 def is_var(v): ... from typing import List, Tuple, Callable, overload @@ -374,7 +381,7 @@ def index(shape: Tuple[int], dim: int, dtype: str="int32")-> Var: # output: [[0,1],[0,1]]''' ... @overload -def index(shape: Tuple[int], dtype: str="int32")-> List[Var]: +def index(shape: Tuple[int], dtype: str="int32"): '''Document: * Index Operator generate index of shape. @@ -428,7 +435,7 @@ def index(a: Var, dim: int, dtype: str="int32")-> Var: # output: [[0,1],[0,1]]''' ... @overload -def index(a: Var, dtype: str="int32")-> List[Var]: +def index(a: Var, dtype: str="int32"): '''Document: * Index Operator generate index of shape. 
@@ -461,7 +468,7 @@ def index_var(a: Var, dim: int, dtype: str="int32")-> Var: jt.index_var(a, 1) similar with jt.index(a.shape, 1)''' ... @overload -def index_var(a: Var, dtype: str="int32")-> List[Var]: +def index_var(a: Var, dtype: str="int32"): '''Document: * shape dependency version of index op jt.index_var(a, 1) similar with jt.index(a.shape, 1)''' @@ -824,7 +831,7 @@ def bitwise_xor(x: Var, y: Var)-> Var: ... def tape(x: Var)-> Var: ... -def where(cond: Var, dtype: str="int32")-> List[Var]: +def where(cond: Var, dtype: str="int32"): '''Document: * Where Operator generate index of true condition. @@ -838,9 +845,9 @@ def where(cond: Var, dtype: str="int32")-> List[Var]: Example:: jt.where([[0,0,1],[1,0,0]]) - # return ( [0,2], [1,0] )''' + # return [jt.Var([0 1], dtype=int32), jt.Var([2 0], dtype=int32)]''' ... -def argsort(x: Var, dim: int=-1, descending: bool=False, dtype: str="int32")-> List[Var]: +def argsort(x: Var, dim: int=-1, descending: bool=False, dtype: str="int32"): '''Document: * Argsort Operator Perform an indirect sort by given key or compare function. @@ -883,7 +890,7 @@ def argsort(x: Var, dim: int=-1, descending: bool=False, dtype: str="int32")-> L ... def fetch(inputs: List[Var], func: Callable)-> Var: ... -def arg_reduce(x: Var, op: str, dim: int, keepdims: bool)-> List[Var]: +def arg_reduce(x: Var, op: str, dim: int, keepdims: bool): '''Document: * Returns the indices of the maximum / minimum of the input across a dimension. @@ -908,7 +915,7 @@ def arg_reduce(x: Var, op: str, dim: int, keepdims: bool)-> List[Var]: >>> jt.arg_reduce(x, 'max', dim=1, keepdims=False) [jt.Var([2 1], dtype=int32), jt.Var([5 7], dtype=int32)] >>> jt.arg_reduce(x, 'min', dim=1, keepdims=False) - [jt.Var([1 2], dtype=int32), jt.Var([5 7], dtype=int32)]''' + [jt.Var([1 2], dtype=int32), jt.Var([2 1], dtype=int32)]''' ... def random(shape: Tuple[int], dtype: str="float32", type: str="uniform")-> Var: ... @@ -2278,6 +2285,8 @@ def uint32(x: Var)-> Var: ... def uint64(x: Var)-> Var: ... +def float16(x: Var)-> Var: + ... def float32(x: Var)-> Var: ... def float64(x: Var)-> Var: @@ -2870,6 +2879,8 @@ def erf(x: Var)-> Var: >>> jt.erf(a) jt.Var([ 0.51559156 0.45739546 -0.85728306 -0.9258883 ], dtype=float32)''' ... +def erfinv(x: Var)-> Var: + ... def transpose(x: Var, axes: Tuple[int]=())-> Var: ... def fuse_transpose(x: Var, axes: Tuple[int]=())-> Var: @@ -3005,7 +3016,7 @@ def numpy_code(shape: Tuple[int], dtype: str, inputs: List[Var], forward: Callab )''' ... @overload -def numpy_code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var], forward: Callable, backward: List[Callable])-> List[Var]: +def numpy_code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var], forward: Callable, backward: List[Callable]): '''Document: * Numpy Code Operator for easily customized op. @@ -3151,7 +3162,7 @@ def numpy_code(shape: Tuple[int], dtype: str, inputs: List[Var], forward: Callab )''' ... @overload -def numpy_code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var], forward: Callable)-> List[Var]: +def numpy_code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var], forward: Callable): '''Document: * Numpy Code Operator for easily customized op. 
@@ -3345,6 +3356,23 @@ def code(shape: Tuple[int], dtype: str, inputs: List[Var]={}, cpu_src: str="", c assert (b.data == [5,3,1]).all() assert (c.data == [-4,-2]).all() + Example-5:: + + # This example shows how to customize code op + # compilation flags, such as add include search + # path, add definitions, or any command line options + + a = jt.random([10]) + b = jt.code(a.shape, a.dtype, [a], + cpu_src=""" + @out0(0) = HAHAHA; + """) + # HAHAHA is defined in flags below + # /any/include/path can be change to any path you want to include + b.compile_options = {"FLAGS: -DHAHAHA=233 -I/any/include/path ": 1} + print(b[0]) + # will output 233 + CUDA Example-1:: @@ -3435,7 +3463,7 @@ def code(shape: Tuple[int], dtype: str, inputs: List[Var]={}, cpu_src: str="", c print(jt.grad(c, [a, b]))''' ... @overload -def code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var]={}, cpu_src: str="", cpu_grad_src: List[str]={}, cpu_header: str="", cuda_src: str="", cuda_grad_src: List[str]={}, cuda_header: str="")-> List[Var]: +def code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var]={}, cpu_src: str="", cpu_grad_src: List[str]={}, cpu_header: str="", cuda_src: str="", cuda_grad_src: List[str]={}, cuda_header: str=""): '''Document: * Code Operator for easily customized op. @@ -3556,6 +3584,23 @@ def code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var]={}, cpu_ assert (b.data == [5,3,1]).all() assert (c.data == [-4,-2]).all() + Example-5:: + + # This example shows how to customize code op + # compilation flags, such as add include search + # path, add definitions, or any command line options + + a = jt.random([10]) + b = jt.code(a.shape, a.dtype, [a], + cpu_src=""" + @out0(0) = HAHAHA; + """) + # HAHAHA is defined in flags below + # /any/include/path can be change to any path you want to include + b.compile_options = {"FLAGS: -DHAHAHA=233 -I/any/include/path ": 1} + print(b[0]) + # will output 233 + CUDA Example-1:: @@ -3646,7 +3691,7 @@ def code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var]={}, cpu_ print(jt.grad(c, [a, b]))''' ... @overload -def code(inputs: List[Var], outputs: List[Var], cpu_src: str="", cpu_grad_src: List[str]={}, cpu_header: str="", cuda_src: str="", cuda_grad_src: List[str]={}, cuda_header: str="")-> List[Var]: +def code(inputs: List[Var], outputs: List[Var], cpu_src: str="", cpu_grad_src: List[str]={}, cpu_header: str="", cuda_src: str="", cuda_grad_src: List[str]={}, cuda_header: str=""): '''Document: * Code Operator for easily customized op. @@ -3767,6 +3812,23 @@ def code(inputs: List[Var], outputs: List[Var], cpu_src: str="", cpu_grad_src: L assert (b.data == [5,3,1]).all() assert (c.data == [-4,-2]).all() + Example-5:: + + # This example shows how to customize code op + # compilation flags, such as add include search + # path, add definitions, or any command line options + + a = jt.random([10]) + b = jt.code(a.shape, a.dtype, [a], + cpu_src=""" + @out0(0) = HAHAHA; + """) + # HAHAHA is defined in flags below + # /any/include/path can be change to any path you want to include + b.compile_options = {"FLAGS: -DHAHAHA=233 -I/any/include/path ": 1} + print(b[0]) + # will output 233 + CUDA Example-1:: @@ -4239,7 +4301,7 @@ class Var: # output: [[0,1],[0,1]]''' ... @overload - def index(self, dtype: str="int32")-> List[Var]: + def index(self, dtype: str="int32"): '''Document: * Index Operator generate index of shape. @@ -4272,7 +4334,7 @@ class Var: jt.index_var(a, 1) similar with jt.index(a.shape, 1)''' ... 
@overload - def index_var(self, dtype: str="int32")-> List[Var]: + def index_var(self, dtype: str="int32"): '''Document: * shape dependency version of index op jt.index_var(a, 1) similar with jt.index(a.shape, 1)''' @@ -4633,7 +4695,7 @@ class Var: * [in] y: the second input, jt.Var (integal or boolean).''' ... def tape(self)-> Var: ... - def where(self, dtype: str="int32")-> List[Var]: + def where(self, dtype: str="int32"): '''Document: * Where Operator generate index of true condition. @@ -4647,9 +4709,9 @@ class Var: Example:: jt.where([[0,0,1],[1,0,0]]) - # return ( [0,2], [1,0] )''' + # return [jt.Var([0 1], dtype=int32), jt.Var([2 0], dtype=int32)]''' ... - def argsort(self, dim: int=-1, descending: bool=False, dtype: str="int32")-> List[Var]: + def argsort(self, dim: int=-1, descending: bool=False, dtype: str="int32"): '''Document: * Argsort Operator Perform an indirect sort by given key or compare function. @@ -4691,7 +4753,7 @@ class Var: # return [[0 1 0],[1 0 1]], [[11 11 12],[12 13 13]]''' ... def fetch(self, func: Callable)-> Var: ... - def arg_reduce(self, op: str, dim: int, keepdims: bool)-> List[Var]: + def arg_reduce(self, op: str, dim: int, keepdims: bool): '''Document: * Returns the indices of the maximum / minimum of the input across a dimension. @@ -6059,6 +6121,7 @@ class Var: def uint16(self)-> Var: ... def uint32(self)-> Var: ... def uint64(self)-> Var: ... + def float16(self)-> Var: ... def float32(self)-> Var: ... def float64(self)-> Var: ... def abs(self)-> Var: @@ -6649,6 +6712,7 @@ class Var: >>> jt.erf(a) jt.Var([ 0.51559156 0.45739546 -0.85728306 -0.9258883 ], dtype=float32)''' ... + def erfinv(self)-> Var: ... def transpose(self, axes: Tuple[int]=())-> Var: ... def fuse_transpose(self, axes: Tuple[int]=())-> Var: ... def safe_clip(self, left: float, right: float)-> Var: @@ -6705,7 +6769,7 @@ class Var: # x[y[0], 1] <= x[y[1], 1] and x[y[1], 1] <= x[y[2], 1] and ... and x[y[m-2], 1] <= x[y[m-1], 1]''' ... @overload - def code(self, outputs: List[Var], cpu_src: str="", cpu_grad_src: List[str]={}, cpu_header: str="", cuda_src: str="", cuda_grad_src: List[str]={}, cuda_header: str="")-> List[Var]: + def code(self, outputs: List[Var], cpu_src: str="", cpu_grad_src: List[str]={}, cpu_header: str="", cuda_src: str="", cuda_grad_src: List[str]={}, cuda_header: str=""): '''Document: * Code Operator for easily customized op. @@ -6826,6 +6890,23 @@ class Var: assert (b.data == [5,3,1]).all() assert (c.data == [-4,-2]).all() + Example-5:: + + # This example shows how to customize code op + # compilation flags, such as add include search + # path, add definitions, or any command line options + + a = jt.random([10]) + b = jt.code(a.shape, a.dtype, [a], + cpu_src=""" + @out0(0) = HAHAHA; + """) + # HAHAHA is defined in flags below + # /any/include/path can be change to any path you want to include + b.compile_options = {"FLAGS: -DHAHAHA=233 -I/any/include/path ": 1} + print(b[0]) + # will output 233 + CUDA Example-1:: @@ -7177,6 +7258,11 @@ class Var: * * return True if operator fusion is stopped.''' ... + def start_grad(self)-> Var: + '''Document: + * + * enable the gradient calculation for the Var.''' + ... def item(self)-> float | int | bool: '''Document: * @@ -7266,7 +7352,13 @@ class Var: [1 8 1 1 2 2]], dtype=int32)''' ... def permute(self, x: Var, axes: Tuple[int]=())-> Var: ... + def detach_inplace(self)-> Var: + '''Document: + * + * enable the gradient calculation for the Var.''' + ... def astype(self, x: Var, op: str)-> Var: ... 
+ def half(self, x: Var)-> Var: ... def expand_as(self, x: Var, y: Var, dims: Tuple[int]=())-> Var: '''Document: * @@ -7310,8 +7402,12 @@ class Flags: '''A set of flags to configure jittor running behaviors''' addr2line_path: str '''Path of addr2line. Default: ""''' + amp_reg: int + '''Auto mixed-precision control registers, bit 0: prefer 32; bit 1: prefer 16; bit 2: keep reduce type; bit 3 keep white list type; bit 4: array like op prefer too. Default: 0''' auto_convert_64_to_32: int '''auto convert 64bit numpy array into 32bit jittor array. Default: 1''' + auto_mixed_precision_level: int + '''Auto mixed-precision optimization level, 0: not use fp16, 1-3: preserve level, not use fp16 for now; 4: perfer fp16, but some ops use fp32 e.g. sum,exp; 5: simular with 4, and array op will automatically convert to fp16; 6: all ops prefer fp16. Default: 0''' cache_path: str '''Cache path of jittor. Default: ""''' cc_flags: str @@ -7324,10 +7420,12 @@ class Flags: '''Unify graph sanity check. Default: 0''' compile_options: Any '''Override the default loop transfrom options. Default: {}''' + disable_lock: bool + '''Disable file lock. Default: 0''' enable_tuner: int '''Enable tuner. Default: 1''' exclude_pass: str - '''Don't run certian pass. Default: ""''' + '''Don't run certain pass. Default: ""''' extra_gdb_cmd: str '''Extra command pass to GDB, seperate by(;) . Default: ""): Extra command pass to GDB, seperate by(;''' gdb_attach: int @@ -7352,6 +7450,8 @@ class Flags: '''Default enabled, if disable, use immediately eager execution rather than lazy execution, This flag makes error message and traceback infomation better. But this flag will raise memory consumption and lower the performance. Default: 1''' log_file: str '''log to file, mpi env will add $OMPI_COMM_WORLD_RANK suffix. Default: ""''' + log_op_hash: str + '''Output compiler pass result of certain hash of op. Default: ""''' log_silent: int '''The log will be completely silent. Default: 0''' log_sync: int @@ -7376,6 +7476,10 @@ class Flags: '''Enable profiler. Default: 0''' profiler_hide_relay: int '''Profiler hide relayed op. Default: 0''' + profiler_record_peek: int + '''Profiler record peek mem bandwidth. Default: 0''' + profiler_record_shape: int + '''Profiler record shape for op. Default: 0''' profiler_rerun: int '''Profiler rerun. Default: 0''' profiler_warmup: int @@ -7402,8 +7506,12 @@ class Flags: '''If not overflow, try to use 32 bit type as index type. Default: 0''' update_queue_auto_flush_delay: int '''when size of a update queue is great than this value, update queue trigger auto flush(default 2). Default: 2): when size of a update queue is great than this value, update queue trigger auto flush(default 2''' + use_acl: int + '''Use cuda or not. 1 for trying to use cuda, 2 for forcing to use cuda. Default: 0''' use_cuda: int '''Use cuda or not. 1 for trying to use cuda, 2 for forcing to use cuda. Default: 0''' + use_device: int + '''Use cuda or not. 1 for trying to use cuda, 2 for forcing to use cuda. Default: 0''' use_nfef_allocator: int '''Enable never free exact fit allocator. Default: 0''' use_parallel_op_compiler: int @@ -7414,5 +7522,7 @@ class Flags: '''Enable stat allocator. Default: 0''' use_temp_allocator: int '''Enable temp allocator. Default: 1''' + use_tensorcore: int + '''use tensor core. 
Default: 0''' flags: Flags '''Jittor running time flags instance''' diff --git a/python/jittor/compile_extern.py b/python/jittor/compile_extern.py index 24136911..c90f1e4c 100644 --- a/python/jittor/compile_extern.py +++ b/python/jittor/compile_extern.py @@ -9,6 +9,7 @@ import platform from .compiler import * from jittor_utils import run_cmd, get_version, get_int_version from jittor_utils.misc import download_url_to_local +import jittor_utils as jit_utils def search_file(dirs, name, prefer_version=()): if os.name == 'nt': @@ -110,8 +111,7 @@ def setup_mkl(): LOG.v("setup mkl...") # mkl_path = os.path.join(cache_path, "mkl") # mkl_path decouple with cc_path - from pathlib import Path - mkl_path = os.path.join(str(Path.home()), ".cache", "jittor", "mkl") + mkl_path = os.path.join(jit_utils.home(), ".cache", "jittor", "mkl") make_cache_dir(mkl_path) install_mkl(mkl_path) @@ -141,12 +141,12 @@ def setup_mkl(): elif platform.system() == 'Darwin': mkl_lib_paths = [ - "/usr/local/lib/libmkldnn.dylib", # x86_64 - "/opt/homebrew/lib/libmkldnn.dylib", # arm64 + "/usr/local/lib/libdnnl.dylib", # x86_64 + "/opt/homebrew/lib/libdnnl.dylib", # arm64 ] if not any([os.path.exists(lib) for lib in mkl_lib_paths]): raise RuntimeError("Not found onednn, please install it by the command 'brew install onednn'") - extra_flags = f" -lmkldnn " + extra_flags = f" -ldnnl " mkl_op_dir = os.path.join(jittor_path, "extern", "mkl", "ops") mkl_op_files = [os.path.join(mkl_op_dir, name) for name in os.listdir(mkl_op_dir)] @@ -178,8 +178,7 @@ def install_cub(root_folder): def setup_cub(): global cub_home cub_home = "" - from pathlib import Path - cub_path = os.path.join(str(Path.home()), ".cache", "jittor", "cub") + cub_path = os.path.join(jit_utils.home(), ".cache", "jittor", "cub") cuda_version = int(get_version(nvcc_path)[1:-1].split('.')[0]) extra_flags = "" if cuda_version < 11: @@ -221,6 +220,12 @@ def setup_cuda_extern(): LOG.w(f"CUDA found but cub is not loaded:\n{line}") libs = ["cublas", "cudnn", "curand", "cufft"] + # in cuda 11.4, module memory comsumptions: + # default context: 259 MB + # cublas: 340 MB + # cudnn: 340 MB + if int(os.environ.get("conv_opt", "0")): + libs = ["cublas", "curand"] for lib_name in libs: try: setup_cuda_lib(lib_name, extra_flags=link_cuda_extern) @@ -320,22 +325,27 @@ def install_cutt(root_folder): if md5 != true_md5: os.remove(fullname) shutil.rmtree(dirname) - if not os.path.isfile(os.path.join(cache_path, "libcutt"+so)): - LOG.i("Downloading cutt...") - download_url_to_local(url, filename, root_folder, true_md5) + CUTT_PATH = os.environ.get("CUTT_PATH", "") + if not os.path.isfile(os.path.join(cache_path, "libcutt"+so)) or CUTT_PATH: + if CUTT_PATH: + dirname = CUTT_PATH + else: + LOG.i("Downloading cutt...") + download_url_to_local(url, filename, root_folder, true_md5) - import zipfile + import zipfile - zf = zipfile.ZipFile(fullname) - try: - zf.extractall(path=root_folder) - except RuntimeError as e: - print(e) - raise - zf.close() + zf = zipfile.ZipFile(fullname) + try: + zf.extractall(path=root_folder) + except RuntimeError as e: + print(e) + raise + zf.close() LOG.i("installing cutt...") - arch_flag = "" + # -Xptxas -dlcm=ca actually not work + arch_flag = " -Xptxas -dlcm=ca " if len(flags.cuda_archs): arch_flag = f" -arch=compute_{min(flags.cuda_archs)} " arch_flag += ''.join(map(lambda x:f' -code=sm_{x} ', flags.cuda_archs)) @@ -365,8 +375,7 @@ def setup_cutt(): if cutt_lib_path is None or cutt_include_path is None: LOG.v("setup cutt...") # cutt_path decouple with cc_path 
- from pathlib import Path - cutt_path = os.path.join(str(Path.home()), ".cache", "jittor", "cutt") + cutt_path = os.path.join(jit_utils.home(), ".cache", "jittor", "cutt") make_cache_dir(cutt_path) install_cutt(cutt_path) @@ -442,8 +451,7 @@ def setup_nccl(): if nccl_lib_path is None or nccl_include_path is None: LOG.v("setup nccl...") # nccl_path decouple with cc_path - from pathlib import Path - nccl_path = os.path.join(str(Path.home()), ".cache", "jittor", "nccl") + nccl_path = os.path.join(jit_utils.home(), ".cache", "jittor", "nccl") make_cache_dir(nccl_path) nccl_home = install_nccl(nccl_path) diff --git a/python/jittor/compiler.py b/python/jittor/compiler.py index 22e9f118..a61419ad 100644 --- a/python/jittor/compiler.py +++ b/python/jittor/compiler.py @@ -19,7 +19,7 @@ from ctypes import cdll from ctypes.util import find_library import jittor_utils as jit_utils -from jittor_utils import LOG, run_cmd, cache_path, find_exe, cc_path, cc_type, cache_path +from jittor_utils import LOG, run_cmd, find_exe, cc_path, cc_type, cache_path from . import pyjt_compiler from jittor_utils import lock from jittor_utils import install_cuda @@ -228,13 +228,20 @@ def gen_jit_flags(): continue visit[name] = 1 jit_declares.append(f"DECLARE_FLAG({type}, {name});") + alias = [] + if name == "use_cuda": + alias = ["use_device", "use_acl"] + elif name == "auto_mixed_precision_level": + alias = ["amp_level"] + get_names = ",".join(["__get__"+a for a in [name]+alias]) + set_names = ",".join(["__set__"+a for a in [name]+alias]) flags_defs.append(f""" /* {name}(type:{type}, default:{default}): {doc} */ - // @pyjt(__get__{name}) + // @pyjt({get_names}) {type} _get_{name}() {{ return {name}; }} - // @pyjt(__set__{name}) + // @pyjt({set_names}) void _set_{name}({type} v) {{ set_{name}(v); }} - {f'''// @pyjt(__set__{name}) + {f'''// @pyjt({set_names}) void _set_{name}(bool v) {{ set_{name}(v); }} ''' if type=="int" else ""} """) @@ -843,7 +850,7 @@ def check_cuda(): # this nvcc is install by package manager cuda_lib = "/usr/lib/x86_64-linux-gnu" cuda_include2 = os.path.join(jittor_path, "extern","cuda","inc") - cc_flags += f" -DHAS_CUDA -I\"{cuda_include}\" -I\"{cuda_include2}\" " + cc_flags += f" -DHAS_CUDA -DIS_CUDA -I\"{cuda_include}\" -I\"{cuda_include2}\" " if os.name == 'nt': cuda_lib = os.path.abspath(os.path.join(cuda_dir, "..", "lib", "x64")) # cc_flags += f" \"{cuda_lib}\\cudart.lib\" " @@ -1212,6 +1219,14 @@ if has_cuda: return nvcc_flags nvcc_flags = convert_nvcc_flags(nvcc_flags) +# from .acl_compiler import check_acl +from .extern.acl import acl_compiler +jit_utils.add_backend(acl_compiler) + +for mod in jit_utils.backends: + if mod.check(): + break + # build core gen_jit_flags() gen_jit_tests() @@ -1236,6 +1251,8 @@ files4 = [ f[len(jittor_path)+1:] for f in files4 ] # files4 = run_cmd('find -L src | grep '+grep_args, jittor_path).splitlines() at_beginning = [ "src/ops/op_utils.cc", + "src/ops/op_register.cc", + "src/init.cc", "src/event_queue.cc", "src/mem/allocator/sfrl_allocator.cc", "src/mem/allocator.cc", diff --git a/python/jittor/dataset/dataset.py b/python/jittor/dataset/dataset.py index a122040b..f4fc06ba 100644 --- a/python/jittor/dataset/dataset.py +++ b/python/jittor/dataset/dataset.py @@ -21,8 +21,9 @@ import signal from jittor_utils import LOG import jittor as jt import time +import jittor_utils as jit_utils -dataset_root = os.path.join(pathlib.Path.home(), ".cache", "jittor", "dataset") +dataset_root = os.path.join(jit_utils.home(), ".cache", "jittor", "dataset") mp_log_v = 
os.environ.get("mp_log_v", 0) mpi = jt.mpi img_open_hook = HookTimer(Image, "open") diff --git a/python/jittor/demo/simple_cgan.py b/python/jittor/demo/simple_cgan.py new file mode 100644 index 00000000..8e984e3c --- /dev/null +++ b/python/jittor/demo/simple_cgan.py @@ -0,0 +1,107 @@ +import jittor as jt +from jittor import nn +import numpy as np +# import pylab as pl + +# 隐空间向量长度 +latent_dim = 100 +# 类别数量 +n_classes = 10 +# 图片大小 +img_size = 32 +# 图片通道数量 +channels = 1 +# 图片张量的形状 +img_shape = (channels, img_size, img_size) + +class Generator(nn.Module): + def __init__(self): + super(Generator, self).__init__() + self.label_emb = nn.Embedding(n_classes, n_classes) + + def block(in_feat, out_feat, normalize=True): + layers = [nn.Linear(in_feat, out_feat)] + if normalize: + layers.append(nn.BatchNorm1d(out_feat, 0.8)) + layers.append(nn.LeakyReLU(0.2)) + return layers + self.model = nn.Sequential( + *block((latent_dim + n_classes), 128, normalize=False), + *block(128, 256), + *block(256, 512), + *block(512, 1024), + nn.Linear(1024, int(np.prod(img_shape))), + nn.Tanh()) + + def execute(self, noise, labels): + gen_input = jt.contrib.concat((self.label_emb(labels), noise), dim=1) + img = self.model(gen_input) + img = img.view((img.shape[0], *img_shape)) + return img + +class Discriminator(nn.Module): + def __init__(self): + super(Discriminator, self).__init__() + self.label_embedding = nn.Embedding(n_classes, n_classes) + self.model = nn.Sequential( + nn.Linear((n_classes + int(np.prod(img_shape))), 512), + nn.LeakyReLU(0.2), + nn.Linear(512, 512), + nn.Dropout(0.4), + nn.LeakyReLU(0.2), + nn.Linear(512, 512), + nn.Dropout(0.4), + nn.LeakyReLU(0.2), + nn.Linear(512, 1)) + + def execute(self, img, labels): + d_in = jt.contrib.concat((img.view((img.shape[0], (- 1))), self.label_embedding(labels)), dim=1) + validity = self.model(d_in) + return validity + + +# 定义模型 +generator = Generator() +discriminator = Discriminator() +generator.eval() +discriminator.eval() + +# 加载参数 +generator.load('https://cg.cs.tsinghua.edu.cn/jittor/assets/build/generator_last.pkl') +discriminator.load('https://cg.cs.tsinghua.edu.cn/jittor/assets/build/discriminator_last.pkl') + + + +def gen_img(number): + print(number, type(number)) + n_row = len(number) + z = jt.array(np.random.normal(0, 1, (n_row, latent_dim))).float32().stop_grad() + labels = jt.array(np.array([int(number[num]) for num in range(n_row)])).float32().stop_grad() + gen_imgs = generator(z,labels) + gen_imgs = gen_imgs.transpose((1,2,0,3)).reshape(gen_imgs.shape[2], -1) + gen_imgs = gen_imgs[:,:,None].broadcast(gen_imgs.shape+(3,)) # .uint8() + gen_imgs = (gen_imgs - gen_imgs.min()) / (gen_imgs.max() - gen_imgs.min()) * 255 + gen_imgs = gen_imgs.uint8() + # print(gen_imgs.shape, gen_imgs.max(), gen_imgs.min()) + return gen_imgs.numpy() + # gen_imgs = gen_imgs.data.transpose((1,2,0,3))[0].reshape((gen_imgs.shape[2], -1)) + # print(gen_imgs.shape) + return gen_imgs[:,:,None] + +from PIL import Image +import pywebio as pw +# 定义一串数字 +number = "201962517" +# gen_img(number) +Image.fromarray(gen_img(number)) +# pl.imshow() +# pl.show() +# print("done") + + +def web_server(): + pw.pin.put_input("number", label="输入用于生成的数字(由计图框架支持):") + pw.output.put_buttons(['Gen image'], + lambda _: pw.output.put_image(Image.fromarray(gen_img(pw.pin.pin.number)))) + +pw.start_server(web_server, port=8123) \ No newline at end of file diff --git a/python/jittor/extern/acl/acl_compiler.py b/python/jittor/extern/acl/acl_compiler.py new file mode 100644 index 00000000..289ea67c --- 
/dev/null +++ b/python/jittor/extern/acl/acl_compiler.py @@ -0,0 +1,54 @@ +# *************************************************************** +# Copyright (c) 2021 Jittor. All Rights Reserved. +# Maintainers: Dun Liang . +# This file is subject to the terms and conditions defined in +# file 'LICENSE.txt', which is part of this source code package. +# *************************************************************** +import os +from jittor_utils import env_or_try_find +import jittor_utils +import ctypes +import glob + +has_acl = 0 +cc_flags = "" +tikcc_path = env_or_try_find('tikcc_path', 'tikcc') +dlopen_flags = os.RTLD_NOW | os.RTLD_GLOBAL + +def install(): + import jittor.compiler as compiler + global has_acl, cc_flags + acl_compiler_home = os.path.dirname(__file__) + cc_files = sorted(glob.glob(acl_compiler_home+"/**/*.cc", recursive=True)) + cc_flags += f" -DHAS_CUDA -DIS_ACL -I/usr/local/Ascend/runtime/include -I/usr/local/Ascend/driver/include -L/usr/local/Ascend/compiler/lib64 -L/usr/local/Ascend/runtime/lib64 -I{acl_compiler_home} -ltikc_runtime -lascendcl " + ctypes.CDLL("libascendcl.so", dlopen_flags) + jittor_utils.LOG.i("ACL detected") + + mod = jittor_utils.compile_module(''' +#include "common.h" +namespace jittor { +// @pyjt(process) +string process_acl(const string& src, const string& name, const map& kargs); +}''', compiler.cc_flags + " " + " ".join(cc_files) + cc_flags) + jittor_utils.process_jittor_source("acl", mod.process) + + has_acl = 1 + + +def check(): + import jittor.compiler as compiler + global has_acl, cc_flags + if tikcc_path: + try: + install() + except Exception as e: + jittor_utils.LOG.w(f"load ACL failed, exception: {e}") + has_acl = 0 + compiler.has_acl = has_acl + compiler.tikcc_path = tikcc_path + if not has_acl: return False + compiler.cc_flags += cc_flags + compiler.nvcc_path = tikcc_path + compiler.nvcc_flags = compiler.cc_flags.replace("-std=c++14","") + return True + diff --git a/python/jittor/extern/acl/acl_error_code.cc b/python/jittor/extern/acl/acl_error_code.cc new file mode 100644 index 00000000..d6163185 --- /dev/null +++ b/python/jittor/extern/acl/acl_error_code.cc @@ -0,0 +1,228 @@ +// *************************************************************** +// Copyright (c) 2021 Jittor. All Rights Reserved. +// Maintainers: Dun Liang . +// This file is subject to the terms and conditions defined in +// file 'LICENSE.txt', which is part of this source code package. 
+// *************************************************************** + +#include "common.h" +using std::string; +using std::unordered_map; + +typedef int aclError; + +static inline unordered_map gen_map(string s) { + unordered_map smap; + for (int i=0; i acl_error_map = gen_map(R"( +// from acl_base.h +static const int ACL_ERROR_INVALID_PARAM = 100000; +static const int ACL_ERROR_UNINITIALIZE = 100001; +static const int ACL_ERROR_REPEAT_INITIALIZE = 100002; +static const int ACL_ERROR_INVALID_FILE = 100003; +static const int ACL_ERROR_WRITE_FILE = 100004; +static const int ACL_ERROR_INVALID_FILE_SIZE = 100005; +static const int ACL_ERROR_PARSE_FILE = 100006; +static const int ACL_ERROR_FILE_MISSING_ATTR = 100007; +static const int ACL_ERROR_FILE_ATTR_INVALID = 100008; +static const int ACL_ERROR_INVALID_DUMP_CONFIG = 100009; +static const int ACL_ERROR_INVALID_PROFILING_CONFIG = 100010; +static const int ACL_ERROR_INVALID_MODEL_ID = 100011; +static const int ACL_ERROR_DESERIALIZE_MODEL = 100012; +static const int ACL_ERROR_PARSE_MODEL = 100013; +static const int ACL_ERROR_READ_MODEL_FAILURE = 100014; +static const int ACL_ERROR_MODEL_SIZE_INVALID = 100015; +static const int ACL_ERROR_MODEL_MISSING_ATTR = 100016; +static const int ACL_ERROR_MODEL_INPUT_NOT_MATCH = 100017; +static const int ACL_ERROR_MODEL_OUTPUT_NOT_MATCH = 100018; +static const int ACL_ERROR_MODEL_NOT_DYNAMIC = 100019; +static const int ACL_ERROR_OP_TYPE_NOT_MATCH = 100020; +static const int ACL_ERROR_OP_INPUT_NOT_MATCH = 100021; +static const int ACL_ERROR_OP_OUTPUT_NOT_MATCH = 100022; +static const int ACL_ERROR_OP_ATTR_NOT_MATCH = 100023; +static const int ACL_ERROR_OP_NOT_FOUND = 100024; +static const int ACL_ERROR_OP_LOAD_FAILED = 100025; +static const int ACL_ERROR_UNSUPPORTED_DATA_TYPE = 100026; +static const int ACL_ERROR_FORMAT_NOT_MATCH = 100027; +static const int ACL_ERROR_BIN_SELECTOR_NOT_REGISTERED = 100028; +static const int ACL_ERROR_KERNEL_NOT_FOUND = 100029; +static const int ACL_ERROR_BIN_SELECTOR_ALREADY_REGISTERED = 100030; +static const int ACL_ERROR_KERNEL_ALREADY_REGISTERED = 100031; +static const int ACL_ERROR_INVALID_QUEUE_ID = 100032; +static const int ACL_ERROR_REPEAT_SUBSCRIBE = 100033; +static const int ACL_ERROR_STREAM_NOT_SUBSCRIBE = 100034; +static const int ACL_ERROR_THREAD_NOT_SUBSCRIBE = 100035; +static const int ACL_ERROR_WAIT_CALLBACK_TIMEOUT = 100036; +static const int ACL_ERROR_REPEAT_FINALIZE = 100037; +static const int ACL_ERROR_NOT_STATIC_AIPP = 100038; +static const int ACL_ERROR_COMPILING_STUB_MODE = 100039; +static const int ACL_ERROR_GROUP_NOT_SET = 100040; +static const int ACL_ERROR_GROUP_NOT_CREATE = 100041; +static const int ACL_ERROR_PROF_ALREADY_RUN = 100042; +static const int ACL_ERROR_PROF_NOT_RUN = 100043; +static const int ACL_ERROR_DUMP_ALREADY_RUN = 100044; +static const int ACL_ERROR_DUMP_NOT_RUN = 100045; +static const int ACL_ERROR_PROF_REPEAT_SUBSCRIBE = 148046; +static const int ACL_ERROR_PROF_API_CONFLICT = 148047; +static const int ACL_ERROR_INVALID_MAX_OPQUEUE_NUM_CONFIG = 148048; +static const int ACL_ERROR_INVALID_OPP_PATH = 148049; +static const int ACL_ERROR_OP_UNSUPPORTED_DYNAMIC = 148050; +static const int ACL_ERROR_RELATIVE_RESOURCE_NOT_CLEARED = 148051; + +static const int ACL_ERROR_BAD_ALLOC = 200000; +static const int ACL_ERROR_API_NOT_SUPPORT = 200001; +static const int ACL_ERROR_INVALID_DEVICE = 200002; +static const int ACL_ERROR_MEMORY_ADDRESS_UNALIGNED = 200003; +static const int ACL_ERROR_RESOURCE_NOT_MATCH = 200004; +static const int 
ACL_ERROR_INVALID_RESOURCE_HANDLE = 200005; +static const int ACL_ERROR_FEATURE_UNSUPPORTED = 200006; +static const int ACL_ERROR_PROF_MODULES_UNSUPPORTED = 200007; + +static const int ACL_ERROR_STORAGE_OVER_LIMIT = 300000; + +static const int ACL_ERROR_INTERNAL_ERROR = 500000; +static const int ACL_ERROR_FAILURE = 500001; +static const int ACL_ERROR_GE_FAILURE = 500002; +static const int ACL_ERROR_RT_FAILURE = 500003; +static const int ACL_ERROR_DRV_FAILURE = 500004; +static const int ACL_ERROR_PROFILING_FAILURE = 500005; + +// from ge_error_codes.h +static const uint32_t ACL_ERROR_GE_PARAM_INVALID = 145000U; +static const uint32_t ACL_ERROR_GE_EXEC_NOT_INIT = 145001U; +static const uint32_t ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID = 145002U; +static const uint32_t ACL_ERROR_GE_EXEC_MODEL_ID_INVALID = 145003U; +static const uint32_t ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID = 145006U; +static const uint32_t ACL_ERROR_GE_EXEC_MODEL_ADDR_INVALID = 145007U; +static const uint32_t ACL_ERROR_GE_EXEC_MODEL_QUEUE_ID_INVALID = 145008U; +static const uint32_t ACL_ERROR_GE_EXEC_LOAD_MODEL_REPEATED = 145009U; +static const uint32_t ACL_ERROR_GE_DYNAMIC_INPUT_ADDR_INVALID = 145011U; +static const uint32_t ACL_ERROR_GE_DYNAMIC_INPUT_LENGTH_INVALID = 145012U; +static const uint32_t ACL_ERROR_GE_DYNAMIC_BATCH_SIZE_INVALID = 145013U; +static const uint32_t ACL_ERROR_GE_AIPP_BATCH_EMPTY = 145014U; +static const uint32_t ACL_ERROR_GE_AIPP_NOT_EXIST = 145015U; +static const uint32_t ACL_ERROR_GE_AIPP_MODE_INVALID = 145016U; +static const uint32_t ACL_ERROR_GE_OP_TASK_TYPE_INVALID = 145017U; +static const uint32_t ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID = 145018U; +static const uint32_t ACL_ERROR_GE_PLGMGR_PATH_INVALID = 145019U; +static const uint32_t ACL_ERROR_GE_FORMAT_INVALID = 145020U; +static const uint32_t ACL_ERROR_GE_SHAPE_INVALID = 145021U; +static const uint32_t ACL_ERROR_GE_DATATYPE_INVALID = 145022U; +static const uint32_t ACL_ERROR_GE_MEMORY_ALLOCATION = 245000U; +static const uint32_t ACL_ERROR_GE_MEMORY_OPERATE_FAILED = 245001U; +static const uint32_t ACL_ERROR_GE_INTERNAL_ERROR = 545000U; +static const uint32_t ACL_ERROR_GE_LOAD_MODEL = 545001U; +static const uint32_t ACL_ERROR_GE_EXEC_LOAD_MODEL_PARTITION_FAILED = 545002U; +static const uint32_t ACL_ERROR_GE_EXEC_LOAD_WEIGHT_PARTITION_FAILED = 545003U; +static const uint32_t ACL_ERROR_GE_EXEC_LOAD_TASK_PARTITION_FAILED = 545004U; +static const uint32_t ACL_ERROR_GE_EXEC_LOAD_KERNEL_PARTITION_FAILED = 545005U; +static const uint32_t ACL_ERROR_GE_EXEC_RELEASE_MODEL_DATA = 545006U; +static const uint32_t ACL_ERROR_GE_COMMAND_HANDLE = 545007U; +static const uint32_t ACL_ERROR_GE_GET_TENSOR_INFO = 545008U; +static const uint32_t ACL_ERROR_GE_UNLOAD_MODEL = 545009U; + + +static const int32_t ACL_ERROR_RT_PARAM_INVALID = 107000; // param invalid +static const int32_t ACL_ERROR_RT_INVALID_DEVICEID = 107001; // invalid device id +static const int32_t ACL_ERROR_RT_CONTEXT_NULL = 107002; // current context null +static const int32_t ACL_ERROR_RT_STREAM_CONTEXT = 107003; // stream not in current context +static const int32_t ACL_ERROR_RT_MODEL_CONTEXT = 107004; // model not in current context +static const int32_t ACL_ERROR_RT_STREAM_MODEL = 107005; // stream not in model +static const int32_t ACL_ERROR_RT_EVENT_TIMESTAMP_INVALID = 107006; // event timestamp invalid +static const int32_t ACL_ERROR_RT_EVENT_TIMESTAMP_REVERSAL = 107007; // event timestamp reversal +static const int32_t ACL_ERROR_RT_ADDR_UNALIGNED = 107008; // memory address unaligned +static 
const int32_t ACL_ERROR_RT_FILE_OPEN = 107009; // open file failed +static const int32_t ACL_ERROR_RT_FILE_WRITE = 107010; // write file failed +static const int32_t ACL_ERROR_RT_STREAM_SUBSCRIBE = 107011; // error subscribe stream +static const int32_t ACL_ERROR_RT_THREAD_SUBSCRIBE = 107012; // error subscribe thread +static const int32_t ACL_ERROR_RT_GROUP_NOT_SET = 107013; // group not set +static const int32_t ACL_ERROR_RT_GROUP_NOT_CREATE = 107014; // group not create +static const int32_t ACL_ERROR_RT_STREAM_NO_CB_REG = 107015; // callback not register to stream +static const int32_t ACL_ERROR_RT_INVALID_MEMORY_TYPE = 107016; // invalid memory type +static const int32_t ACL_ERROR_RT_INVALID_HANDLE = 107017; // invalid handle +static const int32_t ACL_ERROR_RT_INVALID_MALLOC_TYPE = 107018; // invalid malloc type +static const int32_t ACL_ERROR_RT_WAIT_TIMEOUT = 107019; // wait timeout + +static const int32_t ACL_ERROR_RT_FEATURE_NOT_SUPPORT = 207000; // feature not support +static const int32_t ACL_ERROR_RT_MEMORY_ALLOCATION = 207001; // memory allocation error +static const int32_t ACL_ERROR_RT_MEMORY_FREE = 207002; // memory free error +static const int32_t ACL_ERROR_RT_AICORE_OVER_FLOW = 207003; // aicore over flow +static const int32_t ACL_ERROR_RT_NO_DEVICE = 207004; // no device +static const int32_t ACL_ERROR_RT_RESOURCE_ALLOC_FAIL = 207005; // resource alloc fail +static const int32_t ACL_ERROR_RT_NO_PERMISSION = 207006; // no permission +static const int32_t ACL_ERROR_RT_NO_EVENT_RESOURCE = 207007; // no event resource +static const int32_t ACL_ERROR_RT_NO_STREAM_RESOURCE = 207008; // no stream resource +static const int32_t ACL_ERROR_RT_NO_NOTIFY_RESOURCE = 207009; // no notify resource +static const int32_t ACL_ERROR_RT_NO_MODEL_RESOURCE = 207010; // no model resource +static const int32_t ACL_ERROR_RT_NO_CDQ_RESOURCE = 207011; // no cdq resource +static const int32_t ACL_ERROR_RT_OVER_LIMIT = 207012; // over limit +static const int32_t ACL_ERROR_RT_QUEUE_EMPTY = 207013; // queue is empty +static const int32_t ACL_ERROR_RT_QUEUE_FULL = 207014; // queue is full +static const int32_t ACL_ERROR_RT_REPEATED_INIT = 207015; // repeated init +static const int32_t ACL_ERROR_RT_AIVEC_OVER_FLOW = 207016; // aivec over flow + +static const int32_t ACL_ERROR_RT_INTERNAL_ERROR = 507000; // runtime internal error +static const int32_t ACL_ERROR_RT_TS_ERROR = 507001; // ts internel error +static const int32_t ACL_ERROR_RT_STREAM_TASK_FULL = 507002; // task full in stream +static const int32_t ACL_ERROR_RT_STREAM_TASK_EMPTY = 507003; // task empty in stream +static const int32_t ACL_ERROR_RT_STREAM_NOT_COMPLETE = 507004; // stream not complete +static const int32_t ACL_ERROR_RT_END_OF_SEQUENCE = 507005; // end of sequence +static const int32_t ACL_ERROR_RT_EVENT_NOT_COMPLETE = 507006; // event not complete +static const int32_t ACL_ERROR_RT_CONTEXT_RELEASE_ERROR = 507007; // context release error +static const int32_t ACL_ERROR_RT_SOC_VERSION = 507008; // soc version error +static const int32_t ACL_ERROR_RT_TASK_TYPE_NOT_SUPPORT = 507009; // task type not support +static const int32_t ACL_ERROR_RT_LOST_HEARTBEAT = 507010; // ts lost heartbeat +static const int32_t ACL_ERROR_RT_MODEL_EXECUTE = 507011; // model execute failed +static const int32_t ACL_ERROR_RT_REPORT_TIMEOUT = 507012; // report timeout +static const int32_t ACL_ERROR_RT_SYS_DMA = 507013; // sys dma error +static const int32_t ACL_ERROR_RT_AICORE_TIMEOUT = 507014; // aicore timeout +static const int32_t 
ACL_ERROR_RT_AICORE_EXCEPTION = 507015; // aicore exception +static const int32_t ACL_ERROR_RT_AICORE_TRAP_EXCEPTION = 507016; // aicore trap exception +static const int32_t ACL_ERROR_RT_AICPU_TIMEOUT = 507017; // aicpu timeout +static const int32_t ACL_ERROR_RT_AICPU_EXCEPTION = 507018; // aicpu exception +static const int32_t ACL_ERROR_RT_AICPU_DATADUMP_RSP_ERR = 507019; // aicpu datadump response error +static const int32_t ACL_ERROR_RT_AICPU_MODEL_RSP_ERR = 507020; // aicpu model operate response error +static const int32_t ACL_ERROR_RT_PROFILING_ERROR = 507021; // profiling error +static const int32_t ACL_ERROR_RT_IPC_ERROR = 507022; // ipc error +static const int32_t ACL_ERROR_RT_MODEL_ABORT_NORMAL = 507023; // model abort normal +static const int32_t ACL_ERROR_RT_KERNEL_UNREGISTERING = 507024; // kernel unregistering +static const int32_t ACL_ERROR_RT_RINGBUFFER_NOT_INIT = 507025; // ringbuffer not init +static const int32_t ACL_ERROR_RT_RINGBUFFER_NO_DATA = 507026; // ringbuffer no data +static const int32_t ACL_ERROR_RT_KERNEL_LOOKUP = 507027; // kernel lookup error +static const int32_t ACL_ERROR_RT_KERNEL_DUPLICATE = 507028; // kernel register duplicate +static const int32_t ACL_ERROR_RT_DEBUG_REGISTER_FAIL = 507029; // debug register failed +static const int32_t ACL_ERROR_RT_DEBUG_UNREGISTER_FAIL = 507030; // debug unregister failed +static const int32_t ACL_ERROR_RT_LABEL_CONTEXT = 507031; // label not in current context +static const int32_t ACL_ERROR_RT_PROGRAM_USE_OUT = 507032; // program register num use out +static const int32_t ACL_ERROR_RT_DEV_SETUP_ERROR = 507033; // device setup error +static const int32_t ACL_ERROR_RT_VECTOR_CORE_TIMEOUT = 507034; // vector core timeout +static const int32_t ACL_ERROR_RT_VECTOR_CORE_EXCEPTION = 507035; // vector core exception +static const int32_t ACL_ERROR_RT_VECTOR_CORE_TRAP_EXCEPTION = 507036; // vector core trap exception +static const int32_t ACL_ERROR_RT_CDQ_BATCH_ABNORMAL = 507037; // cdq alloc batch abnormal +static const int32_t ACL_ERROR_RT_DIE_MODE_CHANGE_ERROR = 507038; // can not change die mode +static const int32_t ACL_ERROR_RT_DIE_SET_ERROR = 507039; // single die mode can not set die +static const int32_t ACL_ERROR_RT_INVALID_DIEID = 507040; // invalid die id +static const int32_t ACL_ERROR_RT_DIE_MODE_NOT_SET = 507041; // die mode not set + +static const int32_t ACL_ERROR_RT_DRV_INTERNAL_ERROR = 507899; // drv internal error +static const int32_t ACL_ERROR_RT_AICPU_INTERNAL_ERROR = 507900; // aicpu internal error +static const int32_t ACL_ERROR_RT_SOCKET_CLOSE = 507901; // hdc disconnect + +)"); + if (acl_error_map.count(error)) + return acl_error_map[error]; + return "unknown " + std::to_string((int)error); +} \ No newline at end of file diff --git a/python/jittor/extern/acl/acl_jittor.cc b/python/jittor/extern/acl/acl_jittor.cc new file mode 100644 index 00000000..a48d39e6 --- /dev/null +++ b/python/jittor/extern/acl/acl_jittor.cc @@ -0,0 +1,186 @@ +// *************************************************************** +// Copyright (c) 2021 Jittor. All Rights Reserved. +// Maintainers: Dun Liang . +// This file is subject to the terms and conditions defined in +// file 'LICENSE.txt', which is part of this source code package. 
+// *************************************************************** +#include "acl_jittor.h" +#include "utils/str_utils.h" +#include +#include + +namespace jittor { + +uint64_t acl_jittor_tid; +int acl_jittor_thread_running=0; +aclrtContext acl_jittor_context; + +#define CHECK_ACL(x) ASSERTop(x,==,0) + +static void* acl_jittor_process_callback(void*) { + acl_jittor_thread_running = 1; + int deviceId = 0; + CHECK_ACL(aclrtSetCurrentContext(acl_jittor_context)); + + while (acl_jittor_thread_running) { + // LOGir << "acl_jittor_process_callback"; + auto ret = aclrtProcessReport(1000); + if (ret) { + if (acl_jittor_thread_running && ret != ACL_ERROR_RT_REPORT_TIMEOUT) + LOGir << "aclrtProcessReport:" << ret << acl_error_to_string(ret); + break; + } + } + acl_jittor_thread_running = 0; + return (void*)0; +} + +// void aaa(void*) { +// LOGir << "haha"; +// } + +struct acl_jittor_initer { + +acl_jittor_initer() { + CHECK_ACL(aclInit(nullptr)); + uint device_count = 0; + // 获取可用的Device数量 + CHECK_ACL(aclrtGetDeviceCount(&device_count)); + LOGi << "Found ACL device number:" << device_count; + CHECK_ACL(aclrtSetDevice(0)); + CHECK_ACL(aclrtCreateContext(&acl_jittor_context, 0)); + CHECK_ACL(aclrtSetCurrentContext(acl_jittor_context)); + + pthread_create(&acl_jittor_tid, nullptr, acl_jittor_process_callback, 0); + + // subscribe for default stream + CHECK_ACL(aclrtSubscribeReport(acl_jittor_tid,0)); + + // simple callback test + // aclrtStream stream; + // CHECK_ACL(aclrtCreateStream(&stream)); + // CHECK_ACL(aclrtSubscribeReport(acl_jittor_tid,stream)); + // CHECK_ACL(aclrtLaunchCallback((aclrtCallback)&aaa, 0, ACL_CALLBACK_NO_BLOCK, stream)); + // CHECK_ACL(aclrtLaunchCallback((aclrtCallback)&aaa, 0, ACL_CALLBACK_NO_BLOCK, 0)); +} + +~acl_jittor_initer() { + acl_jittor_thread_running = 0; + CHECK_ACL(aclrtUnSubscribeReport(acl_jittor_tid,0)); + CHECK_ACL(aclrtDestroyContext(acl_jittor_context)); + CHECK_ACL(aclFinalize()); +} + +} _acl_jittor_initer; + +string process_acl(const string& src, const string& name, const map& kargs) { + auto tokens = token_split(src); + int edit = 0; + for (int i=0; i=5 && token[4] >= 'A' && token[4] <= 'Z') { + if (token == "cudaGetDeviceCount") { + token_replace(tokens, i, "($1);", "((uint*)$1);"); + } else if (token == "cudaLaunchHostFunc") { + // ACL_CALLBACK_BLOCK for 310 + token_replace(tokens, i, "LaunchHostFunc($1,$2,$3)", + "LaunchCallback($2,$3,ACL_CALLBACK_NO_BLOCK,$1)"); + } else if (token == "cudaMemcpy") + token_replace(tokens, i, "cudaMemcpy($1,$2,$3,", + "aclrtMemcpy($1,$3,$2,$3,"); + else if (token == "cudaMemcpyAsync") + token_replace(tokens, i, "cudaMemcpyAsync($1,$2,$3,", + "aclrtMemcpyAsync($1,$3,$2,$3,"); + else if (token == "cudaMemcpyDeviceToHost") token = "ACL_MEMCPY_DEVICE_TO_HOST"; + else if (token == "cudaMemcpyHostToDevice") token = "ACL_MEMCPY_HOST_TO_DEVICE"; + else if (token == "cudaMemcpyDeviceToDevice") token = "ACL_MEMCPY_DEVICE_TO_DEVICE"; + else if (token == "cudaMallocManaged" || token == "cudaMalloc") { + // unified address not supported + token = "aclrtMalloc"; + token_replace(tokens, i, "($1,$2)", + "($1,$2,ACL_MEM_MALLOC_HUGE_FIRST)"); + } else if (token == "cudaMemGetInfo") + token_replace(tokens, i, "cudaMemGetInfo($1,$2)", + "aclrtGetMemInfo(ACL_DDR_MEM,$1,$2)"); + else if (token == "cudaGetLastError") + token_replace(tokens, i, "cudaGetLastError()", "0"); + else if (token == "cudaStreamCreateWithFlags") + token_replace(tokens, i-1, + "(cudaStreamCreateWithFlags($1,$2));", + "(aclrtCreateStream($1)); 
checkAclErrors(aclrtSubscribeReport(acl_jittor_tid,*$1));"); + else if (token == "cudaEventCreate") + token_replace(tokens, i, + "cudaEventCreate($1,$2)", + "aclrtCreateEvent($1)"); + else if (token == "cudaDeviceSynchronize") + token = "aclrtSynchronizeDevice"; + else if (token == "cudaStreamDestroy") + token_replace(tokens, i, "cudaStreamDestroy($1)", + "(aclrtUnSubscribeReport(acl_jittor_tid,$1), aclrtDestroyStream($1))"); + else if (token == "cudaEventDestroy") + token = "aclrtDestroyEvent"; + else if (token == "cudaEventRecord") + token = "aclrtRecordEvent"; + else if (token == "cudaStreamWaitEvent") + token_replace(tokens, i, + "cudaStreamWaitEvent($1,$2,$3)", + "aclrtStreamWaitEvent($1,$2)"); + + if (token.size() && token[0] == 'c') + token = "aclrt" + token.substr(4); + if (endswith(token, "_t")) + token = token.substr(0, token.size()-2); + edit ++; + } + } else + if (token == "_cudaGetErrorEnum") { + token_replace(tokens, i, "_cudaGetErrorEnum($1)", "(acl_error_to_string($1))"); + edit ++; + } else + if (token == "checkCudaErrors") + token = "checkAclErrors"; + else if (token == "JPU") { + edit ++; + string new_code; + if (tokens[i+2] == "op_compiler") + token_replace(tokens, i, + "JPU(op_compiler($1,$2,$3))", + "acl_jittor_op_compiler($1,$2,$3)"); + else if (tokens[i+2] == "header") + new_code = "#include \"acl_jittor.h\""; + if (new_code.size()) + token_replace(tokens, i, "JPU($1)", new_code); + } else if (token == "use_cuda_managed_allocator" && tokens[i+1][0]==',') { + tokens[i+2] = "0"; // disable unified address + } + } + if (!edit) return src; + return join(tokens, ""); +} + +void acl_jittor_op_compiler(string& filename, string& src, bool is_acl) { + if (!is_acl) return; + filename = replace(filename, ".cc", ".tikcc"); + // LOGir << filename; + string new_src = process_acl(src, "", {}); + new_src = replace(new_src, R"(#include "misc/cuda_atomic.h")", ""); + new_src = replace(new_src, R"(#include "misc/cuda_limits.h")", ""); + new_src = replace(new_src, "__global__", "__ai_device_entry__"); + new_src = token_replace(new_src, "__launch_bounds__($1)", ""); + new_src = token_replace(new_src, "int thread_num = $1;", "int thread_num = 1;"); + new_src = token_replace(new_src, "tn0=std::max(tn0, $1);", ""); + new_src = token_replace(new_src, "<<<$1,$2>>>", "<<<1,0>>>"); + new_src = token_replace(new_src, "int thread_id = $1;", "int thread_id = 1;"); + // for inc error + new_src = token_replace(new_src, "for ($1+=$2)", "for ($1++)"); + // bit op error + new_src = token_replace(new_src, "int tnum$1;", ""); + new_src = token_replace(new_src, "int tid$1=$2;", "int tid$1=0;"); + src = new_src; + // auto tokens = token_split(new_src); +} + +} diff --git a/python/jittor/extern/acl/acl_jittor.h b/python/jittor/extern/acl/acl_jittor.h new file mode 100644 index 00000000..cda85d93 --- /dev/null +++ b/python/jittor/extern/acl/acl_jittor.h @@ -0,0 +1,19 @@ +// *************************************************************** +// Copyright (c) 2021 Jittor. All Rights Reserved. +// Maintainers: Dun Liang . +// This file is subject to the terms and conditions defined in +// file 'LICENSE.txt', which is part of this source code package. 
+// *************************************************************** +#pragma once +#include "common.h" +#include + +std::string acl_error_to_string(aclError error); + +namespace jittor { + +EXTERN_LIB uint64_t acl_jittor_tid; + +void acl_jittor_op_compiler(string& filename, string& src, bool is_acl); + +} diff --git a/python/jittor/extern/cuda/cublas/inc/cublas_wrapper.h b/python/jittor/extern/cuda/cublas/inc/cublas_wrapper.h index 67d46a69..5ff0de2c 100644 --- a/python/jittor/extern/cuda/cublas/inc/cublas_wrapper.h +++ b/python/jittor/extern/cuda/cublas/inc/cublas_wrapper.h @@ -23,8 +23,8 @@ EXTERN_LIB cublasHandle_t cublas_handle; static inline cudaDataType get_dtype(NanoString dtype) { if (dtype == ns_float32) return CUDA_R_32F; - // if (dtype == ns_float64) return CUDA_R_64F; - // if (dtype == ns_float16) return CUDA_R_16F; + if (dtype == ns_float64) return CUDA_R_64F; + if (dtype == ns_float16) return CUDA_R_16F; LOGf << "not support type" << dtype; return CUDA_R_32F; } diff --git a/python/jittor/extern/cuda/cublas/ops/cublas_batched_matmul_op.cc b/python/jittor/extern/cuda/cublas/ops/cublas_batched_matmul_op.cc index 6ab519a0..6f93e343 100644 --- a/python/jittor/extern/cuda/cublas/ops/cublas_batched_matmul_op.cc +++ b/python/jittor/extern/cuda/cublas/ops/cublas_batched_matmul_op.cc @@ -89,7 +89,7 @@ void CublasBatchedMatmulOp::jit_prepare(JK& jk) { jk << _CS("[T:") << a->dtype(); jk << _CS("][Trans_a:") << (trans_a ? 'T' : 'N'); jk << _CS("][Trans_b:") << (trans_b ? 'T' : 'N'); - jk << _CS("][op:") << (a->dtype().dsize() == 4 ? 'S' : 'D'); + jk << _CS("][op:") << (a->dtype().dsize() == 2? 'H' : (a->dtype().dsize() == 4 ? 'S' : 'D')); jk << ']'; } @@ -124,6 +124,22 @@ void CublasBatchedMatmulOp::jit_run() { if (use_tensorcore) { computeType = CUBLAS_COMPUTE_32F_FAST_16F; } + if (a->dtype() == ns_float16 + || b->dtype() == ns_float16 || c->dtype() == ns_float16) { + computeType = CUBLAS_COMPUTE_16F; + } + #else + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; + cudaDataType_t computeType = CUDA_R_32F; + if (use_tensorcore) { + algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; + } + if (a->dtype() == ns_float16 + || b->dtype() == ns_float16 || c->dtype() == ns_float16) { + computeType = CUDA_R_16F; + algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; + } + #endif checkCudaErrors(cublasGemmStridedBatchedEx(handle_, CUBLAS_OP_@Trans_b, CUBLAS_OP_@Trans_a, k, n, m, &alpha, @@ -131,15 +147,13 @@ void CublasBatchedMatmulOp::jit_run() { a->ptr(),get_dtype(a->dtype()), '@Trans_a' == 'N' ? m : n, n * m, &beta, c->ptr(),get_dtype(c->dtype()), k, k * n, batch_size,computeType,algo)); - #else - checkCudaErrors(cublas@op@@gemmStridedBatched(handle_, - CUBLAS_OP_@Trans_b, CUBLAS_OP_@Trans_a, - k, n, m, &alpha, - b->ptr(), '@Trans_b' == 'N' ? k : m, k * m, - a->ptr(), '@Trans_a' == 'N' ? m : n, n * m, &beta, - c->ptr(), k, k * n, - batch_size)); - #endif + // checkCudaErrors(cublas@op@@gemmStridedBatched(handle_, + // CUBLAS_OP_@Trans_b, CUBLAS_OP_@Trans_a, + // k, n, m, &alpha, + // b->ptr(), '@Trans_b' == 'N' ? k : m, k * m, + // a->ptr(), '@Trans_a' == 'N' ? 
m : n, n * m, &beta, + // c->ptr(), k, k * n, + // batch_size)); } #endif #endif // JIT diff --git a/python/jittor/extern/cuda/cublas/ops/cublas_matmul_op.cc b/python/jittor/extern/cuda/cublas/ops/cublas_matmul_op.cc index 0ed46bc4..a6708225 100644 --- a/python/jittor/extern/cuda/cublas/ops/cublas_matmul_op.cc +++ b/python/jittor/extern/cuda/cublas/ops/cublas_matmul_op.cc @@ -50,7 +50,7 @@ void CublasMatmulOp::jit_prepare(JK& jk) { jk << _CS("[T:") << a->dtype(); jk << _CS("][Trans_a:") << (trans_a ? 'T' : 'N'); jk << _CS("][Trans_b:") << (trans_b ? 'T' : 'N'); - jk << _CS("][op:") << (a->dtype().dsize() == 4 ? 'S' : 'D'); + jk << _CS("][op:") << (a->dtype().dsize() == 2? 'H' : (a->dtype().dsize() == 4 ? 'S' : 'D')); jk << ']'; } @@ -81,6 +81,22 @@ void CublasMatmulOp::jit_run() { if (use_tensorcore) { computeType = CUBLAS_COMPUTE_32F_FAST_16F; } + if (a->dtype() == ns_float16 + || b->dtype() == ns_float16 || c->dtype() == ns_float16) { + computeType = CUBLAS_COMPUTE_16F; + } + #else + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; + cudaDataType_t computeType = CUDA_R_32F; + if (use_tensorcore) { + algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; + } + if (a->dtype() == ns_float16 + || b->dtype() == ns_float16 || c->dtype() == ns_float16) { + computeType = CUDA_R_16F; + algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; + } + #endif checkCudaErrors(cublasGemmEx(handle_, CUBLAS_OP_@Trans_b, CUBLAS_OP_@Trans_a, k, n, m, &alpha, @@ -88,15 +104,13 @@ void CublasMatmulOp::jit_run() { a->ptr(),get_dtype(a->dtype()), '@Trans_a' == 'N' ? m : n, &beta, c->ptr(),get_dtype(c->dtype()), k, computeType, algo)); - #else - checkCudaErrors(cublas@op@@gemm(handle_, - CUBLAS_OP_@Trans_b, CUBLAS_OP_@Trans_a, - k, n, m, &alpha, - b->ptr(), '@Trans_b' == 'N' ? k : m, - a->ptr(), '@Trans_a' == 'N' ? m : n, &beta, - c->ptr(), k)); + // checkCudaErrors(cublas@op@@gemm(handle_, + // CUBLAS_OP_@Trans_b, CUBLAS_OP_@Trans_a, + // k, n, m, &alpha, + // b->ptr(), '@Trans_b' == 'N' ? k : m, + // a->ptr(), '@Trans_a' == 'N' ? 
m : n, &beta, + // c->ptr(), k)); - #endif } #endif // JIT diff --git a/python/jittor/extern/cuda/cudnn/ops/cudnn_conv_op.cc b/python/jittor/extern/cuda/cudnn/ops/cudnn_conv_op.cc index acb9bd52..c495fdb6 100644 --- a/python/jittor/extern/cuda/cudnn/ops/cudnn_conv_op.cc +++ b/python/jittor/extern/cuda/cudnn/ops/cudnn_conv_op.cc @@ -174,6 +174,11 @@ void CudnnConvOp::jit_run() { if(use_tensorcore){ checkCudaErrors( cudnnSetConvolutionMathType(cudnnConvDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION) ); } + + if (x->dtype() == ns_float16 + || y->dtype() == ns_float16 || w->dtype() == ns_float16) { + checkCudaErrors( cudnnSetConvolutionMathType(cudnnConvDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION) ); + } int dimY[] = { (int)y->shape[findc("@YFORMAT", 'a')], // n diff --git a/python/jittor/extern/cuda/cutt/ops/cutt_transpose_op.cc b/python/jittor/extern/cuda/cutt/ops/cutt_transpose_op.cc index 96f68401..fc3da8c9 100644 --- a/python/jittor/extern/cuda/cutt/ops/cutt_transpose_op.cc +++ b/python/jittor/extern/cuda/cutt/ops/cutt_transpose_op.cc @@ -90,7 +90,7 @@ void CuttTransposeOp::jit_run() { for (int i=0; inum==1) { - checkCudaErrors(cudaMemcpyAsync(yp, xp, x->size, cudaMemcpyDefault, 0)); + checkCudaErrors(cudaMemcpyAsync(yp, xp, x->size, cudaMemcpyDeviceToDevice, 0)); return; } JK& jk = get_jk(); diff --git a/python/jittor/extern/cuda/inc/helper_cuda.h b/python/jittor/extern/cuda/inc/helper_cuda.h index bba31f87..fc2e525f 100644 --- a/python/jittor/extern/cuda/inc/helper_cuda.h +++ b/python/jittor/extern/cuda/inc/helper_cuda.h @@ -25,7 +25,9 @@ #include #include +#ifdef IS_CUDA #include +#endif #ifndef EXIT_WAIVED #define EXIT_WAIVED 2 @@ -129,6 +131,9 @@ void check(T result, char const *const func, const char *const file, } } +#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) +#define peekCudaErrors(val) peek((val), #val, __FILE__, __LINE__) + #ifdef __DRIVER_TYPES_H__ // This will output the proper CUDA error strings in the event // that a CUDA host call returns an error diff --git a/python/jittor/extern/mkl/ops/cpu_cnn_inference_f32.cpp b/python/jittor/extern/mkl/ops/cpu_cnn_inference_f32.cpp index 33ea4bdc..4bf1cba4 100644 --- a/python/jittor/extern/mkl/ops/cpu_cnn_inference_f32.cpp +++ b/python/jittor/extern/mkl/ops/cpu_cnn_inference_f32.cpp @@ -47,9 +47,9 @@ #include #include -#include +#include -using namespace mkldnn; +using namespace dnnl; using namespace std; @@ -159,8 +159,8 @@ void simple_net(int times = 100) { if (conv1_prim_desc.src_desc() != user_src_memory.get_desc()) { conv1_src_memory = memory(conv1_prim_desc.src_desc(), eng); net.push_back(reorder(user_src_memory, conv1_src_memory)); - net_args.push_back({ { MKLDNN_ARG_FROM, user_src_memory }, - { MKLDNN_ARG_TO, conv1_src_memory } }); + net_args.push_back({ { DNNL_ARG_FROM, user_src_memory }, + { DNNL_ARG_TO, conv1_src_memory } }); } auto conv1_weights_memory = user_weights_memory; @@ -181,10 +181,10 @@ void simple_net(int times = 100) { /// @snippet cpu_cnn_inference_f32.cpp Create memory for output //[Create convolution primitive] net.push_back(convolution_forward(conv1_prim_desc)); - net_args.push_back({ { MKLDNN_ARG_SRC, conv1_src_memory }, - { MKLDNN_ARG_WEIGHTS, conv1_weights_memory }, - { MKLDNN_ARG_BIAS, conv1_user_bias_memory }, - { MKLDNN_ARG_DST, conv1_dst_memory } }); + net_args.push_back({ { DNNL_ARG_SRC, conv1_src_memory }, + { DNNL_ARG_WEIGHTS, conv1_weights_memory }, + { DNNL_ARG_BIAS, conv1_user_bias_memory }, + { DNNL_ARG_DST, conv1_dst_memory } }); //[Create convolution primitive] // 
AlexNet: relu1 @@ -204,8 +204,8 @@ void simple_net(int times = 100) { auto relu1_prim_desc = eltwise_forward::primitive_desc(relu1_desc, eng); net.push_back(eltwise_forward(relu1_prim_desc)); - net_args.push_back({ { MKLDNN_ARG_SRC, conv1_dst_memory }, - { MKLDNN_ARG_DST, conv1_dst_memory } }); + net_args.push_back({ { DNNL_ARG_SRC, conv1_dst_memory }, + { DNNL_ARG_DST, conv1_dst_memory } }); //[Create relu primitive] // AlexNet: lrn1 @@ -226,8 +226,8 @@ void simple_net(int times = 100) { auto lrn1_dst_memory = memory(lrn1_prim_desc.dst_desc(), eng); net.push_back(lrn_forward(lrn1_prim_desc)); - net_args.push_back({ { MKLDNN_ARG_SRC, conv1_dst_memory }, - { MKLDNN_ARG_DST, lrn1_dst_memory } }); + net_args.push_back({ { DNNL_ARG_SRC, conv1_dst_memory }, + { DNNL_ARG_DST, lrn1_dst_memory } }); // AlexNet: pool1 // {batch, 96, 55, 55} -> {batch, 96, 27, 27} @@ -255,8 +255,8 @@ void simple_net(int times = 100) { auto pool1_dst_memory = memory(pool1_pd.dst_desc(), eng); net.push_back(pooling_forward(pool1_pd)); - net_args.push_back({ { MKLDNN_ARG_SRC, lrn1_dst_memory }, - { MKLDNN_ARG_DST, pool1_dst_memory } }); + net_args.push_back({ { DNNL_ARG_SRC, lrn1_dst_memory }, + { DNNL_ARG_DST, pool1_dst_memory } }); //[Create pooling primitive] // AlexNet: conv2 @@ -296,8 +296,8 @@ void simple_net(int times = 100) { if (conv2_prim_desc.src_desc() != conv2_src_memory.get_desc()) { conv2_src_memory = memory(conv2_prim_desc.src_desc(), eng); net.push_back(reorder(pool1_dst_memory, conv2_src_memory)); - net_args.push_back({ { MKLDNN_ARG_FROM, pool1_dst_memory }, - { MKLDNN_ARG_TO, conv2_src_memory } }); + net_args.push_back({ { DNNL_ARG_FROM, pool1_dst_memory }, + { DNNL_ARG_TO, conv2_src_memory } }); } auto conv2_weights_memory = conv2_user_weights_memory; @@ -312,10 +312,10 @@ void simple_net(int times = 100) { // create convolution primitive and add it to net net.push_back(convolution_forward(conv2_prim_desc)); - net_args.push_back({ { MKLDNN_ARG_SRC, conv2_src_memory }, - { MKLDNN_ARG_WEIGHTS, conv2_weights_memory }, - { MKLDNN_ARG_BIAS, conv2_user_bias_memory }, - { MKLDNN_ARG_DST, conv2_dst_memory } }); + net_args.push_back({ { DNNL_ARG_SRC, conv2_src_memory }, + { DNNL_ARG_WEIGHTS, conv2_weights_memory }, + { DNNL_ARG_BIAS, conv2_user_bias_memory }, + { DNNL_ARG_DST, conv2_dst_memory } }); // AlexNet: relu2 // {batch, 256, 27, 27} -> {batch, 256, 27, 27} @@ -328,8 +328,8 @@ void simple_net(int times = 100) { auto relu2_prim_desc = eltwise_forward::primitive_desc(relu2_desc, eng); net.push_back(eltwise_forward(relu2_prim_desc)); - net_args.push_back({ { MKLDNN_ARG_SRC, conv2_dst_memory }, - { MKLDNN_ARG_DST, conv2_dst_memory } }); + net_args.push_back({ { DNNL_ARG_SRC, conv2_dst_memory }, + { DNNL_ARG_DST, conv2_dst_memory } }); // AlexNet: lrn2 // {batch, 256, 27, 27} -> {batch, 256, 27, 27} @@ -349,8 +349,8 @@ void simple_net(int times = 100) { auto lrn2_dst_memory = memory(lrn2_prim_desc.dst_desc(), eng); net.push_back(lrn_forward(lrn2_prim_desc)); - net_args.push_back({ { MKLDNN_ARG_SRC, conv2_dst_memory }, - { MKLDNN_ARG_DST, lrn2_dst_memory } }); + net_args.push_back({ { DNNL_ARG_SRC, conv2_dst_memory }, + { DNNL_ARG_DST, lrn2_dst_memory } }); // AlexNet: pool2 // {batch, 256, 27, 27} -> {batch, 256, 13, 13} @@ -372,8 +372,8 @@ void simple_net(int times = 100) { // create pooling primitive an add it to net net.push_back(pooling_forward(pool2_pd)); - net_args.push_back({ { MKLDNN_ARG_SRC, lrn2_dst_memory }, - { MKLDNN_ARG_DST, pool2_dst_memory } }); + net_args.push_back({ { DNNL_ARG_SRC, 
lrn2_dst_memory }, + { DNNL_ARG_DST, pool2_dst_memory } }); // AlexNet: conv3 // {batch, 256, 13, 13} (x) {384, 256, 3, 3}; -> {batch, 384, 13, 13}; @@ -412,8 +412,8 @@ void simple_net(int times = 100) { if (conv3_prim_desc.src_desc() != conv3_src_memory.get_desc()) { conv3_src_memory = memory(conv3_prim_desc.src_desc(), eng); net.push_back(reorder(pool2_dst_memory, conv3_src_memory)); - net_args.push_back({ { MKLDNN_ARG_FROM, pool2_dst_memory }, - { MKLDNN_ARG_TO, conv3_src_memory } }); + net_args.push_back({ { DNNL_ARG_FROM, pool2_dst_memory }, + { DNNL_ARG_TO, conv3_src_memory } }); } auto conv3_weights_memory = conv3_user_weights_memory; @@ -428,10 +428,10 @@ void simple_net(int times = 100) { // create convolution primitive and add it to net net.push_back(convolution_forward(conv3_prim_desc)); - net_args.push_back({ { MKLDNN_ARG_SRC, conv3_src_memory }, - { MKLDNN_ARG_WEIGHTS, conv3_weights_memory }, - { MKLDNN_ARG_BIAS, conv3_user_bias_memory }, - { MKLDNN_ARG_DST, conv3_dst_memory } }); + net_args.push_back({ { DNNL_ARG_SRC, conv3_src_memory }, + { DNNL_ARG_WEIGHTS, conv3_weights_memory }, + { DNNL_ARG_BIAS, conv3_user_bias_memory }, + { DNNL_ARG_DST, conv3_dst_memory } }); // AlexNet: relu3 // {batch, 384, 13, 13} -> {batch, 384, 13, 13} @@ -444,8 +444,8 @@ void simple_net(int times = 100) { auto relu3_prim_desc = eltwise_forward::primitive_desc(relu3_desc, eng); net.push_back(eltwise_forward(relu3_prim_desc)); - net_args.push_back({ { MKLDNN_ARG_SRC, conv3_dst_memory }, - { MKLDNN_ARG_DST, conv3_dst_memory } }); + net_args.push_back({ { DNNL_ARG_SRC, conv3_dst_memory }, + { DNNL_ARG_DST, conv3_dst_memory } }); // AlexNet: conv4 // {batch, 384, 13, 13} (x) {2, 192, 192, 3, 3}; -> @@ -485,8 +485,8 @@ void simple_net(int times = 100) { if (conv4_prim_desc.src_desc() != conv4_src_memory.get_desc()) { conv4_src_memory = memory(conv4_prim_desc.src_desc(), eng); net.push_back(reorder(conv3_dst_memory, conv4_src_memory)); - net_args.push_back({ { MKLDNN_ARG_FROM, conv3_dst_memory }, - { MKLDNN_ARG_TO, conv4_src_memory } }); + net_args.push_back({ { DNNL_ARG_FROM, conv3_dst_memory }, + { DNNL_ARG_TO, conv4_src_memory } }); } auto conv4_weights_memory = conv4_user_weights_memory; @@ -501,10 +501,10 @@ void simple_net(int times = 100) { // create convolution primitive and add it to net net.push_back(convolution_forward(conv4_prim_desc)); - net_args.push_back({ { MKLDNN_ARG_SRC, conv4_src_memory }, - { MKLDNN_ARG_WEIGHTS, conv4_weights_memory }, - { MKLDNN_ARG_BIAS, conv4_user_bias_memory }, - { MKLDNN_ARG_DST, conv4_dst_memory } }); + net_args.push_back({ { DNNL_ARG_SRC, conv4_src_memory }, + { DNNL_ARG_WEIGHTS, conv4_weights_memory }, + { DNNL_ARG_BIAS, conv4_user_bias_memory }, + { DNNL_ARG_DST, conv4_dst_memory } }); // AlexNet: relu4 // {batch, 384, 13, 13} -> {batch, 384, 13, 13} @@ -517,8 +517,8 @@ void simple_net(int times = 100) { auto relu4_prim_desc = eltwise_forward::primitive_desc(relu4_desc, eng); net.push_back(eltwise_forward(relu4_prim_desc)); - net_args.push_back({ { MKLDNN_ARG_SRC, conv4_dst_memory }, - { MKLDNN_ARG_DST, conv4_dst_memory } }); + net_args.push_back({ { DNNL_ARG_SRC, conv4_dst_memory }, + { DNNL_ARG_DST, conv4_dst_memory } }); // AlexNet: conv5 // {batch, 384, 13, 13} (x) {2, 128, 192, 3, 3}; -> {batch, 256, 13, 13}; @@ -557,8 +557,8 @@ void simple_net(int times = 100) { if (conv5_prim_desc.src_desc() != conv5_src_memory.get_desc()) { conv5_src_memory = memory(conv5_prim_desc.src_desc(), eng); net.push_back(reorder(conv4_dst_memory, conv5_src_memory)); - 
net_args.push_back({ { MKLDNN_ARG_FROM, conv4_dst_memory }, - { MKLDNN_ARG_TO, conv5_src_memory } }); + net_args.push_back({ { DNNL_ARG_FROM, conv4_dst_memory }, + { DNNL_ARG_TO, conv5_src_memory } }); } auto conv5_weights_memory = conv5_user_weights_memory; @@ -573,10 +573,10 @@ void simple_net(int times = 100) { // create convolution primitive and add it to net net.push_back(convolution_forward(conv5_prim_desc)); - net_args.push_back({ { MKLDNN_ARG_SRC, conv5_src_memory }, - { MKLDNN_ARG_WEIGHTS, conv5_weights_memory }, - { MKLDNN_ARG_BIAS, conv5_user_bias_memory }, - { MKLDNN_ARG_DST, conv5_dst_memory } }); + net_args.push_back({ { DNNL_ARG_SRC, conv5_src_memory }, + { DNNL_ARG_WEIGHTS, conv5_weights_memory }, + { DNNL_ARG_BIAS, conv5_user_bias_memory }, + { DNNL_ARG_DST, conv5_dst_memory } }); // AlexNet: relu5 // {batch, 256, 13, 13} -> {batch, 256, 13, 13} @@ -589,8 +589,8 @@ void simple_net(int times = 100) { auto relu5_prim_desc = eltwise_forward::primitive_desc(relu5_desc, eng); net.push_back(eltwise_forward(relu5_prim_desc)); - net_args.push_back({ { MKLDNN_ARG_SRC, conv5_dst_memory }, - { MKLDNN_ARG_DST, conv5_dst_memory } }); + net_args.push_back({ { DNNL_ARG_SRC, conv5_dst_memory }, + { DNNL_ARG_DST, conv5_dst_memory } }); // AlexNet: pool5 // {batch, 256, 13, 13} -> {batch, 256, 6, 6} @@ -615,8 +615,8 @@ void simple_net(int times = 100) { // create pooling primitive an add it to net net.push_back(pooling_forward(pool5_pd)); - net_args.push_back({ { MKLDNN_ARG_SRC, conv5_dst_memory }, - { MKLDNN_ARG_DST, pool5_dst_memory } }); + net_args.push_back({ { DNNL_ARG_SRC, conv5_dst_memory }, + { DNNL_ARG_DST, pool5_dst_memory } }); // fc6 inner product {batch, 256, 6, 6} (x) {4096, 256, 6, 6}-> {batch, @@ -651,8 +651,8 @@ void simple_net(int times = 100) { if (fc6_prim_desc.src_desc() != fc6_src_memory.get_desc()) { fc6_src_memory = memory(fc6_prim_desc.src_desc(), eng); net.push_back(reorder(pool5_dst_memory, fc6_src_memory)); - net_args.push_back({ { MKLDNN_ARG_FROM, pool5_dst_memory }, - { MKLDNN_ARG_TO, fc6_src_memory } }); + net_args.push_back({ { DNNL_ARG_FROM, pool5_dst_memory }, + { DNNL_ARG_TO, fc6_src_memory } }); } auto fc6_weights_memory = fc6_user_weights_memory; @@ -666,10 +666,10 @@ void simple_net(int times = 100) { // create convolution primitive and add it to net net.push_back(inner_product_forward(fc6_prim_desc)); - net_args.push_back({ { MKLDNN_ARG_SRC, fc6_src_memory }, - { MKLDNN_ARG_WEIGHTS, fc6_weights_memory }, - { MKLDNN_ARG_BIAS, fc6_user_bias_memory }, - { MKLDNN_ARG_DST, fc6_dst_memory } }); + net_args.push_back({ { DNNL_ARG_SRC, fc6_src_memory }, + { DNNL_ARG_WEIGHTS, fc6_weights_memory }, + { DNNL_ARG_BIAS, fc6_user_bias_memory }, + { DNNL_ARG_DST, fc6_dst_memory } }); // fc7 inner product {batch, 4096} (x) {4096, 4096}-> {batch, 4096} @@ -708,10 +708,10 @@ void simple_net(int times = 100) { // create convolution primitive and add it to net net.push_back(inner_product_forward(fc7_prim_desc)); - net_args.push_back({ { MKLDNN_ARG_SRC, fc6_dst_memory }, - { MKLDNN_ARG_WEIGHTS, fc7_weights_memory }, - { MKLDNN_ARG_BIAS, fc7_user_bias_memory }, - { MKLDNN_ARG_DST, fc7_dst_memory } }); + net_args.push_back({ { DNNL_ARG_SRC, fc6_dst_memory }, + { DNNL_ARG_WEIGHTS, fc7_weights_memory }, + { DNNL_ARG_BIAS, fc7_user_bias_memory }, + { DNNL_ARG_DST, fc7_dst_memory } }); // fc8 inner product {batch, 4096} (x) {1000, 4096}-> {batch, 1000} memory::dims fc8_weights_tz = { 1000, 4096 }; @@ -750,17 +750,17 @@ void simple_net(int times = 100) { // create convolution 
primitive and add it to net net.push_back(inner_product_forward(fc8_prim_desc)); - net_args.push_back({ { MKLDNN_ARG_SRC, fc7_dst_memory }, - { MKLDNN_ARG_WEIGHTS, fc8_weights_memory }, - { MKLDNN_ARG_BIAS, fc8_user_bias_memory }, - { MKLDNN_ARG_DST, fc8_dst_memory } }); + net_args.push_back({ { DNNL_ARG_SRC, fc7_dst_memory }, + { DNNL_ARG_WEIGHTS, fc8_weights_memory }, + { DNNL_ARG_BIAS, fc8_user_bias_memory }, + { DNNL_ARG_DST, fc8_dst_memory } }); // create reorder between internal and user data if it is needed and // add it to net after pooling if (fc8_dst_memory != user_dst_memory) { net.push_back(reorder(fc8_dst_memory, user_dst_memory)); - net_args.push_back({ { MKLDNN_ARG_FROM, fc8_dst_memory }, - { MKLDNN_ARG_TO, user_dst_memory } }); + net_args.push_back({ { DNNL_ARG_FROM, fc8_dst_memory }, + { DNNL_ARG_TO, user_dst_memory } }); } /// @page cpu_cnn_inference_f32_cpp diff --git a/python/jittor/extern/mkl/ops/mkl_conv_backward_w_op.cc b/python/jittor/extern/mkl/ops/mkl_conv_backward_w_op.cc index 117285d7..58a91a22 100644 --- a/python/jittor/extern/mkl/ops/mkl_conv_backward_w_op.cc +++ b/python/jittor/extern/mkl/ops/mkl_conv_backward_w_op.cc @@ -13,9 +13,9 @@ #include "var.h" #include "mkl_conv_backward_w_op.h" -#include +#include -using namespace mkldnn; +using namespace dnnl; using namespace std; namespace jittor { @@ -143,8 +143,8 @@ void MklConvBackwardWOp::jit_run() { if (conv_pd.src_desc() != conv_user_src_memory.get_desc()) { conv_src_memory = memory(conv_pd.src_desc(), eng); net_bwd.push_back(reorder(conv_user_src_memory, conv_src_memory)); - net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_user_src_memory}, - {MKLDNN_ARG_TO, conv_src_memory}}); + net_bwd_args.push_back({{DNNL_ARG_FROM, conv_user_src_memory}, + {DNNL_ARG_TO, conv_src_memory}}); } auto conv_user_diff_dst_memory @@ -169,8 +169,8 @@ void MklConvBackwardWOp::jit_run() { if (conv_bwd_weights_pd.src_desc() != conv_src_memory.get_desc()) { conv_bwd_src_memory = memory(conv_bwd_weights_pd.src_desc(), eng); net_bwd.push_back(reorder(conv_src_memory, conv_bwd_src_memory)); - net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_src_memory}, - {MKLDNN_ARG_TO, conv_bwd_src_memory}}); + net_bwd_args.push_back({{DNNL_ARG_FROM, conv_src_memory}, + {DNNL_ARG_TO, conv_bwd_src_memory}}); } auto conv_diff_dst_memory = conv_user_diff_dst_memory; @@ -178,13 +178,13 @@ void MklConvBackwardWOp::jit_run() { != conv_user_diff_dst_memory.get_desc()) { conv_diff_dst_memory = memory(conv_bwd_weights_pd.diff_dst_desc(), eng); net_bwd.push_back(reorder(conv_user_diff_dst_memory, conv_diff_dst_memory)); - net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_user_diff_dst_memory}, - {MKLDNN_ARG_TO, conv_diff_dst_memory}}); + net_bwd_args.push_back({{DNNL_ARG_FROM, conv_user_diff_dst_memory}, + {DNNL_ARG_TO, conv_diff_dst_memory}}); } net_bwd.push_back(convolution_backward_weights(conv_bwd_weights_pd)); - net_bwd_args.push_back({{MKLDNN_ARG_SRC, conv_bwd_src_memory}, - {MKLDNN_ARG_DIFF_DST, conv_diff_dst_memory}}); + net_bwd_args.push_back({{DNNL_ARG_SRC, conv_bwd_src_memory}, + {DNNL_ARG_DIFF_DST, conv_diff_dst_memory}}); auto conv_diff_weights_memory = conv_user_diff_weights_memory; if (conv_bwd_weights_pd.diff_weights_desc() @@ -192,15 +192,15 @@ void MklConvBackwardWOp::jit_run() { conv_diff_weights_memory = memory(conv_bwd_weights_pd.diff_weights_desc(), eng); net_bwd_args.back().insert( - {MKLDNN_ARG_DIFF_WEIGHTS, conv_diff_weights_memory}); + {DNNL_ARG_DIFF_WEIGHTS, conv_diff_weights_memory}); net_bwd.push_back(reorder( conv_diff_weights_memory, 
conv_user_diff_weights_memory)); - net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_diff_weights_memory}, - {MKLDNN_ARG_TO, conv_user_diff_weights_memory}}); + net_bwd_args.push_back({{DNNL_ARG_FROM, conv_diff_weights_memory}, + {DNNL_ARG_TO, conv_user_diff_weights_memory}}); } else { net_bwd_args.back().insert( - {MKLDNN_ARG_DIFF_WEIGHTS, conv_diff_weights_memory}); + {DNNL_ARG_DIFF_WEIGHTS, conv_diff_weights_memory}); } ASSERTop(net_bwd.size(),==,net_bwd_args.size()); diff --git a/python/jittor/extern/mkl/ops/mkl_conv_backward_x_op.cc b/python/jittor/extern/mkl/ops/mkl_conv_backward_x_op.cc index 201524b6..62a5518f 100644 --- a/python/jittor/extern/mkl/ops/mkl_conv_backward_x_op.cc +++ b/python/jittor/extern/mkl/ops/mkl_conv_backward_x_op.cc @@ -13,9 +13,9 @@ #include "var.h" #include "mkl_conv_backward_x_op.h" -#include +#include -using namespace mkldnn; +using namespace dnnl; using namespace std; namespace jittor { @@ -142,8 +142,8 @@ void MklConvBackwardXOp::jit_run() { conv_weights_memory = memory(conv_pd.weights_desc(), eng); net_bwd.push_back( reorder(conv_user_weights_memory, conv_weights_memory)); - net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_user_weights_memory}, - {MKLDNN_ARG_TO, conv_weights_memory}}); + net_bwd_args.push_back({{DNNL_ARG_FROM, conv_user_weights_memory}, + {DNNL_ARG_TO, conv_weights_memory}}); } auto conv_user_diff_dst_memory @@ -168,21 +168,21 @@ void MklConvBackwardXOp::jit_run() { != conv_user_diff_dst_memory.get_desc()) { conv_diff_dst_memory = memory(conv_bwd_data_pd.diff_dst_desc(), eng); net_bwd.push_back(reorder(conv_user_diff_dst_memory, conv_diff_dst_memory)); - net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_user_diff_dst_memory}, - {MKLDNN_ARG_TO, conv_diff_dst_memory}}); + net_bwd_args.push_back({{DNNL_ARG_FROM, conv_user_diff_dst_memory}, + {DNNL_ARG_TO, conv_diff_dst_memory}}); } auto conv_bwd_weights_memory = conv_weights_memory; if (conv_bwd_data_pd.weights_desc() != conv_weights_memory.get_desc()) { conv_bwd_weights_memory = memory(conv_bwd_data_pd.weights_desc(), eng); net_bwd.push_back(reorder(conv_weights_memory, conv_bwd_weights_memory)); - net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_weights_memory}, - {MKLDNN_ARG_TO, conv_bwd_weights_memory}}); + net_bwd_args.push_back({{DNNL_ARG_FROM, conv_weights_memory}, + {DNNL_ARG_TO, conv_bwd_weights_memory}}); } net_bwd.push_back(convolution_backward_data(conv_bwd_data_pd)); - net_bwd_args.push_back({{MKLDNN_ARG_WEIGHTS, conv_bwd_weights_memory}, - {MKLDNN_ARG_DIFF_DST, conv_diff_dst_memory}}); + net_bwd_args.push_back({{DNNL_ARG_WEIGHTS, conv_bwd_weights_memory}, + {DNNL_ARG_DIFF_DST, conv_diff_dst_memory}}); auto conv_diff_src_memory = conv_user_diff_src_memory; if (conv_bwd_data_pd.diff_src_desc() @@ -190,15 +190,15 @@ void MklConvBackwardXOp::jit_run() { conv_diff_src_memory = memory(conv_bwd_data_pd.diff_src_desc(), eng); net_bwd_args.back().insert( - {MKLDNN_ARG_DIFF_SRC, conv_diff_src_memory}); + {DNNL_ARG_DIFF_SRC, conv_diff_src_memory}); net_bwd.push_back(reorder( conv_diff_src_memory, conv_user_diff_src_memory)); - net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_diff_src_memory}, - {MKLDNN_ARG_TO, conv_user_diff_src_memory}}); + net_bwd_args.push_back({{DNNL_ARG_FROM, conv_diff_src_memory}, + {DNNL_ARG_TO, conv_user_diff_src_memory}}); } else { net_bwd_args.back().insert( - {MKLDNN_ARG_DIFF_SRC, conv_diff_src_memory}); + {DNNL_ARG_DIFF_SRC, conv_diff_src_memory}); } ASSERTop(net_bwd.size(),==,net_bwd_args.size()); diff --git a/python/jittor/extern/mkl/ops/mkl_conv_op.cc 
b/python/jittor/extern/mkl/ops/mkl_conv_op.cc index fb9c87e3..b8ddc0da 100644 --- a/python/jittor/extern/mkl/ops/mkl_conv_op.cc +++ b/python/jittor/extern/mkl/ops/mkl_conv_op.cc @@ -7,12 +7,12 @@ // This file is subject to the terms and conditions defined in // file 'LICENSE.txt', which is part of this source code package. // *************************************************************** -#include +#include #include "var.h" #include "mkl_conv_op.h" -using namespace mkldnn; +using namespace dnnl; using namespace std; namespace jittor { @@ -110,7 +110,7 @@ void MklConvOp::jit_run() { auto n = ws[3]; auto k = xs[3]; // x: [m,k], w: [k,n], y: [m,n] - ASSERTop(0,==,mkldnn_sgemm('N', 'N', m, n, k, + ASSERTop(0,==,dnnl_sgemm('N', 'N', m, n, k, 1.f, x->ptr(), k, w->ptr(), n, 0.f, y->ptr(), n)); @@ -162,27 +162,27 @@ void MklConvOp::jit_run() { if (conv1_prim_desc.src_desc() != user_src_memory.get_desc()) { conv1_src_memory = memory(conv1_prim_desc.src_desc(), eng); net.push_back(reorder(user_src_memory, conv1_src_memory)); - net_args.push_back({ { MKLDNN_ARG_FROM, user_src_memory }, - { MKLDNN_ARG_TO, conv1_src_memory } }); + net_args.push_back({ { DNNL_ARG_FROM, user_src_memory }, + { DNNL_ARG_TO, conv1_src_memory } }); } auto conv1_weights_memory = user_weights_memory; if (conv1_prim_desc.weights_desc() != user_weights_memory.get_desc()) { conv1_weights_memory = memory(conv1_prim_desc.weights_desc(), eng); net.push_back(reorder(user_weights_memory, conv1_weights_memory)); - net_args.push_back({ { MKLDNN_ARG_FROM, user_weights_memory }, { MKLDNN_ARG_TO, conv1_weights_memory } }); + net_args.push_back({ { DNNL_ARG_FROM, user_weights_memory }, { DNNL_ARG_TO, conv1_weights_memory } }); } auto conv1_dst_memory = memory(conv1_prim_desc.dst_desc(), eng); net.push_back(convolution_forward(conv1_prim_desc)); - net_args.push_back({ { MKLDNN_ARG_SRC, conv1_src_memory }, - { MKLDNN_ARG_WEIGHTS, conv1_weights_memory }, - { MKLDNN_ARG_DST, conv1_dst_memory } }); + net_args.push_back({ { DNNL_ARG_SRC, conv1_src_memory }, + { DNNL_ARG_WEIGHTS, conv1_weights_memory }, + { DNNL_ARG_DST, conv1_dst_memory } }); if (conv1_dst_memory != user_dst_memory) { net.push_back(reorder(conv1_dst_memory, user_dst_memory)); - net_args.push_back({ { MKLDNN_ARG_FROM, conv1_dst_memory },{ MKLDNN_ARG_TO, user_dst_memory } }); + net_args.push_back({ { DNNL_ARG_FROM, conv1_dst_memory },{ DNNL_ARG_TO, user_dst_memory } }); } ASSERTop(net.size(),==,net_args.size()); diff --git a/python/jittor/extern/mkl/ops/mkl_matmul_op.cc b/python/jittor/extern/mkl/ops/mkl_matmul_op.cc index a35ac53b..2a29f1ad 100644 --- a/python/jittor/extern/mkl/ops/mkl_matmul_op.cc +++ b/python/jittor/extern/mkl/ops/mkl_matmul_op.cc @@ -7,12 +7,12 @@ // This file is subject to the terms and conditions defined in // file 'LICENSE.txt', which is part of this source code package. // *************************************************************** -#include +#include #include "var.h" #include "mkl_matmul_op.h" -using namespace mkldnn; +using namespace dnnl; using namespace std; namespace jittor { @@ -66,7 +66,7 @@ void MklMatmulOp::jit_run() { k = bs[0]; } // a: [n,m], b: [m,k], c: [n,k] - ASSERTop(0,==,mkldnn_sgemm('@Trans_a', '@Trans_b', n, k, m, + ASSERTop(0,==,dnnl_sgemm('@Trans_a', '@Trans_b', n, k, m, 1.f, a->ptr(), '@Trans_a'=='N'? m : n, b->ptr(), '@Trans_b' == 'N' ? 
k : m, 0.f, c->ptr(), k)); diff --git a/python/jittor/init.py b/python/jittor/init.py index 9d74f121..39a1215e 100644 --- a/python/jittor/init.py +++ b/python/jittor/init.py @@ -706,26 +706,26 @@ def _no_grad_trunc_normal_(var, mean, std, a, b): "The distribution of values may be incorrect.", stacklevel=2) - with jt.no_grad(): - # Values are generated by using a truncated uniform distribution and - # then using the inverse CDF for the normal distribution. - # Get upper and lower cdf values - l = norm_cdf((a - mean) / std) - u = norm_cdf((b - mean) / std) - # Uniformly fill tensor with values from [l, u], then translate to - # [2l-1, 2u-1]. - # var.uniform(2 * l - 1, 2 * u - 1) - jt.init.uniform_(var, low=2 * l - 1, high=2 * u - 1) + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) - # Use inverse cdf transform for normal distribution to get truncated - # standard normal - var.erfinv() + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + # var.uniform(2 * l - 1, 2 * u - 1) + var.uniform_(low=2 * l - 1, high=2 * u - 1) - # Transform to proper mean, std - var.multiply(std * math.sqrt(2.)) - var.add(mean) + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + var = var.erfinv() - # Clamp to ensure it's in the proper range - var.clamp(min_v=a, max_v=b) - return var \ No newline at end of file + # Transform to proper mean, std + var = var.multiply(std * math.sqrt(2.)) + var = var.add(mean) + + # Clamp to ensure it's in the proper range + var = var.clamp(min_v=a, max_v=b) + return var \ No newline at end of file diff --git a/python/jittor/misc.py b/python/jittor/misc.py index ba7e89ae..fc977581 100644 --- a/python/jittor/misc.py +++ b/python/jittor/misc.py @@ -488,18 +488,11 @@ def arctan2(y,x): angle = jt.zeros(x.shape,dtype=x.dtype) x = (x!=0.0).ternary(x, x+1e-30) angle = (y/x).arctan() - - mask = (y<0) & (x<0) - if angle[mask].numel()>0: - angle[mask] -= np.pi - - mask = (y>=0) &(x<0) - if angle[mask].numel()>0: - angle[mask] +=np.pi + mask = y<0 | ((y==0) & (x<0)) + angle = angle + mask*np.pi return angle - def nonzero(x): r''' Return the index of the elements of input tensor which are not equal to zero. 
diff --git a/python/jittor/models/resnet.py b/python/jittor/models/resnet.py index 1ef54d65..3aa60928 100644 --- a/python/jittor/models/resnet.py +++ b/python/jittor/models/resnet.py @@ -143,7 +143,7 @@ class ResNet(nn.Module): x = self.layer2(x) x = self.layer3(x) x = self.layer4(x) - x = self.avgpool(x) + x = self.avgpool(x).float_auto() x = jt.reshape(x, (x.shape[0], -1)) x = self.fc(x) return x diff --git a/python/jittor/nn.py b/python/jittor/nn.py index cc486220..072feb45 100644 --- a/python/jittor/nn.py +++ b/python/jittor/nn.py @@ -37,9 +37,10 @@ def matmul_transpose(a, b): assert len(a.shape) == 2 and len(b.shape) == 2 shape = list(a.shape)[:-1] + list(b.shape) - a = a.broadcast(shape, [len(shape)-2]) - b = b.broadcast(shape) - return (a*b).sum(len(shape)-1) + with jt.flag_scope(amp_reg = jt.flags.amp_reg | 4): + a = a.broadcast(shape, [len(shape)-2]) + b = b.broadcast(shape) + return (a*b).sum(len(shape)-1) def bmm_transpose(a, b): @@ -108,47 +109,48 @@ Example:: c = jt.matmul(a, b) assert c.shape == [8, 10, 3, 5] ''' - len_a = len(a.shape) - len_b = len(b.shape) - if len_b == 1: - # a: [n, m], b:[m], c:[n] - return (a*b).sum(-1) - if len_a == 1: - # a: [n], b:[n,k], c:[k] - return (a.broadcast(b, [-1]) * b).sum(0) - if len_a>=3 and len_a==len_b: - # bmm - # a: [..., n, m], b: [..., m, k], c:[..., n, k] - if jt.flags.use_cuda and jt.compile_extern.cublas_ops: - return jt.compile_extern.cublas_ops.cublas_batched_matmul(a, b, 0, 0) - shape = [] - len_c = max(len_a, len_b) - (n, m), (m_, k) = a.shape[-2:], b.shape[-2:] - assert m == m_, f"dimension not match, a.shape:{a.shape}, b.shape:{b.shape}" - # a: [..., n, m] - # b: [..., m, k] - # cc:[..., n, m, k] - # --> - # 012 - if len_b == 2 and len_a>2: - # TODO:ugly implementation for tuner - aa = a.reshape((-1, m)) - cc = matmul(aa, b) - # print(a.shape, b.shape, cc.shape) - return cc.reshape(a.shape[:-1] + [k]) - for i in range(len_c-2): - ai = len_a-(len_c-i) - bi = len_b-(len_c-i) - an = a.shape[ai] if ai>=0 else 1 - bn = b.shape[bi] if bi>=0 else 1 - if an!=1 and bn!=1: - assert an == bn, f"dimension not match, a.shape:{a.shape}, b.shape:{b.shape}" - cn = max(an, bn) - shape.append(cn) - shape.extend([n, m, k]) - a = a.broadcast(shape, [-1]) - b = b.broadcast(shape, [-3]) - return (a*b).sum(-2) + with jt.flag_scope(amp_reg = jt.flags.amp_reg | 4): + len_a = len(a.shape) + len_b = len(b.shape) + if len_b == 1: + # a: [n, m], b:[m], c:[n] + return (a*b).sum(-1) + if len_a == 1: + # a: [n], b:[n,k], c:[k] + return (a.broadcast(b, [-1]) * b).sum(0) + if len_a>=3 and len_a==len_b: + # bmm + # a: [..., n, m], b: [..., m, k], c:[..., n, k] + if jt.flags.use_cuda and jt.compile_extern.cublas_ops: + return jt.compile_extern.cublas_ops.cublas_batched_matmul(a, b, 0, 0) + shape = [] + len_c = max(len_a, len_b) + (n, m), (m_, k) = a.shape[-2:], b.shape[-2:] + assert m == m_, f"dimension not match, a.shape:{a.shape}, b.shape:{b.shape}" + # a: [..., n, m] + # b: [..., m, k] + # cc:[..., n, m, k] + # --> + # 012 + if len_b == 2 and len_a>2: + # TODO:ugly implementation for tuner + aa = a.reshape((-1, m)) + cc = matmul(aa, b) + # print(a.shape, b.shape, cc.shape) + return cc.reshape(a.shape[:-1] + [k]) + for i in range(len_c-2): + ai = len_a-(len_c-i) + bi = len_b-(len_c-i) + an = a.shape[ai] if ai>=0 else 1 + bn = b.shape[bi] if bi>=0 else 1 + if an!=1 and bn!=1: + assert an == bn, f"dimension not match, a.shape:{a.shape}, b.shape:{b.shape}" + cn = max(an, bn) + shape.append(cn) + shape.extend([n, m, k]) + a = a.broadcast(shape, [-1]) + b = 
b.broadcast(shape, [-3]) + return (a*b).sum(-2) jt.Var.matmul = jt.Var.__matmul__ = matmul jt.Var.__imatmul__ = lambda a,b: a.assign(matmul(a,b)) @@ -488,19 +490,22 @@ class BCEWithLogitsLoss(Module): def execute(self, output, target): return binary_cross_entropy_with_logits(output,target,self.weight,self.pos_weight,self.size_average) -def softmax(x, dim = None): +def softmax(x, dim=None, log=False): + import jittor.other.code_softmax as code_softmax + if code_softmax.can_softmax_v1(x, dim): + return code_softmax.softmax_v1(x, log) if dim is None: x = (x - x.max()).exp() ret = x / x.sum() else: x = (x-x.max(dim, keepdims=True)).exp() ret = x / x.sum(dim, keepdims=True) + if log: return ret.log() return ret jt.Var.softmax = softmax def log_softmax(x,dim=None): - x = softmax(x,dim=dim) - return jt.log(x) + return softmax(x,dim=dim, log=True) jt.Var.log_softmax = log_softmax def log_sigmoid(x): @@ -829,15 +834,16 @@ class Conv(Module): oh = (H+self.padding[0]*2-Kh*self.dilation[0]+self.dilation[0]-1)//self.stride[0]+1 ow = (W+self.padding[1]*2-Kw*self.dilation[1]+self.dilation[1]-1)//self.stride[1]+1 assert oh>0 and ow>0 - xx = x.reindex([N,self.out_channels,C,oh,ow,Kh,Kw], [ - 'i0', # Nid - 'i2', # Cid - f'i3*{self.stride[0]}-{self.padding[0]}+i5*{self.dilation[0]}', # Hid+Khid - f'i4*{self.stride[1]}-{self.padding[1]}+i6*{self.dilation[1]}', # Wid+KWid - ]) - ww = self.weight.broadcast(xx.shape, [0,3,4]) - yy = xx*ww - y = yy.sum([2,5,6]) # Kc, Kh, Kw + with jt.flag_scope(amp_reg = jt.flags.amp_reg | 4): + xx = x.reindex([N,self.out_channels,C,oh,ow,Kh,Kw], [ + 'i0', # Nid + 'i2', # Cid + f'i3*{self.stride[0]}-{self.padding[0]}+i5*{self.dilation[0]}', # Hid+Khid + f'i4*{self.stride[1]}-{self.padding[1]}+i6*{self.dilation[1]}', # Wid+KWid + ]) + ww = self.weight.broadcast(xx.shape, [0,3,4]) + yy = xx*ww + y = yy.sum([2,5,6]) # Kc, Kh, Kw if self.bias is not None: b = self.bias.broadcast(y.shape, [0,2,3]) y = y + b @@ -1005,6 +1011,18 @@ class Conv3d(Module): def execute(self, x): return conv3d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) + +class Conv1d_sp(Linear): + def __init__(self, inchannels, outchannels, kernel_size=1, bias=True): + super().__init__(inchannels, outchannels, bias=bias) + assert kernel_size == 1 + + def execute(self, x): + x = x.transpose(0, 2, 1) + x = super().execute(x) + x = x.transpose(0, 2, 1) + return x + def conv2d(x, weight, bias=None, stride=1, padding=0, dilation=1, groups=1): ''' Applies a 2D convolution over an input signal composed of several input planes. 
@@ -1045,15 +1063,16 @@ def conv2d(x, weight, bias=None, stride=1, padding=0, dilation=1, groups=1): Kh, Kw = weight.shape[-2:] oh = (H+padding[0]*2-Kh*dilation[0]+dilation[0]-1)//stride[0]+1 ow = (W+padding[1]*2-Kw*dilation[1]+dilation[1]-1)//stride[1]+1 - xx = x.reindex([N,out_channels,C,oh,ow,Kh,Kw], [ - 'i0', # Nid - 'i2', # Cid - f'i3*{stride[0]}-{padding[0]}+i5*{dilation[0]}', # Hid+Khid - f'i4*{stride[1]}-{padding[1]}+i6*{dilation[1]}', # Wid+KWid - ]) - ww = weight.broadcast(xx.shape, [0,3,4]) - yy = xx*ww - y = yy.sum([2,5,6]) # Kc, Kh, Kw + with jt.flag_scope(amp_reg = jt.flags.amp_reg | 4): + xx = x.reindex([N,out_channels,C,oh,ow,Kh,Kw], [ + 'i0', # Nid + 'i2', # Cid + f'i3*{stride[0]}-{padding[0]}+i5*{dilation[0]}', # Hid+Khid + f'i4*{stride[1]}-{padding[1]}+i6*{dilation[1]}', # Wid+KWid + ]) + ww = weight.broadcast(xx.shape, [0,3,4]) + yy = xx*ww + y = yy.sum([2,5,6]) # Kc, Kh, Kw if bias is not None: b = bias.broadcast(y.shape, [0,2,3]) y = y + b diff --git a/python/jittor/notebook/md_to_ipynb.py b/python/jittor/notebook/md_to_ipynb.py index 61302e49..175b50f8 100644 --- a/python/jittor/notebook/md_to_ipynb.py +++ b/python/jittor/notebook/md_to_ipynb.py @@ -1,7 +1,7 @@ #!python3 import os, json -from pathlib import Path -notebook_dir = os.path.join(str(Path.home()), ".cache","jittor","notebook") +import jittor_utils as jit_utils +notebook_dir = os.path.join(jit_utils.home(), ".cache","jittor","notebook") if not os.path.isdir(notebook_dir): os.mkdir(notebook_dir) dirname = os.path.dirname(__file__) diff --git a/python/jittor/other/code_softmax.py b/python/jittor/other/code_softmax.py new file mode 100644 index 00000000..837bd648 --- /dev/null +++ b/python/jittor/other/code_softmax.py @@ -0,0 +1,130 @@ +import jittor as jt +from jittor import nn + +def can_softmax_v1(a, dim): + if not jt.flags.use_cuda: + return False + if dim != -1 and dim != len(a.shape)-1: + return False + if a.shape[len(a.shape)-1] > 10000: + return False + return True + +def softmax_v1(a, log=False): + assert can_softmax_v1(a, -1) + length = a.shape[-1] + # tnum = 1024 + tnum = 500 if length % 500 == 0 else 512 + tnum = 125 if length % 125 == 0 else 128 + # tnum = 125 + # tnum = 1000 if length % 1000 == 0 else 1024 + # tnum = 250 + per_thread = (length-1) // tnum + 1 + ILP = 1 + for ilp in [8,4,2]: + if length % tnum == 0 and per_thread % ilp == 0: + ILP = ilp + per_thread //= ILP + break + for_loop = f""" + #pragma unroll + for (int i=0; i<{per_thread}; i++) + """ + if length % tnum != 0: + for_loop += f"if ((i*{tnum}+threadIdx.x)*{ILP} < len)\n" + + return jt.code(a.shape, a.dtype, [a], cuda_header=f''' +#include <{jt.compile_extern.cub_home}cub/cub.cuh> +#include +''', cuda_src=f''' +__global__ void kernel(in0_type* x, out0_type* y, int len) {{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + int id = blockIdx.x * len; + in0_type v[{per_thread}][{ILP}]; + {for_loop} + vload(v[i], &x[id+(i*{tnum}+threadIdx.x)*{ILP}]); + // v[i] = x[id+i*{tnum}+threadIdx.x]; + float v1 = -1e30; + {for_loop} + #pragma unroll + for (int j=0; j<{ILP}; j++) {{ + v1 = max(v1, float(v[i][j])); + }} + __shared__ float vmax; + auto tmp = BlockReduce(temp_storage).Reduce(v1, cub::Max()); + if (threadIdx.x == 0) + vmax = tmp; + __syncthreads(); + + v1 = 0; + {for_loop} + #pragma unroll + for (int j=0; j<{ILP}; j++) {{ + v[i][j] = expf(float(v[i][j]) - vmax); + v1 += float(v[i][j]); + }} + + tmp = BlockReduce(temp_storage).Sum(v1); + __shared__ float vsum; + if (threadIdx.x == 
0) + vsum = tmp; + __syncthreads(); + + {for_loop} + #pragma unroll + for (int j=0; j<{ILP}; j++) + v[i][j] = { + "@expand_op(log,@in0_type,float(v[i][j])/vsum)" if log + else "float(v[i][j])/vsum" + }; + {for_loop} + vload(&y[id+(i*{tnum}+threadIdx.x)*{ILP}], v[i]); +}} +int len = in0->shape[in0->shape.size()-1]; +int bnum = in0->numel() / len; +cudaGetLastError(); +kernel<<>>(in0_p, out0_p, len); +CHECK(0 == cudaGetLastError()); +''', cuda_grad_src=[f""" +__global__ void kernel(pout0_type* x, dout_type* y, out0_type* z, int len) {{ + int id = blockIdx.x * len; + in0_type vx[{per_thread}][{ILP}]; + in0_type vy[{per_thread}][{ILP}]; + {for_loop} {{ + vload(vx[i], &x[id+(i*{tnum}+threadIdx.x)*{ILP}]); + vload(vy[i], &y[id+(i*{tnum}+threadIdx.x)*{ILP}]); + }} + float v1 = 0; + {for_loop} + #pragma unroll + for (int j=0; j<{ILP}; j++) + v1 += {"float(vy[i][j]);" if log else "float(vx[i][j]*vy[i][j]);"} + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + auto tmp = BlockReduce(temp_storage).Sum(v1); + __shared__ float reduce_var; + if (threadIdx.x == 0) + reduce_var = tmp; + __syncthreads(); + + {for_loop} + #pragma unroll + for (int j=0; j<{ILP}; j++) + vx[i][j] = { + "vy[i][j] - in0_type(expf(vx[i][j]) * reduce_var);" if log + else "vx[i][j] * (vy[i][j] - in0_type(reduce_var));" + } + + {for_loop} + vload(&z[id+(i*{tnum}+threadIdx.x)*{ILP}], + vx[i]); +}} +int len = in0->shape[in0->shape.size()-1]; +int bnum = in0->numel() / len; +cudaGetLastError(); +kernel<<>>(pout0_p, dout_p, out0_p, len); +CHECK(0 == cudaGetLastError()); +"""]) \ No newline at end of file diff --git a/python/jittor/pool.py b/python/jittor/pool.py index 150aff68..fd3c681c 100644 --- a/python/jittor/pool.py +++ b/python/jittor/pool.py @@ -60,14 +60,14 @@ class Pool(Module): ''' if not self.return_indices: forward_body += f''' - @out(i0, i1, i2, i3) = init_{self.op}(out_type); + @out(i0, i1, i2, i3) = @expand_op(init_{self.op}, @out_type); for (int p = k2; p < k2_; ++p) for (int q = k3; q < k3_; ++q) - @out(i0, i1, i2, i3) = {self.op}(out_type, @out(i0, i1, i2, i3), @in0(i0, i1, p, q)); + @out(i0, i1, i2, i3) = @expand_op({self.op}, @out_type, @out(i0, i1, i2, i3), @out_type, @in0(i0, i1, p, q), @in0_type); ''' else: forward_body += f''' - auto out_value = init_{self.op}(out_type); + auto out_value = @expand_op(init_{self.op}, @out_type); int out_index = -1; for (int p = k2; p < k2_; ++p) for (int q = k3; q < k3_; ++q) @@ -105,7 +105,6 @@ class Pool(Module): return_dtypes = x.dtype out = jt.code(return_shapes, return_dtypes, [x], cuda_header=""" - #include #include """, cuda_src=f''' @@ -121,8 +120,8 @@ class Pool(Module): for (int i2 = p2; i2 < out_shape2; i2 += s2) {{ {forward_body} }} }} - int tx = min(1024, out_shape3); - int ty = min(1024 / tx, out_shape2); + int tx = std::min(1024, out_shape3); + int ty = std::min(1024 / tx, out_shape2); int bx = (out_shape2 - 1) / ty + 1; int by = out_shape1; int bz = out_shape0; @@ -144,8 +143,8 @@ class Pool(Module): {{ {backward_body} }} }} cudaMemsetAsync(out_p, 0, out->size); - int tx = min(1024, pout_shape3); - int ty = min(1024 / tx, pout_shape2); + int tx = std::min(1024, pout_shape3); + int ty = std::min(1024 / tx, pout_shape2); int bx = (pout_shape2 - 1) / ty + 1; int by = pout_shape1; int bz = pout_shape0; @@ -153,7 +152,7 @@ class Pool(Module): dim3 s2_(tx, ty); kernel3<<>>(@ARGS); '''], - cpu_header='#include ', + cpu_header='', cpu_src=f''' using namespace std; for (int i0=0; i0 #include """, cuda_src=f''' @@ -312,9 
+310,9 @@ class Pool3d(Module): for (int i2 = p2; i2 < out_shape2; i2 += s2) {{ {forward_body} }} }} - int tx = min(1024, out_shape4); - int ty = min(1024 / tx, out_shape3); - int tz = min(1024 / tx / ty, out_shape2); + int tx = std::min(1024, out_shape4); + int ty = std::min(1024 / tx, out_shape3); + int tz = std::min(1024 / tx / ty, out_shape2); int bx = (out_shape2 - 1) / tz + 1; int by = out_shape1; int bz = out_shape0; @@ -339,9 +337,9 @@ class Pool3d(Module): {{ {backward_body} }} }} cudaMemsetAsync(out_p, 0, out->size); - int tx = min(1024, pout_shape4); - int ty = min(1024 / tx, pout_shape3); - int tz = min(1024 / tx / ty, pout_shape2); + int tx = std::min(1024, pout_shape4); + int ty = std::min(1024 / tx, pout_shape3); + int tz = std::min(1024 / tx / ty, pout_shape2); int bx = (pout_shape2 - 1) / tz + 1; int by = pout_shape1; int bz = pout_shape0; @@ -349,7 +347,7 @@ class Pool3d(Module): dim3 s2(tx, ty, tz); kernel3<<>>(@ARGS); '''], - cpu_header='#include ', + cpu_header='', cpu_src=f''' using namespace std; for (int i0=0; i0 vars, bool device_sync) { var->alloc(cpu_allocator); } } + } else { + for (Var* v : op->inputs()) { + if (!v->allocator->is_cuda()) + migrate_to_gpu(v, allocator); + } } #endif #ifdef NODE_MEMCHECK diff --git a/python/jittor/src/executor.h b/python/jittor/src/executor.h index 8e37c361..dc21d096 100644 --- a/python/jittor/src/executor.h +++ b/python/jittor/src/executor.h @@ -22,6 +22,10 @@ struct Executor { Allocator* temp_allocator; bool last_is_cuda = false; void run_sync(vector vars, bool device_sync); + + inline Allocation alloc_temp(size_t size) { + return Allocation(temp_allocator, size); + } }; EXTERN_LIB Executor exe; diff --git a/python/jittor/src/fused_op.cc b/python/jittor/src/fused_op.cc index 74da506b..924a1113 100644 --- a/python/jittor/src/fused_op.cc +++ b/python/jittor/src/fused_op.cc @@ -9,6 +9,7 @@ #include "op_compiler.h" #include "profiler/profiler.h" #include "misc/fast_shared_ptr.h" +#include "misc/cuda_flags.h" namespace jittor { @@ -42,6 +43,7 @@ void FusedOp::update_ops() { loop_options_tuned.clear(); loop_options = loop_options_origin = nullptr; + _inputs.clear(); _outputs.clear(); for (Op* op : ops) { for (Var* o : op->outputs()) { @@ -101,6 +103,7 @@ void FusedOp::update_ops() { if (!(c&2)) { c += 2 + vars.size()*4; vars.push_back({i, 0}); + _inputs.emplace_back((Node*)i); } } for (Var* o : opi->outputs()) { @@ -135,6 +138,7 @@ FusedOp::FusedOp(const FusedOp& other) { } FusedOp::~FusedOp() { + _inputs.clear(); _outputs.clear(); Op::number_of_lived_ops++; } @@ -159,20 +163,15 @@ void FusedOp::statistics(uint64_t& in, uint64_t& out, uint64_t& compute) { void FusedOp::do_jit_prepare(JK& jk) { jk.clear(); - int8 flags = 3; for (uint i=0; ido_jit_prepare(jk); + jk << op->name(); + op->jit_prepare(jk); jk << JK::end; - if (op->flags.get(NodeFlags::_cpu)) - flags &= 1; // only cpu - else - flags &= 2; // only gpu } - ASSERT(flags) << "FusedOp cannot contain both cpu and cuda ops."; jk << _CS("[JIT:1]"); - if (flags==1) { + if (!use_cuda) { // only cpu jk << _CS("[JIT_cpu:1]"); this->flags.set(NodeFlags::_cuda, 0); @@ -189,9 +188,17 @@ void FusedOp::do_jit_prepare(JK& jk) { jk << JK::hex2(i) << JK::hex1(j) << JK::hex2(k) << JK::hex1(l) << ','; } jk << _CS("][var_info:") << JK::val; - for (auto& vi : vars) + bool use_int64_t = false; + for (auto& vi : vars) { jk << JK::hex1(vi.type) << JK::hex1(vi.var->shape.size()); + if (vi.type != 1 && vi.var->num >= std::numeric_limits::max()) + use_int64_t = true; + } jk << JK::end; + if 
(use_int64_t) + jk << _CS("[index_t:int64]"); + else + jk << _CS("[index_t:int32]"); if (loop_options->size()) { if (get_loop_option("compile_shapes")) { jk << _CS("[shapes:"); diff --git a/python/jittor/src/grad.cc b/python/jittor/src/grad.cc index 34879757..5e891714 100644 --- a/python/jittor/src/grad.cc +++ b/python/jittor/src/grad.cc @@ -39,11 +39,24 @@ template struct StackIniter { #define STACK_ALLOC2(T, a, n) T a[n] #endif +struct AmpGradGuard { + int amp_reg_bk; + AmpGradGuard(Op* op) { + amp_reg_bk = amp_reg; + amp_reg |= (op->flags.flags >> NodeFlags::_prefer_32); + } + + ~AmpGradGuard() { + amp_reg = amp_reg_bk; + } +}; + VarPtr make_grad(Op* op, Var* out, Var* dout, Var* x, int x_index) { if (dout == nullptr) return nullptr; if (x_index<0) return nullptr; LOGvvvv << "Make grad op:" >> op->name() << "inputs:" >> op->inputs() << "out:" >> out << "dout:" >> dout << "x:" >> x << "xid:" >> x_index; + AmpGradGuard agg(op); auto dx = op->grad(out, dout, x, x_index); if (x->loop_options) dx->loop_options = x->loop_options; @@ -182,7 +195,10 @@ vector grad(Var* loss, vector targets) { douts[i] = nullptr; } trace_grad_op = op; - op->grads(douts, dins); + { + AmpGradGuard agg(op); + op->grads(douts, dins); + } // dump "for (Var* in : op->inputs())" for (int i=0; isize); - checkCudaErrors(cudaMemcpy(a.ptr, var->mem_ptr, var->size, cudaMemcpyDefault)); + checkCudaErrors(cudaMemcpy(a.ptr, var->mem_ptr, var->size, cudaMemcpyDeviceToHost)); var->allocator->free(var->mem_ptr, var->size, var->allocation); var->mem_ptr = a.ptr; var->allocation = a.allocation; diff --git a/python/jittor/src/mem/mem_info.cc b/python/jittor/src/mem/mem_info.cc index b042ce2e..eea8b567 100644 --- a/python/jittor/src/mem/mem_info.cc +++ b/python/jittor/src/mem/mem_info.cc @@ -62,7 +62,7 @@ void display_memory_info(const char* fileline, bool dump_var, bool red_color) { log << "\n=== display_memory_info ===\n"; log << "total_cpu_ram:" << FloatOutput{(double)mem_info.total_cpu_ram, " KMG", 1024, "B"}; - log << "total_cuda_ram:" << + log << "total_device_ram:" << FloatOutput{(double)mem_info.total_cuda_ram, " KMG", 1024, "B"} >> "\n"; log << "hold_vars:" << hold_vars.size() << "lived_vars:" << Var::number_of_lived_vars @@ -105,7 +105,7 @@ void display_memory_info(const char* fileline, bool dump_var, bool red_color) { auto total = a->used_memory + a->unused_memory; all_total += total; a->is_cuda() ? gpu_total += total : cpu_total += total; - log << "name:" << a->name() << "is_cuda:" << a->is_cuda() + log << "name:" << a->name() << "is_device:" << a->is_cuda() << "used:" << FloatOutput{(double)a->used_memory, " KMG", 1024, "B"} >> "(" >> std::setprecision(p) >> a->used_memory*100.0 / total >> "%)" << "unused:" << FloatOutput{(double)a->unused_memory, " KMG", 1024, "B"} @@ -117,7 +117,7 @@ void display_memory_info(const char* fileline, bool dump_var, bool red_color) { auto total = a->used_memory + a->unused_memory; all_total += total; a->is_cuda() ? 
gpu_total += total : cpu_total += total; - log << "name:" << a->name() << "is_cuda:" << a->is_cuda() + log << "name:" << a->name() << "is_device:" << a->is_cuda() << "used:" << FloatOutput{(double)a->used_memory, " KMG", 1024, "B"} >> "(" >> std::setprecision(p) >> a->used_memory*100.0 / total >> "%)" << "unused:" << FloatOutput{(double)a->unused_memory, " KMG", 1024, "B"} @@ -227,9 +227,9 @@ MemInfo::MemInfo() { total_cuda_ram = 0; #ifdef HAS_CUDA - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, 0); - total_cuda_ram = prop.totalGlobalMem; + size_t gpu_free = 0, _gpu_total = 0; + cudaMemGetInfo(&gpu_free, &_gpu_total); + total_cuda_ram = _gpu_total; #endif sigquit_callback.push_back(&meminfo_callback); } diff --git a/python/jittor/src/misc/cuda_flags.h b/python/jittor/src/misc/cuda_flags.h index f8aafdaa..61bd4701 100644 --- a/python/jittor/src/misc/cuda_flags.h +++ b/python/jittor/src/misc/cuda_flags.h @@ -24,7 +24,7 @@ inline int get_device_count() { } // jittor -#if CUDART_VERSION < 10000 +#if defined(CUDART_VERSION) && CUDART_VERSION < 10000 #define _cudaLaunchHostFunc(a,b,c) \ cudaStreamAddCallback(a,b,c,0) #define CUDA_HOST_FUNC_ARGS cudaStream_t stream, cudaError_t status, void* diff --git a/python/jittor/src/misc/deleter.h b/python/jittor/src/misc/deleter.h index 8d0239ee..fcf661e9 100644 --- a/python/jittor/src/misc/deleter.h +++ b/python/jittor/src/misc/deleter.h @@ -13,7 +13,8 @@ namespace jittor { struct Deleter { std::function del; inline Deleter(std::function&& func) : del(move(func)) {} - inline ~Deleter() { del(); } + inline Deleter() {} + inline ~Deleter() { if (del) del(); } }; } // jittor diff --git a/python/jittor/src/misc/nan_checker.cc b/python/jittor/src/misc/nan_checker.cc index d7f5a767..b5e6c737 100644 --- a/python/jittor/src/misc/nan_checker.cc +++ b/python/jittor/src/misc/nan_checker.cc @@ -17,7 +17,7 @@ namespace jittor { -#ifdef HAS_CUDA +#ifdef IS_CUDA EXTERN_LIB void check_nan_float32(float32* ptr, int64 num); EXTERN_LIB void check_nan_float64(float64* ptr, int64 num); #endif @@ -28,7 +28,7 @@ bool check_nan(Var* v) { v->input()->name() == string("empty") || v->input()->name() == string("setitem"))) return true; - #ifdef HAS_CUDA + #ifdef IS_CUDA if (v->allocator->is_cuda()) { if (v->dtype() == ns_float32) { check_nan_float32((float32*)v->mem_ptr, v->num); diff --git a/python/jittor/src/misc/nano_string.cc b/python/jittor/src/misc/nano_string.cc index b85bf861..70e01b94 100644 --- a/python/jittor/src/misc/nano_string.cc +++ b/python/jittor/src/misc/nano_string.cc @@ -9,6 +9,17 @@ namespace jittor { +DEFINE_FLAG(int, amp_reg, 0, "Auto mixed-precision control registers, bit 0: prefer 32; bit 1: prefer 16; bit 2: keep reduce type; bit 3 keep white list type; bit 4: array like op prefer too"); + +DEFINE_FLAG_WITH_SETTER(int, auto_mixed_precision_level, 0, "Auto mixed-precision optimization level, 0: not use fp16, 1-3: preserve level, not use fp16 for now; 4: perfer fp16, but some ops use fp32 e.g. 
sum,exp; 5: simular with 4, and array op will automatically convert to fp16; 6: all ops prefer fp16"); + +void setter_auto_mixed_precision_level(int value) { + if (value <= 3) amp_reg = 0; else + if (value == 4) amp_reg = amp_prefer16; else + if (value == 5) amp_reg = amp_prefer16 | amp_array_prefer; else + if (value == 6) amp_reg = amp_prefer16 | amp_array_prefer | amp_keep_reduce | amp_keep_white; +} + #define FOR_ALL_TYPES(m) \ m(bool) \ m(int8) \ @@ -89,15 +100,18 @@ static unordered_set unary_ops = { "erfinv" }; -static unordered_set unary_float_ops = { +static unordered_set float_ops = { "log", "exp", "sqrt", + "mean", + "divide", }; -static unordered_set unary_int_ops = { +static unordered_set int_ops = { "round_int", "floor_int", "ceil_int", + "floor_divide", }; static unordered_set binary_ops = { @@ -127,6 +141,13 @@ static unordered_set binary_ops = { "mean", }; + +static unordered_set white_ops = { + // "log", + "exp", + "pow", +}; + #define DEFINE_NS(T) NanoString ns_##T; FOR_ALL_NS(DEFINE_NS); @@ -135,6 +156,9 @@ char __ns_to_string[ns_max_size*ns_max_len]; int __ns_len[ns_max_size]; static void init_ns() { + dsize_map["float16"] = 1; + is_float_map["float16"] = 1; + is_unsigned["float16"] = 0; NanoString::ns_t i=0; auto func = [&](const char* name, NanoString& ns) { ns.set(NanoString::_index, i++, NanoString::_index_nbits); @@ -149,13 +173,16 @@ static void init_ns() { if (unary_ops.count(name)) { ns.set(NanoString::_type, NanoString::_unary, NanoString::_type_nbits); ns.set(NanoString::_bool, is_bool.count(name)); - ns.set(NanoString::_int, unary_int_ops.count(name)); - ns.set(NanoString::_float, unary_float_ops.count(name)); + ns.set(NanoString::_int, int_ops.count(name)); + ns.set(NanoString::_float, float_ops.count(name)); } else if (binary_ops.count(name)) { ns.set(NanoString::_type, NanoString::_binary, NanoString::_type_nbits); ns.set(NanoString::_bool, is_bool.count(name)); + ns.set(NanoString::_int, int_ops.count(name)); + ns.set(NanoString::_float, float_ops.count(name)); } + ns.set(NanoString::_white_list, white_ops.count(name)); __string_to_ns[name] = ns; auto name2 = ns.to_cstring(); int len=0; @@ -171,6 +198,7 @@ static void init_ns() { __string_to_ns["sum"] = ns_add; __string_to_ns["min"] = ns_minimum; __string_to_ns["max"] = ns_maximum; + __string_to_ns["half"] = ns_float16; __string_to_ns["float"] = ns_float32; __string_to_ns["double"] = ns_float64; __string_to_ns["int"] = ns_int32; diff --git a/python/jittor/src/misc/nano_string.h b/python/jittor/src/misc/nano_string.h index 12feb331..02440985 100644 --- a/python/jittor/src/misc/nano_string.h +++ b/python/jittor/src/misc/nano_string.h @@ -24,6 +24,7 @@ constexpr int ns_max_len = 16; m(uint16) \ m(uint32) \ m(uint64) \ + m(float16) \ m(float32) \ m(float64) \ \ @@ -100,7 +101,7 @@ struct NanoString { typedef uint16 ns_t; enum Flags { // bit0~7: index - _index=0, _index_nbits=8, + _index=0, _index_nbits=7, _n=_index_nbits, // bit0-1: type @@ -116,6 +117,8 @@ struct NanoString { _float=_n+5, // bit6-7: dsize(1,2,4,8 byte) _dsize=_n+6, _dsize_nbits=2, + // bit8: white list + _white_list=_n+8, }; ns_t data=0; @@ -130,11 +133,16 @@ struct NanoString { inline ns_t index() const { return get(_index, _index_nbits); } inline int len() const { return __ns_len[index()]; } inline ns_t type() const { return get(_type, _type_nbits); } - inline ns_t is_bool() const { return get(_bool); } - inline ns_t is_int() const { return get(_int); } - inline ns_t is_unsigned() const { return get(_unsigned); } - inline ns_t is_float() 
const { return get(_float); } + // @pyjt(is_bool) + inline bool is_bool() const { return get(_bool); } + // @pyjt(is_int) + inline bool is_int() const { return get(_int); } + inline bool is_unsigned() const { return get(_unsigned); } + // @pyjt(is_float) + inline bool is_float() const { return get(_float); } + inline ns_t is_white() const { return get(_white_list); } inline ns_t dsize() const { return 1< jit_ops; string_view_map jit_key_mapper; -int64_t Op::number_of_lived_ops = 0; +int64 Op::number_of_lived_ops = 0; Op::Op() { flags.set(NodeFlags::_var, 0); flags.set(NodeFlags::_cpu, 1); + flags.flags |= ((amp_reg & 7) << NodeFlags::_prefer_32); number_of_lived_ops++; if (PREDICT_BRANCH_NOT_TAKEN(trace_py_var)) trace_data.record_node(this); } @@ -122,43 +123,24 @@ void Op::do_jit_prepare(JK& jk) { if (has_cuda && has_cpu && !use_cuda) flags.set(NodeFlags::_cuda, 0); } else { - // check use int64_t as index_t if array is too big - int in_id=0, out_id=0; bool use_int64_t = false; // TODO: fused op do not have inputs, // check use_cuda_op from outputs may not be enough bool use_cuda_op = use_cuda; for (Var* var : inputs()) { - if (var->mem_ptr) { - /* jit key don't include here, because - parallel compiler don't known - jk << JK::key << "alloc_i" << JK::hex1(in_id) - << JK::hex1(var->allocator->flags()) << JK::end; - */ - use_cuda_op &= var->allocator->is_cuda(); - } if (var->num >= std::numeric_limits::max()) use_int64_t = true; - in_id ++; } for (Var* var : outputs()) { - if (var->mem_ptr) { - /* - jk << JK::key << "alloc_o" << JK::hex1(in_id) - << JK::hex1(var->allocator->flags()) << JK::end; - */ - use_cuda_op &= var->allocator->is_cuda(); - } if (var->num >= std::numeric_limits::max()) use_int64_t = true; - out_id ++; } jk << _CS("[JIT:1]"); if (use_cuda_op && flags.get(NodeFlags::_cuda)) { jk << _CS("[JIT_cuda:1]"); flags.set(NodeFlags::_cpu, 0); // TODO: 64bit index in CUDA - use_int64_t = false; + // use_int64_t = false; } else { if (use_cuda==2) { if (flags.get(NodeFlags::_cuda)) @@ -268,11 +250,15 @@ void Op::jit_run(JK& jk) { void Op::statistics(uint64_t& in, uint64_t& out, uint64_t& compute) { in = out = compute = 0; - for (Var* var : inputs()) { + for (auto& e : _inputs) { + auto var = e.node->var(); + if (e.back->index<0) continue; in += var->size; compute = std::max(compute, (uint64_t)var->num); } - for (Var* var : outputs()) { + for (auto& e : _outputs) { + auto var = e.node->var(); + if (e.index<0) continue; out += var->size; compute = std::max(compute, (uint64_t)var->num); } diff --git a/python/jittor/src/op.h b/python/jittor/src/op.h index 25d752e5..957e435a 100644 --- a/python/jittor/src/op.h +++ b/python/jittor/src/op.h @@ -15,7 +15,7 @@ namespace jittor { enum OpType {other=0, element=1, broadcast=2, reduce=3}; struct Op : Node { vector outputs_holder; - static int64_t number_of_lived_ops; + static int64 number_of_lived_ops; inline Caster inputs() { CHECK_EXIST; return &_inputs; } inline Caster outputs() { CHECK_EXIST; return &_outputs; } diff --git a/python/jittor/src/op_compiler.cc b/python/jittor/src/op_compiler.cc index f7c84167..53acf0e2 100644 --- a/python/jittor/src/op_compiler.cc +++ b/python/jittor/src/op_compiler.cc @@ -112,7 +112,7 @@ int OpCompiler::total_member_count() { return member_count; } -int64_t OpCompiler::eval(const string& expr, const unordered_map& vars) { +int64 OpCompiler::eval(const string& expr, const unordered_map& vars) { if (expr.find("@") != string::npos) { string new_expr; for (size_t i=0; i& macros) { } } +string expand_op_search(const 
vector& args) { + for (auto op_type : op_types) { + string ret = op_type->expand_op(args); + if (ret.size()) + return ret; + } + LOGf << "No expand op pattern found for args:" << args; + return ""; +} + void expand_macro(const string& macro, const vector& args, string& new_src) { LOGvvvv << "expand_macro" << macro << "args:" << args; if (macro.size() == 0 || macro[0] != '<') { @@ -434,6 +444,7 @@ string precompile(unordered_map defs, string src, unordered_map args; size_t l = k+1; if (expr == "for" || expr == "if" || expr == "expand_macro" || + expr == "expand_op" || expr == "is_def" || expr == "python" || (k defs, string src, unordered_map bool { if (unchanged.count(s)) return true; + for (auto op_type : op_types) + if (op_type->types.count(s)) + return true; return (s.find("::") != string::npos) || (s.find("LOG") != string::npos); }; // regex find XxxXxxOp::jit_run @@ -1043,7 +1069,7 @@ jit_op_entry_t OpCompiler::compile(const string& jit_key, const string& src) { if (v->loop_options) for (auto& kv : v->loop_options.data()) { if (kv.second && startswith(kv.first, "FLAGS:")) - extra_flags += " "+kv.first.substr(6)+" "; + extra_flags += " " + kv.first.substr(6) + " "; } return jit_compiler::compile(jit_key, src, is_cuda, extra_flags); } @@ -1052,6 +1078,8 @@ jit_op_entry_t OpCompiler::do_compile(Op* op) { jittor::lock_guard lg; OpCompiler oc(op); string* src = &oc.src; + for (auto op_type : op_types) + op_type->post_pass(&oc); string src_after_passes; // if is fused op if (oc.op) { diff --git a/python/jittor/src/ops/binary_op.cc b/python/jittor/src/ops/binary_op.cc index 7946ea61..197d90ff 100644 --- a/python/jittor/src/ops/binary_op.cc +++ b/python/jittor/src/ops/binary_op.cc @@ -8,7 +8,6 @@ #include "var.h" #include "ops/binary_op.h" #include "ops/broadcast_to_op.h" -#include "ops/binary_op_defs.h" #include "ops/op_register.h" namespace jittor { @@ -419,21 +418,13 @@ unordered_set binary_ops = { "bitwise_xor", }; -NanoString binary_dtype_infer(NanoString op, Var* x, Var* y) { - if (op == ns_mean) return dtype_infer(x->ns, y->ns, 2); // force float - int force_type=0; - if (op == ns_divide) force_type=2; // force float - if (op == ns_floor_divide) force_type=1; // force int - return op.is_bool() ? ns_bool : dtype_infer(x->ns, y->ns, force_type, op); -} - BinaryOp::BinaryOp(Var* x, Var* y, NanoString op) : x(x), y(y) { flags.set(NodeFlags::_cpu); flags.set(NodeFlags::_cuda); set_type(OpType::element); ns = op; ASSERT(ns.is_binary()); - z = create_output(nullptr, binary_dtype_infer(op, x, y)); + z = create_output(nullptr, binary_dtype_infer(op, x->ns, y->ns)); } VarPtr dirty_clone_broadcast(Var* v) { @@ -554,7 +545,7 @@ void BinaryOp::jit_run() { auto* __restrict__ zp = z->ptr(); index_t num = z->num; for (index_t i=0; i. -// This file is subject to the terms and conditions defined in -// file 'LICENSE.txt', which is part of this source code package. 
-// *************************************************************** -#pragma once -#include "common.h" - -namespace jittor { - -#ifdef JIT_cuda -#define pow(T,a,b) ::pow(a,b) -#define maximum(T,a,b) ::max(T(a), T(b)) -#define minimum(T,a,b) ::min(T(a), T(b)) -#define mod(T,a,b) @if(@strcmp(@T,float32)==0,(a-::floorf((a)/(b))*(b)),@if(@strcmp(@Tx,float64)==0,(a-::floor((a)/(b))*(b)),(a%b))) -#else // JIT_cpu -#define pow(T,a,b) std::pow(a,b) -#define maximum(T,a,b) std::max(T(a), T(b)) -#define minimum(T,a,b) std::min(T(a), T(b)) -#define mod(T,a,b) @if(@strcmp(@T,float32)==0,(a-std::floor((a)/(b))*(b)),@if(@strcmp(@Tx,float64)==0,(a-std::floor((a)/(b))*(b)),(a%b))) -#endif -#define add(T,a,b) ((a)+(b)) -#define subtract(T,a,b) ((a)-(b)) -#define multiply(T,a,b) ((a)*(b)) -#define divide(T,a,b) (T((T(a))/(T(b)))) -#define floor_divide(T,a,b) (T((T(a))/(T(b)))) -#define less(T,a,b) ((a)<(b)) -#define less_equal(T,a,b) ((a)<=(b)) -#define greater(T,a,b) ((a)>(b)) -#define greater_equal(T,a,b) ((a)>=(b)) -#define equal(T,a,b) ((a)==(b)) -#define not_equal(T,a,b) ((a)!=(b)) -#define left_shift(T,a,b) ((a)<<(b)) -#define right_shift(T,a,b) ((a)>>(b)) -#define logical_and(T,a,b) ((a)&&(b)) -#define logical_or(T,a,b) ((a)||(b)) -#define logical_xor(T,a,b) ((bool(a))!=(bool(b))) -#define bitwise_and(T,a,b) ((a)&(b)) -#define bitwise_or(T,a,b) ((a)|(b)) -#define bitwise_xor(T,a,b) ((a)^(b)) -#define mean(T,a,b) ((a)+T(b)*(T(rcount))) - -#ifdef JIT_cuda -#define init_maximum(T) ::numeric_min() -#define init_minimum(T) ::numeric_max() -#else -#define init_maximum(T) std::numeric_limits::lowest() -#define init_minimum(T) std::numeric_limits::max() -#endif -#define init_add(T) T(0) -#define init_multiply(T) T(1) -#define init_logical_and(T) true -#define init_logical_or(T) false -#define init_logical_xor(T) false -#define init_bitwise_and(T) T(-1) -#define init_bitwise_or(T) T(0) -#define init_bitwise_xor(T) T(0) -#define init_mean(T) T(0) - -} // jittor \ No newline at end of file diff --git a/python/jittor/src/ops/candidate_op.cc b/python/jittor/src/ops/candidate_op.cc index 5a056722..f50955cb 100644 --- a/python/jittor/src/ops/candidate_op.cc +++ b/python/jittor/src/ops/candidate_op.cc @@ -91,7 +91,7 @@ void CandidateOp::jit_run() { int n=0; // checkCudaErrors(cudaDeviceSynchronize()); - checkCudaErrors(cudaMemcpy(&n, np, 4, cudaMemcpyDefault)); + checkCudaErrors(cudaMemcpy(&n, np, 4, cudaMemcpyDeviceToHost)); y->set_shape({n}); exe.temp_allocator->free(np, 4, n_allocation); exe.temp_allocator->free(maskp, xshape0, mask_allocation); diff --git a/python/jittor/src/ops/copy_op.cc b/python/jittor/src/ops/copy_op.cc index 535f0d3f..5d62e1c8 100644 --- a/python/jittor/src/ops/copy_op.cc +++ b/python/jittor/src/ops/copy_op.cc @@ -39,8 +39,8 @@ void CopyOp::run() { auto x_ptr = x->mem_ptr; auto y_ptr = outputs().front()->mem_ptr; #ifdef HAS_CUDA - if (flags.get(NodeFlags::_cuda)) { - checkCudaErrors(cudaMemcpyAsync(y_ptr, x_ptr, size, cudaMemcpyDefault, 0)); + if (flags.get(NodeFlags::_cuda)) { + checkCudaErrors(cudaMemcpyAsync(y_ptr, x_ptr, size, cudaMemcpyDeviceToDevice, 0)); } else #endif { diff --git a/python/jittor/src/ops/fetch_op.cc b/python/jittor/src/ops/fetch_op.cc index 101860e7..48303686 100644 --- a/python/jittor/src/ops/fetch_op.cc +++ b/python/jittor/src/ops/fetch_op.cc @@ -121,13 +121,18 @@ void FetchOp::run() { checkCudaErrors(cudaStreamWaitEvent(stream, event, 0)); new (&allocation) Allocation(&cuda_dual_allocator, v->size); // mostly device to device + #if IS_CUDA 
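            // Why the explicit copy kinds (an inference, not stated in the patch):
            // cudaMemcpyDefault needs unified virtual addressing to deduce the copy
            // direction, which CUDA-compatible backends built on other devices may not
            // offer, so the direction is spelled out and the generic form is kept only
            // when IS_CUDA is defined.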
checkCudaErrors(cudaMemcpyAsync( allocation.ptr, v->mem_ptr, v->size, cudaMemcpyDefault, stream)); + #else + checkCudaErrors(cudaMemcpyAsync( + allocation.ptr, v->mem_ptr, v->size, cudaMemcpyDeviceToDevice, stream)); + #endif auto host_ptr = cuda_dual_allocator.get_dual_allocation( allocation.allocation).host_ptr; // device to host checkCudaErrors(cudaMemcpyAsync( - host_ptr, allocation.ptr, v->size, cudaMemcpyDefault, stream)); + host_ptr, allocation.ptr, v->size, cudaMemcpyDeviceToHost, stream)); allocation.ptr = host_ptr; has_cuda_memcpy = true; } else diff --git a/python/jittor/src/ops/op_register.cc b/python/jittor/src/ops/op_register.cc index 7c9a5647..23879ab3 100644 --- a/python/jittor/src/ops/op_register.cc +++ b/python/jittor/src/ops/op_register.cc @@ -33,4 +33,12 @@ OpInfo get_op_info(const string& name) { return op_info_map.at(op_file_name); } +vector op_types; + +int registe_op_type(OpByType* op_type) { + op_types.push_back(op_type); + return 0; +} + + } // jittor \ No newline at end of file diff --git a/python/jittor/src/ops/op_register.h b/python/jittor/src/ops/op_register.h index 989e588c..bfb56295 100644 --- a/python/jittor/src/ops/op_register.h +++ b/python/jittor/src/ops/op_register.h @@ -32,4 +32,14 @@ void op_registe(const OpInfo& op_info); bool has_op(const string& name); OpInfo get_op_info(const string& name); +struct OpCompiler; +struct OpByType { + unordered_set types; + virtual string expand_op(const vector& args) = 0; + virtual void post_pass(OpCompiler*) = 0; +}; + +extern vector op_types; +int registe_op_type(OpByType*); + } // jittor \ No newline at end of file diff --git a/python/jittor/src/ops/reduce_op.cc b/python/jittor/src/ops/reduce_op.cc index fd4c5f29..d5f1935c 100644 --- a/python/jittor/src/ops/reduce_op.cc +++ b/python/jittor/src/ops/reduce_op.cc @@ -8,7 +8,6 @@ #include #include "var.h" #include "ops/reduce_op.h" -#include "ops/binary_op_defs.h" #include "ops/op_register.h" #include "executor.h" @@ -272,7 +271,7 @@ ReduceOp::ReduceOp(Var* x, NanoString op, NanoVector dims, bool keepdims) if (x->dtype() == ns_bool) y = create_output(nullptr, ns_int32); else - y = create_output(nullptr, binary_dtype_infer(ns, x, x)); + y = create_output(nullptr, reduce_dtype_infer(ns, x->ns)); } ReduceOp::ReduceOp(Var* x, NanoString op, uint dims_mask, uint keepdims_mask) @@ -284,7 +283,7 @@ ReduceOp::ReduceOp(Var* x, NanoString op, uint dims_mask, uint keepdims_mask) ASSERT(ns.is_binary()); reduce_mask = dims_mask; this->keepdims_mask = keepdims_mask; - y = create_output(nullptr, binary_dtype_infer(ns, x, x)); + y = create_output(nullptr, reduce_dtype_infer(ns, x->ns)); } ReduceOp::ReduceOp(Var* x, NanoString op, int dim, bool keepdims) @@ -360,18 +359,18 @@ void ReduceOp::jit_run() { @for(i, DIM-2, -1, -1, auto ystride@i = ystride@{i+1} * yshape@{i+1};) index_t xstride@{DIM-1} = 1; @for(i, DIM-2, -1, -1, auto xstride@i = xstride@{i+1} * xshape@{i+1};) - Ty count = Ty(x->num) / Ty(y->num); - Ty rcount = Ty(y->num) / Ty(x->num); + Ty count = x->num*1.0 / y->num; + Ty rcount = y->num*1.0 / x->num; @for(d, 0, DIM,@if(REDUCE>>d&1,, for (index_t xi@d=0; xi@d < xshape@d; xi@d++))) { auto yid = 0 @for(d, 0, DIM,@if(REDUCE>>d&1,, + xi@d * ystride@d)); - yp[yid] = @expand_macro(init_@OP, Ty); + yp[yid] = @expand_op(init_@OP, @Ty); } @for(d, 0, DIM,@if(REDUCE>>d&1,, for (index_t xi@d=0; xi@d < xshape@d; xi@d++))) { @for(d, 0, DIM,@if(REDUCE>>d&1, for (index_t xi@d=0; xi@d < xshape@d; xi@d++),)) { auto yid = 0 @for(d, 0, DIM,@if(REDUCE>>d&1,, + xi@d * ystride@d)); auto xid = 0 
@for(d, 0, DIM, + xi@d * xstride@d); - yp[yid] = @expand_macro(@OP, Ty, yp[yid], xp[xid]); + yp[yid] = @expand_op(@OP, @Ty, yp[yid], @Ty, xp[xid], @Tx); } } (void)count, (void)rcount, (void)yshape0, (void)ystride0; diff --git a/python/jittor/src/ops/reindex_op.cc b/python/jittor/src/ops/reindex_op.cc index 03110d7d..be38b97d 100644 --- a/python/jittor/src/ops/reindex_op.cc +++ b/python/jittor/src/ops/reindex_op.cc @@ -132,7 +132,7 @@ void ReindexOp::jit_run() { @for(d, 0, XDIM, index_t xid@d = @expand_macro(INDEX@d);) auto xid = @for(d, 0, XDIM, + xid@d * xstride@d); bool check_overflow = 0 @for(d, 0, XDIM, || xid@d<0 || xid@d>=xshape@d) @for(d, 0, OSIZE, || (@expand_macro(OFD@d))); - yp[yid] = check_overflow ? (@OVERFLOW) : xp[xid]; + yp[yid] = check_overflow ? Tx(@OVERFLOW) : xp[xid]; } } #endif // JIT diff --git a/python/jittor/src/ops/reindex_reduce_op.cc b/python/jittor/src/ops/reindex_reduce_op.cc index c7ae1d34..93b958e6 100644 --- a/python/jittor/src/ops/reindex_reduce_op.cc +++ b/python/jittor/src/ops/reindex_reduce_op.cc @@ -8,7 +8,6 @@ #include #include "var.h" #include "ops/reindex_reduce_op.h" -#include "ops/binary_op_defs.h" #include "ops/op_register.h" namespace jittor { @@ -112,7 +111,7 @@ void ReindexReduceOp::jit_run() { @for(d, 0, XDIM, for (index_t i@d=0; i@d < xshape@d; i@d++)) { auto xid = @for(d, 0, XDIM, + i@d * xstride@d); - xp[xid] = @expand_macro(init_@OP, Tx); + xp[xid] = @expand_op(init_@OP, @Tx); } // generate d-for loop @for(d, 0, YDIM, for (index_t i@d=0; i@d < yshape@d; i@d++)) { @@ -121,7 +120,7 @@ void ReindexReduceOp::jit_run() { auto xid = @for(d, 0, XDIM, + xid@d * xstride@d); bool check_overflow = 0 @for(d, 0, XDIM, || xid@d<0 || xid@d>=xshape@d) @for(d, 0, OSIZE, || (@expand_macro(OFD@d))); if (!check_overflow) - xp[xid] = @expand_macro(@OP, Tx, xp[xid], yp[yid]); + xp[xid] = @expand_op(@OP, @Tx, xp[xid], @Tx, yp[yid], @Tx); } } #endif // JIT diff --git a/python/jittor/src/ops/setitem_op.cc b/python/jittor/src/ops/setitem_op.cc index 13836f36..513aac6b 100644 --- a/python/jittor/src/ops/setitem_op.cc +++ b/python/jittor/src/ops/setitem_op.cc @@ -9,7 +9,6 @@ #include "ops/setitem_op.h" #include "ops/getitem_op.h" #ifdef JIT -#include "ops/binary_op_defs.h" #ifdef JIT_cuda #include #include "helper_cuda.h" @@ -313,7 +312,7 @@ void SetitemOp::jit_run() { std::memcpy(op, ip, out->size); #else if (op != ip) - checkCudaErrors(cudaMemcpyAsync(op, ip, out->size, cudaMemcpyDefault, 0)); + checkCudaErrors(cudaMemcpyAsync(op, ip, out->size, cudaMemcpyDeviceToDevice, 0)); #endif if (flags.get((NodeFlags::Flags(SetitemOp::_data_inplaced))) && @@ -340,12 +339,12 @@ void SetitemOp::jit_run() { @if(@is_def(JIT_cpu), @if(@strcmp(@OP,void)==0, op[iid] = (Ti)dp[did], - op[iid] = @expand_macro(@OP, Ti, op[iid], dp[did]) + op[iid] = @expand_op(@OP, @Ti, op[iid], @Ti, dp[did], @Td) ); , @if(@strcmp(@OP,void)==0, op[iid] = (Ti)dp[did], @if(@strcmp(@OP,add)==0, atomicAdd(&op[iid], (Ti)dp[did]), - op[iid] = @expand_macro(@OP, Ti, op[iid], dp[did]) + op[iid] = @expand_op(@OP, @Ti, op[iid], @Ti, dp[did], @Td) ) ); ) diff --git a/python/jittor/src/ops/transpose_op.cc b/python/jittor/src/ops/transpose_op.cc index 85888b95..94d9576e 100644 --- a/python/jittor/src/ops/transpose_op.cc +++ b/python/jittor/src/ops/transpose_op.cc @@ -28,6 +28,12 @@ TransposeOp::TransposeOp(Var* x, NanoVector axes_) : x(x), axes(axes_) { for (int i=0; i<(int)xdim; i++) axes.push_back(xdim-1-i); } + if (axes.size() < xdim || (axes.size() == xdim && axes[xdim-1]==xdim-1)) { + static 
VarPtr(*fuse_transpose)(Var*, NanoVector) = get_op_info("fuse_transpose").get_constructor(); + auto var = fuse_transpose(x, axes); + forward(var); + return; + } #ifdef HAS_CUDA if (use_cuda) { static VarPtr(*cutt_transpose)(Var*, NanoVector) = nullptr; diff --git a/python/jittor/src/ops/unary_op.cc b/python/jittor/src/ops/unary_op.cc index c1ef95d2..9a95d6df 100644 --- a/python/jittor/src/ops/unary_op.cc +++ b/python/jittor/src/ops/unary_op.cc @@ -8,7 +8,6 @@ #include "misc/cpu_math.h" #include "var.h" #include "ops/unary_op.h" -#include "ops/unary_op_defs.h" #include "ops/op_register.h" namespace jittor { @@ -33,6 +32,7 @@ static unordered_set unary_ops = { "uint16", "uint32", "uint64", + "float16", "float32", "float64", // please keep float64 the last type @@ -534,22 +534,15 @@ UnaryOp::UnaryOp(Var* x, NanoString op) : x(x) { ns = op; ASSERT(ns.is_unary() | ns.is_dtype()); NanoString dtype; + if (ns == x->dtype()) { + forward(x); + return; + } if (ns.is_dtype()) { - if (ns == x->dtype()) { - forward(x); - return; - } dtype = ns; ns = ns_cast; - } else if (ns.is_bool()) - dtype = ns_bool; - else if (ns.is_float()) - dtype = dtype_infer(x->ns, x->ns, 2); - else if (ns.is_int()) - dtype = dtype_infer(x->ns, x->ns, 1); - else { - dtype = x->ns; - } + } else + dtype = unary_dtype_infer(ns, x->ns); y = create_output(nullptr, dtype); } @@ -688,7 +681,7 @@ void UnaryOp::jit_run() { auto* __restrict__ yp = y->ptr(); index_t num = y->num; for (index_t i=0; i. -// This file is subject to the terms and conditions defined in -// file 'LICENSE.txt', which is part of this source code package. -// *************************************************************** -#pragma once -#include "common.h" - -namespace jittor { - -#define logical_not(T,x) (!(x)) -#define bitwise_not(T,x) (~(x)) -#define negative(T,x) (-(x)) -#ifdef JIT_cuda -// TODO: add float64 version -#define abs(T,x) ::abs(x) -#define log(T,x) ::logf((T)(x)) -#define exp(T,x) ::expf((T)(x)) -#define sqrt(T,x) ::sqrtf((T)(x)) -#define round(T,x) ((T) ::roundf((x))) -#define floor(T,x) ((T) ::floorf((x))) -#define ceil(T,x) ((T) ::ceilf((x))) -#define round_int(T,x) ((T) ::roundf((x))) -#define floor_int(T,x) ((T) ::floorf((x))) -#define ceil_int(T,x) ((T) ::ceilf((x))) - -#define sin(T,x) ((T) ::sinf((x))) -#define asin(T,x) ((T) ::asinf((x))) -#define sinh(T,x) ((T) ::sinhf((x))) -#define asinh(T,x) ((T) ::asinhf((x))) - -#define cos(T,x) ((T) ::cosf((x))) -#define acos(T,x) ((T) ::acosf((x))) -#define cosh(T,x) ((T) ::coshf((x))) -#define acosh(T,x) ((T) ::acoshf((x))) - -#define tan(T,x) ((T) ::tanf((x))) -#define atan(T,x) ((T) ::atanf((x))) -#define tanh(T,x) ((T) ::tanhf((x))) -#define atanh(T,x) ((T) ::atanhf((x))) - -#define sigmoid(T,x) ((T) (1.0f/(1.0f+::expf((::min(T(-(x)), T(@if(@strcmp(@T,float32)==0,30,300)))))))) - -#define erf(T,x) ((T) ::erff((x))) -#define erfinv(T,x) ((T) ::erfinvf((T)(x))) - -#else -#define abs(T,x) std::abs(x) -#define log(T,x) std::log((T)(x)) -#define exp(T,x) std::exp((T)(x)) -#define sqrt(T,x) std::sqrt((T)(x)) -#define round(T,x) ((T)std::round((x))) -#define floor(T,x) ((T)std::floor((x))) -#define ceil(T,x) ((T)std::ceil((x))) -#define round_int(T,x) ((T)std::round((x))) -#define floor_int(T,x) ((T)std::floor((x))) -#define ceil_int(T,x) ((T)std::ceil((x))) - -#define sin(T,x) ((T) std::sin((x))) -#define asin(T,x) ((T) std::asin((x))) -#define sinh(T,x) ((T) std::sinh((x))) -#define asinh(T,x) ((T) std::asinh((x))) - -#define cos(T,x) ((T) std::cos((x))) -#define acos(T,x) ((T) std::acos((x))) 
-#define cosh(T,x) ((T) std::cosh((x))) -#define acosh(T,x) ((T) std::acosh((x))) - -#define tan(T,x) ((T) std::tan((x))) -#define atan(T,x) ((T) std::atan((x))) -#define tanh(T,x) ((T) std::tanh((x))) -#define atanh(T,x) ((T) std::atanh((x))) - -#define sigmoid(T,x) ((T) (1.0f/(1.0f+std::exp(std::min(T(-(x)), T(@if(@strcmp(@T,float32)==0,30,300))))))) - -#define erf(T,x) ((T) std::erf((x))) -#define erfinv(T,x) (jittor::_erfinv(x)) - -#endif - -#define cast(T,x) ((T)(x)) - -} // jittor \ No newline at end of file diff --git a/python/jittor/src/ops/where_op.cc b/python/jittor/src/ops/where_op.cc index 8e15dd81..e11025fb 100644 --- a/python/jittor/src/ops/where_op.cc +++ b/python/jittor/src/ops/where_op.cc @@ -230,7 +230,7 @@ void WhereOp::jit_run() { int n=0; // checkCudaErrors(cudaDeviceSynchronize()); - checkCudaErrors(cudaMemcpy(&n, np, 4, cudaMemcpyDefault)); + checkCudaErrors(cudaMemcpy(&n, np, 4, cudaMemcpyDeviceToHost)); @for(i, 0, NDIM, outs[@i]->set_shape({n});) exe.temp_allocator->free(np, 4, n_allocation); } diff --git a/python/jittor/src/opt/tuner/conv_tuner.cc b/python/jittor/src/opt/tuner/conv_tuner.cc index 738746d4..c043c1b5 100644 --- a/python/jittor/src/opt/tuner/conv_tuner.cc +++ b/python/jittor/src/opt/tuner/conv_tuner.cc @@ -25,6 +25,7 @@ namespace jittor { using namespace expr; +extern int use_cuda; struct OpInspector { // binary mask for @@ -229,9 +230,14 @@ void ConvTuner::forwardTune(FusedOp* fop) { if (!(bop->y->input() && bop->x->input() && fop->has(bop->x->input()) && fop->has(bop->y->input()))) continue; if (!(bop->x->input()->type()==OpType::broadcast && bop->y->input()->type()==OpType::broadcast)) return; - // only support float32 currently - if (bop->z->dtype() != ns_float32) - continue; + // only support float32,float16 currently + if (use_cuda) { + if (bop->z->dtype() != ns_float32 && bop->z->dtype() != ns_float16) + continue; + } else { + if (bop->z->dtype() != ns_float32) + continue; + } Op* ops[3] = {op, bop->x->input(), bop->y->input()}; int ok = 0; LOGvvvv << "conv like op" << fop << fop->get_jit_key(get_jk()); diff --git a/python/jittor/src/profiler/profiler.cc b/python/jittor/src/profiler/profiler.cc index 262ac2cf..b133606f 100644 --- a/python/jittor/src/profiler/profiler.cc +++ b/python/jittor/src/profiler/profiler.cc @@ -23,6 +23,7 @@ #include "fused_op.h" #include "profiler/memory_checker.h" #include "misc/deleter.h" +#include "executor.h" namespace jittor { @@ -30,6 +31,8 @@ Profiler profiler; DEFINE_FLAG(int, profiler_warmup, 0, "Profiler warmup."); DEFINE_FLAG(int, profiler_rerun, 0, "Profiler rerun."); +DEFINE_FLAG(int, profiler_record_peek, 0, "Profiler record peek mem bandwidth."); +DEFINE_FLAG(int, profiler_record_shape, 0, "Profiler record shape for op."); DEFINE_FLAG(int, profiler_hide_relay, 0, "Profiler hide relayed op."); DEFINE_FLAG_WITH_SETTER(int, profiler_enable, 0, "Enable profiler."); @@ -54,6 +57,8 @@ void Profiler::start(int64 warmup, int64 rerun) { profiler.records.clear(); profiler.warmup = warmup; profiler.rerun = rerun; + profiler.relay_extra_cost = 0; + profiler.relay_fop = 0; } void Profiler::stop() { @@ -138,6 +143,60 @@ static string get_stack_info(Op* op) { } } +static void stat_peek_bandwidth(uint64 in, uint64 out, uint64 loop, uint64& peek_time_total) { + auto size = (in+out) / 2; + // memcpy in some not aligned case will drop performance + size &= ~((1 << 12)-1); + // size = 7680000*4; + auto temp1 = exe.alloc_temp(size); + auto temp2 = exe.alloc_temp(size); + loop = 1 << loop; + int warmup = std::max(loop/8, 
(uint64)1); + for (int i=0; i(finish-start).count(); + peek_time_total += total_ns; +} + +struct RecordExtraCost { + int ck; + std::chrono::high_resolution_clock::time_point start; + + RecordExtraCost(int ck) : ck(ck) { + if (!ck) return; + start = std::chrono::high_resolution_clock::now(); + } + + ~RecordExtraCost() { + if (!ck) return; + auto finish = std::chrono::high_resolution_clock::now(); + auto total_ns = (int64_t)std::chrono::duration_cast(finish-start).count(); + profiler.relay_extra_cost += total_ns; + } +}; + void Profiler::record_and_run( jit_op_entry_t jit_entry, Op* op, @@ -151,7 +210,10 @@ void Profiler::record_and_run( jit_key : ikey->second.c_str(); auto iter = profiler.records.find(key); uint64_t in, out, compute; - op->statistics(in, out, compute); + if (profiler.relay_fop) + profiler.relay_fop->statistics(in, out, compute); + else + op->statistics(in, out, compute); if (iter == profiler.records.end()) { profiler.records[key] = Info{ 0, 0, -1ull, 0, @@ -165,7 +227,7 @@ void Profiler::record_and_run( bool is_fused = op->name() == string("fused"); uint64* shape_time = nullptr; - if (trace_py_var) { + if (trace_py_var || profiler_record_shape) { // record shape NanoVector shape; int64 num = 0; @@ -193,41 +255,62 @@ void Profiler::record_and_run( iter->second.shapes[shape].second += 1; shape_time = &iter->second.shapes[shape].first; } - int loop = (is_fused && - ((FusedOp*)op)->get_loop_option("insert_profile_loop")) ? 10 : 0; - int64_t warmup = profiler.warmup ? std::max(profiler.warmup>>loop, (int64_t)1) : 0; - int64_t rerun = std::max((profiler.rerun+1)>>loop, (int64_t)1); - // prevent relayed op being rerun - auto warmup_bk = profiler.warmup; - auto rerun_bk = profiler.rerun; - profiler.warmup = profiler.rerun = 0; - Deleter del([&]() { - profiler.warmup = warmup_bk; - profiler.rerun = rerun_bk; - }); - - for (int64_t i=0; icontext && fop->context->vrm.relay_groups.size()) { + // relay op + loop = rerun; + profiler.relay_extra_cost = 0; + profiler.relay_fop = fop; + _d.del = [&]() { + profiler.relay_extra_cost = 0; + profiler.relay_fop = 0; + }; + } else + loop = fop->get_loop_option("insert_profile_loop") ? 
10 : 0; + } + int64 num = 1<<(rerun - loop); + + { + profiler_enable = 0; + Deleter del([&]() { profiler_enable = 1;}); + RecordExtraCost rec(profiler.relay_fop && profiler.relay_fop != op); + for (int64_t i=0; i(finish-start).count(); - // 24ns function call overhead - total_ns = std::max((int64_t)1, total_ns-24); - iter->second.update(loop, total_ns, in, out, compute); - if (shape_time) shape_time[0] += total_ns; - LOGvvvv << "Duration" << total_ns >> "ns running" << op; } + #ifdef HAS_CUDA + if (use_cuda) + checkCudaErrors(cudaDeviceSynchronize()); + #endif + auto finish = std::chrono::high_resolution_clock::now(); + auto total_ns = (int64_t)std::chrono::duration_cast(finish-start).count(); + if (profiler.relay_fop == op) { + total_ns -= profiler.relay_extra_cost; + } + // 24ns function call overhead + total_ns = std::max((int64_t)1, total_ns-24); + iter->second.update(rerun, total_ns, in, out, compute); + if (shape_time) shape_time[0] += total_ns; + + RecordExtraCost rec(profiler.relay_fop && profiler.relay_fop != op); + if (profiler_record_peek) + stat_peek_bandwidth(in, out, rerun, iter->second.peek_time_total); + LOGvvvv << "Duration" << total_ns >> "ns running" << op; if (is_fused && ((FusedOp*)op)->get_loop_option("check_cache")) { auto fname = Op::get_filename_from_jit_key(key, ".so"); @@ -239,6 +322,8 @@ void Profiler::record_and_run( vector> Profiler::report(const string& sort_key) { vector> rep = {{"Name", "FileName", "Count", "TotalTime", "AvgTime", "MinTime", "MaxTime", "Input", "Output", "InOut", "Compute"}}; + if (profiler_record_peek) + rep[0].push_back("Peek"); vector names, fnames; vector> info; vector order; @@ -295,6 +380,10 @@ vector> Profiler::report(const string& sort_key) { (double)(kinfo.in_total+kinfo.out_total)*1e9 / kinfo.time_total, // InOut (double)kinfo.compute_total*1e9 / kinfo.time_total, // Compute }); + if (profiler_record_peek) + info.back().push_back( + (double)(kinfo.in_total+kinfo.out_total)*1e9 / kinfo.peek_time_total // Peek + ); } if (sort_key_id>=2) std::sort(order.begin(), order.end(), [&](int i, int j) { @@ -363,7 +452,7 @@ vector> Profiler::report(const string& sort_key) { << std::setw(3) << std::setprecision(p) << cum_time / total_time * 100 << "%)"; } - } else if (j<=7) { + } else if (j<=7 || j==9) { // output thoughtput output_float(" KMG", 1024, "B/s", k); } else { diff --git a/python/jittor/src/profiler/profiler.h b/python/jittor/src/profiler/profiler.h index 19c02d2a..90aed655 100644 --- a/python/jittor/src/profiler/profiler.h +++ b/python/jittor/src/profiler/profiler.h @@ -24,6 +24,8 @@ struct Profiler { uint64_t in_total, out_total; // compute thoughtput in ops uint64_t compute_total; + // peek time use memcopy + uint64_t peek_time_total; // cache test info unique_ptr cache_info; cstr stack_info; @@ -56,6 +58,9 @@ struct Profiler { int64_t warmup=0, rerun=0; unordered_map records; + int64 relay_extra_cost; + FusedOp* relay_fop; + ~Profiler(); }; diff --git a/python/jittor/src/pybind/py_var_tracer.cc b/python/jittor/src/pybind/py_var_tracer.cc index ee4e2a9e..30072aec 100644 --- a/python/jittor/src/pybind/py_var_tracer.cc +++ b/python/jittor/src/pybind/py_var_tracer.cc @@ -267,7 +267,7 @@ void TraceData::release_node(Node* node) { return; auto node_id = iter->second; id_map.erase(node); - if (trace_py_var < 2) { + if (trace_py_var < 2 || execute_op_info.size() > 100000) { node_data.erase(node_id); } } @@ -312,6 +312,7 @@ void TraceData::record_op(Op* op) { } void TraceData::record_execution(Op* op, bool is_fused_op, JK& jk) { + if 
(execute_op_info.size() > 100000) return; ExecuteOpInfo& einfo = execute_op_info[execute_op_info_cnt++]; if (is_fused_op) { FusedOp* fop = (FusedOp*)op; diff --git a/python/jittor/src/pyjt/numpy.cc b/python/jittor/src/pyjt/numpy.cc index 6546e56c..cda93085 100644 --- a/python/jittor/src/pyjt/numpy.cc +++ b/python/jittor/src/pyjt/numpy.cc @@ -21,7 +21,9 @@ NanoString npy2ns[] = { ns_int64, ns_uint64, ns_float32, ns_float64, ns_float64, ns_void, ns_void, ns_void, - ns_void + ns_void, // 17 + ns_void, ns_void, ns_void, ns_void, ns_void, // 22 + ns_float16, // 23 }; NPY_TYPES ns2npy[] = { @@ -34,7 +36,7 @@ NPY_TYPES ns2npy[] = { NPY_BYTE, NPY_SHORT, NPY_INT, NPY_LONGLONG, NPY_UBYTE, NPY_USHORT, NPY_UINT, NPY_ULONGLONG, #endif - NPY_FLOAT, NPY_DOUBLE + NPY_HALF, NPY_FLOAT, NPY_DOUBLE }; void** PyArray_API; diff --git a/python/jittor/src/pyjt/numpy.h b/python/jittor/src/pyjt/numpy.h index b6468b55..c0c4d123 100644 --- a/python/jittor/src/pyjt/numpy.h +++ b/python/jittor/src/pyjt/numpy.h @@ -48,6 +48,8 @@ enum NPY_TYPES { NPY_FLOAT, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT=17, + NPY_HALF=23, + NPY_END=24, }; EXTERN_LIB NanoString npy2ns[]; @@ -60,11 +62,11 @@ EXTERN_LIB NPY_TYPES ns2npy[]; inline bool is_c_style(PyArray_Proxy* obj) { return obj->flags & 1; } inline NanoString get_type_str(PyArray_Proxy* obj) { NanoString type = ns_void; - if (obj->descr->type_num < NPY_OBJECT) + if (obj->descr->type_num < NPY_END) type = npy2ns[obj->descr->type_num]; CHECK(type != ns_void) << "Numpy type not support, type_num:" << obj->descr->type_num - << "type_char:" << obj->descr->type; + << "type_char:" << obj->descr->type << NPY_END << npy2ns[obj->descr->type_num]; return type; } diff --git a/python/jittor/src/pyjt/py_array_op.cc b/python/jittor/src/pyjt/py_array_op.cc index 683e48bb..79a711a8 100644 --- a/python/jittor/src/pyjt/py_array_op.cc +++ b/python/jittor/src/pyjt/py_array_op.cc @@ -141,7 +141,7 @@ ArrayOp::ArrayOp(PyObject* obj) { } else { // this is non-continue numpy array #if defined(__linux__) || defined(_WIN32) - STACK_ALLOC(int64, dims, args.shape.size()); + STACK_ALLOC(int64_t, dims, args.shape.size()); #elif defined(__APPLE__) long dims[args.shape.size()]; #endif diff --git a/python/jittor/src/pyjt/py_converter.h b/python/jittor/src/pyjt/py_converter.h index 3cafc103..9fb505c0 100644 --- a/python/jittor/src/pyjt/py_converter.h +++ b/python/jittor/src/pyjt/py_converter.h @@ -15,7 +15,7 @@ #include "misc/nano_string.h" #include "misc/fast_shared_ptr.h" #include "profiler/simple_profiler.h" -#ifdef HAS_CUDA +#ifdef IS_CUDA #include "misc/cuda_flags.h" #endif @@ -274,7 +274,7 @@ DEF_IS(ArrayArgs, bool) is_type(PyObject* obj) { DEF_IS(ArrayArgs, PyObject*) to_py_object(const T& a) { #if defined(__linux__) || defined(_WIN32) - STACK_ALLOC(int64, dims, a.shape.size()); + STACK_ALLOC(int64_t, dims, a.shape.size()); #elif defined(__APPLE__) long dims[a.shape.size()]; #endif @@ -390,7 +390,7 @@ DEF_IS(VarHolder*, T) from_py_object(PyObject* obj, unique_ptr& holde struct DataView; DEF_IS(DataView, PyObject*) to_py_object(T a) { #if defined(__linux__) || defined(_WIN32) - STACK_ALLOC(int64, dims, a.shape.size()); + STACK_ALLOC(int64_t, dims, a.shape.size()); #elif defined(__APPLE__) long dims[a.shape.size()]; #endif @@ -652,7 +652,7 @@ DEF_IS(NumpyFunc, T) from_py_object(PyObject* obj) { [obj](typename T::R* result) { // import numpy string npstr="numpy"; - #ifdef HAS_CUDA + #ifdef IS_CUDA if (use_cuda) npstr="cupy"; #endif @@ -669,7 +669,7 @@ DEF_IS(NumpyFunc, T) 
from_py_object(PyObject* obj) { PyTuple_SET_ITEM(args.obj, 0, np.release()); PyTuple_SET_ITEM(args.obj, 1, data.release()); - #ifdef HAS_CUDA + #ifdef IS_CUDA if (npstr=="cupy") { PyObjHolder jt(PyImport_ImportModule("jittor")); PyObjHolder pFunc(PyObject_GetAttrString(jt.obj,"numpy2cupy")); diff --git a/python/jittor/src/pyjt/py_ring_buffer.cc b/python/jittor/src/pyjt/py_ring_buffer.cc index 3f46f4f8..3347553c 100644 --- a/python/jittor/src/pyjt/py_ring_buffer.cc +++ b/python/jittor/src/pyjt/py_ring_buffer.cc @@ -110,7 +110,7 @@ static void push_py_object(RingBuffer* rb, PyObject* obj, uint64& __restrict__ o rb->push(size, offset); args.ptr = rb->get_ptr(size, offset); #if defined(__linux__) || defined(_WIN32) - STACK_ALLOC(int64, dims, args.shape.size()); + STACK_ALLOC(int64_t, dims, args.shape.size()); #elif defined(__APPLE__) long dims[args.shape.size()]; #endif diff --git a/python/jittor/src/type/common_op_type.cc b/python/jittor/src/type/common_op_type.cc new file mode 100644 index 00000000..305917d4 --- /dev/null +++ b/python/jittor/src/type/common_op_type.cc @@ -0,0 +1,165 @@ +// *************************************************************** +// Copyright (c) 2021 Jittor. All Rights Reserved. +// Maintainers: Dun Liang . +// This file is subject to the terms and conditions defined in +// file 'LICENSE.txt', which is part of this source code package. +// *************************************************************** +#include "common.h" +#include "utils/str_utils.h" +#include "ops/op_register.h" + +namespace jittor { + +extern int use_cuda; + +unordered_map common_op_type_cuda_map = { + {"logical_not", "(!($2))"}, + {"bitwise_not", "(~($2))"}, + {"negative", "(-($2))"}, + {"abs", "::abs($2)"}, + {"log", "::logf(($1)($2))"}, + {"exp", "::expf(($1)($2))"}, + {"sqrt", "::sqrtf(($1)($2))"}, + {"round", "(($1) ::roundf(($2)))"}, + {"floor", "(($1) ::floorf(($2)))"}, + {"ceil", "(($1) ::ceilf(($2)))"}, + {"round_int", "(($1) ::roundf(($2)))"}, + {"floor_int", "(($1) ::floorf(($2)))"}, + {"ceil_int", "(($1) ::ceilf(($2)))"}, + {"sin", "(($1) ::sinf(($2)))"}, + {"asin", "(($1) ::asinf(($2)))"}, + {"sinh", "(($1) ::sinhf(($2)))"}, + {"asinh", "(($1) ::asinhf(($2)))"}, + {"cos", "(($1) ::cosf(($2)))"}, + {"acos", "(($1) ::acosf(($2)))"}, + {"cosh", "(($1) ::coshf(($2)))"}, + {"acosh", "(($1) ::acoshf(($2)))"}, + {"tan", "(($1) ::tanf(($2)))"}, + {"atan", "(($1) ::atanf(($2)))"}, + {"tanh", "(($1) ::tanhf(($2)))"}, + {"atanh", "(($1) ::atanhf(($2)))"}, + {"sigmoid", "(($1) (1.0f/(1.0f+::expf((::min($1(-($2)), $1(@if(@strcmp($1,float32)==0,30,300))))))))"}, + {"erf", "(($1) ::erff(($2)))"}, + {"erfinv", "(($1) ::erfinvf(($1)($2)))"}, + {"cast", "(($1)($2))"}, + {"pow", "::pow(($2),($4))"}, + {"maximum", "::max($1($2), $1($4))"}, + {"minimum", "::min($1($2), $1($4))"}, + {"mod", "@if(@strcmp($1,float32)==0,(($2)-::floorf(($2)/($4))*($4)),@if(@strcmp(@Tx,float64)==0,(($2)-::floor(($2)/($4))*($4)),(($2)%($4))))"}, + {"init_maximum", "::numeric_min<$1>()"}, + {"init_minimum", "::numeric_max<$1>()"}, +}; + +struct CommonOpType : OpByType { + CommonOpType() { + types = { + "bool", + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "float32", + "float64", + }; + } + + string expand_op(const vector& args) { + for (int i=1; i cpu_map = { + {"logical_not", "(!($2))"}, + {"bitwise_not", "(~($2))"}, + {"negative", "(-($2))"}, + {"abs", "std::abs($2)"}, + {"log", "std::log(($1)($2))"}, + {"exp", "std::exp(($1)($2))"}, + {"sqrt", "std::sqrt(($1)($2))"}, + {"round", 
"(($1)std::round(($2)))"}, + {"floor", "(($1)std::floor(($2)))"}, + {"ceil", "(($1)std::ceil(($2)))"}, + {"round_int", "(($1)std::round(($2)))"}, + {"floor_int", "(($1)std::floor(($2)))"}, + {"ceil_int", "(($1)std::ceil(($2)))"}, + {"sin", "(($1) std::sin(($2)))"}, + {"asin", "(($1) std::asin(($2)))"}, + {"sinh", "(($1) std::sinh(($2)))"}, + {"asinh", "(($1) std::asinh(($2)))"}, + {"cos", "(($1) std::cos(($2)))"}, + {"acos", "(($1) std::acos(($2)))"}, + {"cosh", "(($1) std::cosh(($2)))"}, + {"acosh", "(($1) std::acosh(($2)))"}, + {"tan", "(($1) std::tan(($2)))"}, + {"atan", "(($1) std::atan(($2)))"}, + {"tanh", "(($1) std::tanh(($2)))"}, + {"atanh", "(($1) std::atanh(($2)))"}, + {"sigmoid", "(($1) (1.0f/(1.0f+std::exp(std::min($1(-($2)), $1(@if(@strcmp($1,float32)==0,30,300)))))))"}, + {"erf", "(($1) std::erf(($2)))"}, + {"erfinv", "(jittor::_erfinv($2))"}, + {"cast", "(($1)($2))"}, + {"pow", "std::pow(($2),($4))"}, + {"maximum", "std::max($1($2), $1($4))"}, + {"minimum", "std::min($1($2), $1($4))"}, + {"mod", "@if(@strcmp($1,float32)==0,(($2)-std::floor(($2)/($4))*($4)),@if(@strcmp(@Tx,float64)==0,(($2)-std::floor(($2)/($4))*($4)),(($2)%($4))))"}, + {"init_maximum", "std::numeric_limits<$1>::lowest()"}, + {"init_minimum", "std::numeric_limits<$1>::max()"}, + }; + + static unordered_map both_map { + {"add", "(($2)+($4))"}, + {"subtract", "(($2)-($4))"}, + {"multiply", "(($2)*($4))"}, + {"divide", "($1(($1($2))/($1($4))))"}, + {"floor_divide", "($1(($1($2))/($1($4))))"}, + {"less", "(($2)<($4))"}, + {"less_equal", "(($2)<=($4))"}, + {"greater", "(($2)>($4))"}, + {"greater_equal", "(($2)>=($4))"}, + {"equal", "(($2)==($4))"}, + {"not_equal", "(($2)!=($4))"}, + {"left_shift", "(($2)<<($4))"}, + {"right_shift", "(($2)>>($4))"}, + {"logical_and", "(($2)&&($4))"}, + {"logical_or", "(($2)||($4))"}, + {"logical_xor", "((bool($2))!=(bool($4)))"}, + {"bitwise_and", "(($2)&($4))"}, + {"bitwise_or", "(($2)|($4))"}, + {"bitwise_xor", "(($2)^($4))"}, + {"mean", "(($2)+$1($4)*($1(rcount)))"}, + {"init_add", "$1(0)"}, + {"init_multiply", "$1(1)"}, + {"init_logical_and", "true"}, + {"init_logical_or", "false"}, + {"init_logical_xor", "false"}, + {"init_bitwise_and", "$1(-1)"}, + {"init_bitwise_or", "$1(0)"}, + {"init_bitwise_xor", "$1(0)"}, + {"init_mean", "$1(0)"}, + }; + + string ret; + if (both_map.count(args.at(0))) + ret = both_map[args.at(0)]; + else if (use_cuda) + ret = cuda_map[args.at(0)]; + else + ret = cpu_map[args.at(0)]; + return format(ret, args); + } + + void post_pass(OpCompiler*) { + return; + } +}; + + +static int _ = registe_op_type(new CommonOpType()); + +} \ No newline at end of file diff --git a/python/jittor/src/type/fp16_compute.h b/python/jittor/src/type/fp16_compute.h new file mode 100644 index 00000000..93833704 --- /dev/null +++ b/python/jittor/src/type/fp16_compute.h @@ -0,0 +1,164 @@ +// *************************************************************** +// Copyright (c) 2021 Jittor. All Rights Reserved. +// Maintainers: Dun Liang . +// This file is subject to the terms and conditions defined in +// file 'LICENSE.txt', which is part of this source code package. 
+// *************************************************************** +#pragma once +#include "common.h" + +#ifdef JIT_cuda + +#include +#include + +namespace jittor { + +typedef __half float16; + +#if CUDA_ARCH >= 800 +inline __device__ float16 max(float16 a, float16 b) { return __hmax(a, b); } +inline __device__ float16 min(float16 a, float16 b) { return __hmin(a, b); } +#else +inline __device__ float16 max(float16 a, float16 b) { return a +__device__ inline void vload(T* __restrict__ a, T* __restrict__ b) { + if constexpr (nbyte<=0) return; + if constexpr (nbyte>=16) { + auto __restrict__ aa = (float4* __restrict__)a; + auto __restrict__ bb = (float4* __restrict__)b; + aa[0] = bb[0]; + return vload(aa+1, bb+1); + } + if constexpr (nbyte>=8) { + auto __restrict__ aa = (float2* __restrict__)a; + auto __restrict__ bb = (float2* __restrict__)b; + aa[0] = bb[0]; + return vload(aa+1, bb+1); + } + if constexpr (nbyte>=4) { + auto __restrict__ aa = (float* __restrict__)a; + auto __restrict__ bb = (float* __restrict__)b; + aa[0] = bb[0]; + return vload(aa+1, bb+1); + } + if constexpr (nbyte>=2) { + auto __restrict__ aa = (__half* __restrict__)a; + auto __restrict__ bb = (__half* __restrict__)b; + aa[0] = bb[0]; + return vload(aa+1, bb+1); + } + if constexpr (nbyte>=1) { + auto __restrict__ aa = (int8_t* __restrict__)a; + auto __restrict__ bb = (int8_t* __restrict__)b; + aa[0] = bb[0]; + return vload(aa+1, bb+1); + } +} + + +} + +using jittor::max; +using jittor::min; +using jittor::pow; + +#else + +namespace jittor { + +struct float16 { + uint16 x; + + inline float16(float32 f) { + unsigned x = *((int*)(void*)(&f)); + unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; + unsigned sign, exponent, mantissa; + + + // Get rid of +NaN/-NaN case first. + if (u > 0x7f800000) { + this->x = 0x7fffU; + return; + } + + sign = ((x >> 16) & 0x8000); + + // Get rid of +Inf/-Inf, +0/-0. + if (u > 0x477fefff) { + this->x = sign | 0x7c00U; + return; + } + if (u < 0x33000001) { + this->x = sign | 0x0000U; + return; + } + + exponent = ((u >> 23) & 0xff); + mantissa = (u & 0x7fffff); + + if (exponent > 0x70) { + shift = 13; + exponent -= 0x70; + } else { + shift = 0x7e - exponent; + exponent = 0; + mantissa |= 0x800000; + } + lsb = (1 << shift); + lsb_s1 = (lsb >> 1); + lsb_m1 = (lsb - 1); + + // Round to nearest even. + remainder = (mantissa & lsb_m1); + mantissa >>= shift; + if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { + ++mantissa; + if (!(mantissa & 0x3ff)) { + ++exponent; + mantissa = 0; + } + } + + this->x = (sign | (exponent << 10) | mantissa); + } + + inline operator float() { + + unsigned sign = ((x >> 15) & 1); + unsigned exponent = ((x >> 10) & 0x1f); + unsigned mantissa = ((x & 0x3ff) << 13); + + if (exponent == 0x1f) { /* NaN or Inf */ + mantissa = (mantissa ? 
(sign = 0, 0x7fffff) : 0); + exponent = 0xff; + } else if (!exponent) { /* Denorm or Zero */ + if (mantissa) { + unsigned int msb; + exponent = 0x71; + do { + msb = (mantissa & 0x400000); + mantissa <<= 1; /* normalize */ + --exponent; + } while (!msb); + mantissa &= 0x7fffff; /* 1.mantissa is implicit */ + } + } else { + exponent += 0x70; + } + + int temp = ((sign << 31) | (exponent << 23) | mantissa); + + return reinterpret_cast(temp); + } +}; + +} + +#endif \ No newline at end of file diff --git a/python/jittor/src/type/fp16_op_type.cc b/python/jittor/src/type/fp16_op_type.cc new file mode 100644 index 00000000..1d6c14cb --- /dev/null +++ b/python/jittor/src/type/fp16_op_type.cc @@ -0,0 +1,188 @@ +// *************************************************************** +// Copyright (c) 2021 Jittor. All Rights Reserved. +// Maintainers: Dun Liang . +// This file is subject to the terms and conditions defined in +// file 'LICENSE.txt', which is part of this source code package. +// *************************************************************** +#include "common.h" +#include "utils/str_utils.h" +#include "ops/op_register.h" +#include "op_compiler.h" + +namespace jittor { + +extern int use_cuda; + +extern unordered_map common_op_type_cuda_map; + +static bool isvar(char x) { return isalnum(x) || x == '_' || x == ':'; } + +struct FP16OpType : OpByType { + FP16OpType() { + types = { + "float16", + }; + } + + string expand_op(const vector& args) { + bool found_fp16 = 0; + for (int i=1; i cuda_map = { + {"logical_not", "(!($2))"}, + {"bitwise_not", "(~($2))"}, + {"negative", "(-($2))"}, + {"abs", "::abs($2)"}, + {"log", "::hlog(($1)($2))"}, + {"exp", "::hexp(($1)($2))"}, + {"sqrt", "::hsqrt(($1)($2))"}, + {"round", "(($1) ::roundf(($2)))"}, + {"floor", "(($1) ::floorf(($2)))"}, + {"ceil", "(($1) ::ceilf(($2)))"}, + {"round_int", "(($1) ::roundf(($2)))"}, + {"floor_int", "(($1) ::floorf(($2)))"}, + {"ceil_int", "(($1) ::ceilf(($2)))"}, + {"sin", "(($1) ::sinf(($2)))"}, + {"asin", "(($1) ::asinf(($2)))"}, + {"sinh", "(($1) ::sinhf(($2)))"}, + {"asinh", "(($1) ::asinhf(($2)))"}, + {"cos", "(($1) ::cosf(($2)))"}, + {"acos", "(($1) ::acosf(($2)))"}, + {"cosh", "(($1) ::coshf(($2)))"}, + {"acosh", "(($1) ::acoshf(($2)))"}, + {"tan", "(($1) ::tanf(($2)))"}, + {"atan", "(($1) ::atanf(($2)))"}, + {"tanh", "(($1) ::tanhf(($2)))"}, + {"atanh", "(($1) ::atanhf(($2)))"}, + {"sigmoid", "(($1) (1.0f/(1.0f+::expf((::min($1(-($2)), $1(@if(@strcmp($1,float16)==0,30,300))))))))"}, + {"erf", "(($1) ::erff(($2)))"}, + {"erfinv", "(($1) ::erfinvf(($1)($2)))"}, + {"cast", "(($1)($2))"}, + {"pow", "::pow(($2),($4))"}, + {"maximum", "::max($1($2), $1($4))"}, + {"minimum", "::min($1($2), $1($4))"}, + {"mod", "$1(($2)-::hfloor(($2)/($4))*($4))"}, + {"init_maximum", "-32768.0f"}, + {"init_minimum", "32768.0f"}, + }; + + static unordered_map cpu_map = { + {"logical_not", "(!($2))"}, + {"bitwise_not", "(~($2))"}, + {"negative", "(-($2))"}, + {"abs", "std::abs($2)"}, + {"log", "std::log(($1)($2))"}, + {"exp", "std::exp(($1)($2))"}, + {"sqrt", "std::sqrt(($1)($2))"}, + {"round", "(($1)std::round(($2)))"}, + {"floor", "(($1)std::floor(($2)))"}, + {"ceil", "(($1)std::ceil(($2)))"}, + {"round_int", "(($1)std::round(($2)))"}, + {"floor_int", "(($1)std::floor(($2)))"}, + {"ceil_int", "(($1)std::ceil(($2)))"}, + {"sin", "(($1) std::sin(($2)))"}, + {"asin", "(($1) std::asin(($2)))"}, + {"sinh", "(($1) std::sinh(($2)))"}, + {"asinh", "(($1) std::asinh(($2)))"}, + {"cos", "(($1) std::cos(($2)))"}, + {"acos", "(($1) std::acos(($2)))"}, + 
{"cosh", "(($1) std::cosh(($2)))"}, + {"acosh", "(($1) std::acosh(($2)))"}, + {"tan", "(($1) std::tan(($2)))"}, + {"atan", "(($1) std::atan(($2)))"}, + {"tanh", "(($1) std::tanh(($2)))"}, + {"atanh", "(($1) std::atanh(($2)))"}, + {"sigmoid", "(($1) (1.0f/(1.0f+std::exp(std::min($1(-($2)), $1(@if(@strcmp($1,float32)==0,30,300)))))))"}, + {"erf", "(($1) std::erf(($2)))"}, + {"erfinv", "(jittor::_erfinv($2))"}, + {"cast", "(($1)($2))"}, + {"pow", "std::pow(($2),($4))"}, + {"maximum", "std::max($1($2), $1($4))"}, + {"minimum", "std::min($1($2), $1($4))"}, + {"mod", "$1(($2)-std::floor(($2)/($4))*($4))"}, + {"init_maximum", "-32768.0f"}, + {"init_minimum", "32768.0f"}, + }; + + static unordered_map both_map { + {"add", "(($2)+($4))"}, + {"subtract", "(($2)-($4))"}, + {"multiply", "(($2)*($4))"}, + {"divide", "($1(($1($2))/($1($4))))"}, + {"floor_divide", "($1(($1($2))/($1($4))))"}, + {"less", "(($2)<($4))"}, + {"less_equal", "(($2)<=($4))"}, + {"greater", "(($2)>($4))"}, + {"greater_equal", "(($2)>=($4))"}, + {"equal", "(($2)==($4))"}, + {"not_equal", "(($2)!=($4))"}, + {"left_shift", "(($2)<<($4))"}, + {"right_shift", "(($2)>>($4))"}, + {"logical_and", "(($2)&&($4))"}, + {"logical_or", "(($2)||($4))"}, + {"logical_xor", "((bool($2))!=(bool($4)))"}, + {"bitwise_and", "(($2)&($4))"}, + {"bitwise_or", "(($2)|($4))"}, + {"bitwise_xor", "(($2)^($4))"}, + {"mean", "(($2)+($4)*($1(rcount)))"}, + {"init_add", "$1(0)"}, + {"init_multiply", "$1(1)"}, + {"init_logical_and", "true"}, + {"init_logical_or", "false"}, + {"init_logical_xor", "false"}, + {"init_bitwise_and", "$1(-1)"}, + {"init_bitwise_or", "$1(0)"}, + {"init_bitwise_xor", "$1(0)"}, + {"init_mean", "$1(0)"}, + }; + + string ret; + if (both_map.count(args.at(0))) + ret = both_map[args.at(0)]; + else if (use_cuda) + ret = cuda_map[args.at(0)]; + else + ret = cpu_map[args.at(0)]; + if (use_cuda) { + if (args[1] == "float32" && !both_map.count(args.at(0))) { + ret = common_op_type_cuda_map[args.at(0)]; + } + if (args[1] == "float16" || args[1] == "float32") { + for (int i=3; isrc; + if (src.find("float16") == string::npos) + return; + int i = src.rfind("#include"); + if (i<0) i=0; + i = src.find('\n', i) + 1; + src = src.substr(0, i) + "#include \"type/fp16_compute.h\"\n" + + src.substr(i); + return; + } +}; + + +static int _ = registe_op_type(new FP16OpType()); + +} \ No newline at end of file diff --git a/python/jittor/src/types.h b/python/jittor/src/types.h index e3942146..fb252fdb 100644 --- a/python/jittor/src/types.h +++ b/python/jittor/src/types.h @@ -18,7 +18,7 @@ namespace jittor { typedef int8_t int8; typedef int16_t int16; typedef int int32; -typedef int64_t int64; +typedef long long int64; typedef uint8_t uint8; typedef uint16_t uint16; typedef uint32_t uint32; @@ -239,4 +239,6 @@ std::ostream& operator<<(std::ostream& os, const Caster& input) { return os << ']'; } +#define JPU(x) ; + } // jittor diff --git a/python/jittor/src/utils/cache_compile.cc b/python/jittor/src/utils/cache_compile.cc index 279c7117..e7e57e09 100644 --- a/python/jittor/src/utils/cache_compile.cc +++ b/python/jittor/src/utils/cache_compile.cc @@ -167,14 +167,14 @@ void process(string src, vector& input_names, string& cmd) { // #include "a.h" // i jk l auto j=i+1; - while (j=src.size()) return; if (j-i != 8 && j-i != 6) continue; auto k=j+1; while (k=src.size()) return; auto l=k+1; - while (l& v) { + string ss; + for (int i=0; i& vs, const string& x) { + string s; + for (int i=0; i token_split(const string& s) { + vector ss; + if (!s.size()) return ss; + 
ss.push_back(string()+s[0]); + for (int i=1; i& patterns, + vector& arg_id) { + patterns.clear(); + arg_id.clear(); + patterns.push_back(""); + for (int j=0; j& tokens, int i, const string& src, const string& dst) { + ASSERT(src.at(0) != '$' && src.at(src.size()-1) != '$' && + src.at(src.size()-2) != '$') << "illegal src:" << src; + vector patterns; + vector arg_id; + vector patterns2; + vector arg_id2; + unordered_map args; + parse_reg(src, patterns, arg_id); + parse_reg(dst, patterns2, arg_id2); + + int start_i, start_pos, end_i, end_pos; + int c_i = i, c_pos = 0; + int match_i, match_pos; + string c_arg; + + auto match = [&](int c_i, int c_pos, const string& pat) -> bool { + for (int i=0; i= tokens[c_i].size()) { + c_pos = 0; + c_i ++; + if (c_i >= tokens.size()) + return false; + } + } + match_i = c_i; + match_pos = c_pos; + return true; + }; + + for (int j=0; j ss{s}; + token_replace(ss, 0, src, dst); + return join(ss, ""); +} + } // jittor \ No newline at end of file diff --git a/python/jittor/src/utils/str_utils.h b/python/jittor/src/utils/str_utils.h index 0098728d..6a2251e0 100644 --- a/python/jittor/src/utils/str_utils.h +++ b/python/jittor/src/utils/str_utils.h @@ -27,4 +27,16 @@ vector split(const string& s, const string& sep, int max_split=0); string strip(const string& s); +string format(const string& s, const vector& v); + +string replace(const string& a, const string& b, const string& c); + +string join(const vector& vs, const string& x); + +vector token_split(const string& s); + +void token_replace(vector& tokens, int i, const string& src, const string& dst); + +string token_replace(const string& s, const string& src, const string& dst); + } // jittor \ No newline at end of file diff --git a/python/jittor/src/var.cc b/python/jittor/src/var.cc index 97a8e2a0..b4339575 100644 --- a/python/jittor/src/var.cc +++ b/python/jittor/src/var.cc @@ -14,7 +14,7 @@ namespace jittor { -int64_t Var::number_of_lived_vars = 0; +int64 Var::number_of_lived_vars = 0; DEFINE_FLAG(fast_shared_ptr, compile_options, {}, "Override the default loop transfrom options"); @@ -42,7 +42,7 @@ string Var::to_string() { return s; } -int64_t Var::numel() { +int64 Var::numel() { if (!shape.size()) return size=num=-1; bool negtive = 0; num=1; diff --git a/python/jittor/src/var.h b/python/jittor/src/var.h index 78e09aa8..941ef215 100644 --- a/python/jittor/src/var.h +++ b/python/jittor/src/var.h @@ -18,13 +18,13 @@ struct Var : Node { NanoVector shape; cstr name; fast_shared_ptr loop_options; - static int64_t number_of_lived_vars; + static int64 number_of_lived_vars; // this var will be generated after alloc. 
void* mem_ptr = nullptr; Allocator* allocator = nullptr; size_t allocation; - int64_t size, num; + int64 size, num; inline bool is_float() const { CHECK_EXIST; return ns.is_float(); } inline int dsize() const { CHECK_EXIST; return ns.dsize(); } inline NanoString dtype() const { CHECK_EXIST; return ns; } @@ -40,7 +40,7 @@ struct Var : Node { Var(NanoVector shape, NanoString dtype); string to_string(); - int64_t numel(); + int64 numel(); void set_shape(NanoVector shape); bool alloc(Allocator* allocator); inline void share_with(Var* x, size_t offset = 0) { CHECK_EXIST; allocator = (Allocator*)x; allocation = offset; } diff --git a/python/jittor/test/__main__.py b/python/jittor/test/__main__.py index 7b26a16c..3904fa48 100644 --- a/python/jittor/test/__main__.py +++ b/python/jittor/test/__main__.py @@ -15,6 +15,7 @@ if __name__ == "__main__": skip_l = int(os.environ.get("test_skip_l", "0")) skip_r = int(os.environ.get("test_skip_r", "1000000")) + skip = os.environ.get("test_skip", "").split(",") test_only = None if "test_only" in os.environ: test_only = set(os.environ.get("test_only").split(",")) @@ -34,6 +35,9 @@ if __name__ == "__main__": continue if test_only and test_name not in test_only: continue + for s in skip: + if s in test_name: + continue print("Add Test", _, test_name) suite.addTest(tests) diff --git a/python/jittor/test/misc/superglue.py b/python/jittor/test/misc/superglue.py new file mode 100644 index 00000000..44af14fd --- /dev/null +++ b/python/jittor/test/misc/superglue.py @@ -0,0 +1,374 @@ +from copy import deepcopy +from pathlib import Path +import jittor as jt +import jittor.nn as nn +import numpy as np +import os + +split_size = 1000000 + +conv_opt = int(os.environ.get("conv_opt", "0")) + +if conv_opt: + Conv1d_sp = nn.Conv1d_sp +else: + Conv1d_sp = nn.Conv1d + + +def MLP(channels: list, do_bn=True): + """ Multi-layer perceptron """ + n = len(channels) + layers = [] + for i in range(1, n): + layers.append(Conv1d_sp(channels[i - 1], channels[i], kernel_size=1, bias=True)) + if i < (n - 1): + if do_bn: + layers.append(nn.BatchNorm(channels[i])) + # layers.append(nn.InstanceNorm1d(channels[i])) + # layers.append(nn.LayerNorm(channels[i])) + layers.append(nn.ReLU()) + return nn.Sequential(*layers) + + +def normalize_keypoints(kpts, image_shape): + size = image_shape.flip(1) # shape=(b,2) ;h w -> w, h + center = size / 2 + scaling = size.float32().max(1, keepdims=True) * 0.7 + return (kpts - center[:, None, :]) / scaling[:, None, :] + + +class KeypointEncoder(nn.Module): + """ Joint encoding of visual appearance and location using MLPs""" + def __init__(self, feature_dim, layers, keypoint_position_dim=2): + super().__init__() + # self.keypoint_position_dim = keypoint_position_dim + self.encoder = MLP([keypoint_position_dim + 1] + layers + [feature_dim]) + nn.init.constant_(self.encoder[-1].bias, 0.0) + + def execute(self, kpts, scores): + inputs = jt.concat([kpts.t(), scores.unsqueeze(1)], dim=1) + return self.encoder(inputs) + +cnt = 0 + +def attention(query, key, value): + global cnt + cnt += 1 + b, d, h, n = query.shape + # print("attention", b,d,h,n, cnt) + dim_factor = (1.0 / d)**0.5 + query = query.transpose(0, 2, 3, 1).reshape(b * h, -1, d) * dim_factor + key = key.transpose(0, 2, 1, 3).reshape(b * h, d, -1) + value = value.transpose(0, 2, 3, 1).reshape(b * h, -1, d) + # print("attention", query.shape, key.shape, value.shape) + + data = [] + for i in range(0, query.shape[0], split_size): + end = min(i + split_size, query.shape[0]) + tmp1 = nn.bmm(query[i:end], 
key[i:end]) + tmp2 = nn.softmax(tmp1, dim=-1) + tmp3 = nn.bmm(tmp2, value[i:end]) + tmp3.sync() + data.append(tmp3) + tmp3 = jt.concat(data) + + # for i in range(0, query.shape[0], split_size): + # end = min(i + split_size, query.shape[0]) + # tmp1 = nn.bmm(query[:,i:end], key[:,i:end]) + # tmp2 = nn.softmax(tmp1, dim=-1) + # tmp3 = nn.bmm(tmp2, value[:,i:end]) + # tmp3.sync() + # data.append(tmp3) + # tmp3 = jt.concat(data, dim=1) + + # tmp1 = nn.bmm(query, key) + # print(tmp1.shape) + # tmp2 = nn.softmax(tmp1, dim=-1) + # print(tmp2.shape) + # tmp3 = nn.bmm(tmp2, value) + # print(tmp3.shape) + return tmp3.reshape(b, h, -1, d).transpose(0, 3, 1, 2) + return nn.bmm(nn.softmax(nn.bmm(query, key), dim=-1), value).reshape(b, h, -1, d).transpose(0, 3, 1, 2) + + +class MultiHeadedAttention(nn.Module): + """ Multi-head attention to increase model expressivitiy """ + def __init__(self, num_heads: int, d_model: int): + super().__init__() + assert d_model % num_heads == 0 + self.dim = d_model // num_heads + self.num_heads = num_heads + self.merge = Conv1d_sp(d_model, d_model, kernel_size=1) + self.proj = nn.ModuleList([deepcopy(self.merge) for _ in range(3)]) + + def execute(self, query, key, value): + batch_dim = query.size(0) + query, key, value = [l(x).reshape(batch_dim, self.dim, self.num_heads, -1) for l, x in zip(self.proj, (query, key, value))] + x = attention(query, key, value) + # x = attention_chunk(query, key, value) + return self.merge(x.reshape(batch_dim, self.dim * self.num_heads, -1)) + + +class AttentionalPropagation(nn.Module): + def __init__(self, feature_dim: int, num_heads: int): + super().__init__() + self.attn = MultiHeadedAttention(num_heads, feature_dim) + self.mlp = MLP([feature_dim * 2, feature_dim * 2, feature_dim]) + nn.init.constant_(self.mlp[-1].bias, 0.0) + + def execute(self, x, source): + message = self.attn(x, source, source) + return self.mlp(jt.concat([x, message], dim=1)) + + +class AttentionalGNN(nn.Module): + def __init__(self, feature_dim: int, layer_names: list): + super().__init__() + self.layers = nn.ModuleList([AttentionalPropagation(feature_dim, 4) for _ in range(len(layer_names))]) + self.is_cross = [x == 'cross' for x in layer_names] + + def execute(self, desc0, desc1): + for layer, is_cross in zip(self.layers, self.is_cross): + layer.attn.prob = [] + if is_cross: + src0, src1 = desc1, desc0 + else: # if name == 'self': + src0, src1 = desc0, desc1 + # delta0, delta1 = layer(desc0, src0), layer(desc1, src1) + + delta0 = layer(desc0, src0) + # print(delta0.numel()*4) + # breakpoint() + jt.sync_all() + delta1 = layer(desc1, src1) + jt.sync_all() + desc0, desc1 = (desc0 + delta0), (desc1 + delta1) + jt.sync_all() + return desc0, desc1 + + +def log_sinkhorn_iterations(Z, log_mu, log_nu, iters: int): + """ Perform Sinkhorn Normalization in Log-space for stability""" + u, v = jt.zeros_like(log_mu), jt.zeros_like(log_nu) + for _ in range(iters): + u = log_mu - (Z + v.unsqueeze(1)).exp().sum(dim=2).log() + v = log_nu - (Z + u.unsqueeze(2)).exp().sum(dim=1).log() + return Z + u.unsqueeze(2) + v.unsqueeze(1) + + +def log_optimal_transport(scores, alpha, iters: int): + """ Perform Differentiable Optimal Transport in Log-space for stability""" + b, m, n = scores.shape + ms, ns = jt.float(m, requires_grad=False), jt.float(n, requires_grad=False) + + bins0 = alpha.broadcast([b, m, 1]) + bins1 = alpha.broadcast([b, 1, n]) + alpha = alpha.broadcast([b, 1, 1]) + + couplings = jt.concat([jt.concat([scores, bins0], -1), jt.concat([bins1, alpha], -1)], 1) + + norm = -(ms + 
ns).log() + log_mu = jt.concat([norm.broadcast([m]), ns.log() + norm]) + log_nu = jt.concat([norm.broadcast([n]), ms.log() + norm]) + log_mu, log_nu = log_mu[None].broadcast([b, m + 1]), log_nu[None].broadcast([b, n + 1]) + + Z = log_sinkhorn_iterations(couplings, log_mu, log_nu, iters) + Z = Z - norm # multiply probabilities by M+N + return Z + + +def arange_like(x, dim: int): + return jt.ones(x.shape[dim], dtype=x.dtype)[None].cumsum()[0] - 1 # traceable in 1.1 + + +default_config = { + 'descriptor_dim': 256, # SuperPoint + 'weights': 'indoor', + 'keypoint_encoder': [32, 64, 128, 256], # SuperPoint + 'GNN_layers': ['self', 'cross'] * 9, + 'sinkhorn_iterations': 100, + 'match_threshold': 0.2, +} + + +def get_weighted_loss_batch(scores, all_matches): + matches0, matches1 = all_matches.chunk(chunks=2, dim=2) + batchIdx = jt.arange(all_matches.shape[0]).unsqueeze(1).repeat(1, all_matches.shape[1]) + batchIdx, matches0, matches1 = batchIdx.view(-1), matches0.view(-1), matches1.view(-1) + valid_index0, valid_index1 = matches0 >= 0, matches1 >= 0 + valid_match = jt.logical_and(valid_index0, valid_index1) + valid_unmatch = jt.logical_xor(valid_index0, valid_index1) + num_match = valid_match.sum().maximum(1e-9) + num_unmatch = valid_unmatch.sum().maximum(1e-9) + + + + score_ = scores[batchIdx, matches0, matches1] + score_match_ = (score_*valid_match).float32().sum() / num_match + score_umatch_ = (score_*valid_unmatch).float32().sum() / num_unmatch + return -(num_unmatch * score_match_ + num_match * score_umatch_) / (num_match + num_unmatch) + # print(score_umatch_, score_match_) + # return -(score_match + score_umatch) / (num_match + num_unmatch) + + score_match = scores[(batchIdx[valid_match], matches0[valid_match], matches1[valid_match])].float32().mean() if num_match > 0 else 0 + score_umatch = scores[(batchIdx[valid_unmatch], matches0[valid_unmatch], matches1[valid_unmatch])].float32().mean() if num_unmatch > 0 else 0 + # print(score_match, score_umatch) + return -(num_unmatch * score_match + num_match * score_umatch) / (num_match + num_unmatch) + + +def add_dustbin(scores, alpha): + b, m, n = scores.shape + bins0 = jt.broadcast(alpha, (b, m, 1)) + bins1 = jt.broadcast(alpha, (b, 1, n)) + alpha = jt.broadcast(alpha, (b, 1, 1)) + couplings = jt.concat([jt.concat([scores, bins0], -1), jt.concat([bins1, alpha], -1)], 1) + return couplings + + +class SuperGlue(nn.Module): + def __init__(self, config): + super().__init__() + config = {**default_config, **config} + self.descriptor_dim = config['descriptor_dim'] + self.keypoint_encoder = config['keypoint_encoder'] + self.GNN_layers = config['GNN_layers'] + self.sinkhorn_iterations = config['sinkhorn_iterations'] + self.match_threshold = config['match_threshold'] + self.keypoint_position_dim = config['keypoint_position_dim'] + self.use_dual_softmax = config['use_dual_softmax'] + self.scale = jt.float(self.descriptor_dim**-0.5).stop_grad() + # self.scale.requires_grad = False + + # self.des_extend = MLP([128, 256]) + + self.kenc = KeypointEncoder(self.descriptor_dim, self.keypoint_encoder, keypoint_position_dim=self.keypoint_position_dim) + + self.gnn = AttentionalGNN(self.descriptor_dim, self.GNN_layers) + + self.final_proj = Conv1d_sp(self.descriptor_dim, self.descriptor_dim, kernel_size=1, bias=True) + + self.bin_score = jt.float(1.0) + + def execute(self, data): + """Run SuperGlue on a pair of keypoints and descriptors""" + + kpts0, kpts1 = data['keypoints0'], data['keypoints1'] + desc0, desc1 = data['descriptors0'], data['descriptors1'] + 
all_matches = data['all_matches'] + # match_num = data['match_num'] + + if kpts0.shape[1] == 0 or kpts1.shape[1] == 0 or all_matches.shape[1] == 0: # no keypoints or no matches/unmatches + shape0, shape1 = kpts0.shape[:-1], kpts1.shape[:-1] + return { + 'matches0': jt.ones(shape0, dtype=jt.int), + 'matches1': jt.ones(shape1, dtype=jt.int), + 'matching_scores0': jt.zeros(shape0, dtype=jt.float), + 'matching_scores1': jt.zeros(shape1, dtype=jt.float), + 'skip_train': True + } + + # Keypoint normalization. + kpts0 = normalize_keypoints(kpts0, data['shape0']) + kpts1 = normalize_keypoints(kpts1, data['shape1']) + + # Keypoint MLP encoder. + # desc0 = self.des_extend(desc0) + self.kenc(kpts0, data['scores0']) + # desc1 = self.des_extend(desc1) + self.kenc(kpts1, data['scores1']) + desc0 = desc0 + self.kenc(kpts0, data['scores0']) + desc1 = desc1 + self.kenc(kpts1, data['scores1']) + + # Multi-layer Transformer network. + desc0, desc1 = self.gnn(desc0, desc1) + + # Final MLP projection. + desc0, desc1 = self.final_proj(desc0), self.final_proj(desc1) + desc0_t = desc0.t() + losses = [] + + for i in range(0, desc1.shape[0], split_size): + end = min(desc1.shape[0], i + split_size) + + # Compute matching descriptor distance. + scores = nn.bmm(desc0_t[i:end], desc1[i:end]) * self.scale # 457.76 MB + scores.sync() + + # Run the optimal transport. + if self.use_dual_softmax: + scores = add_dustbin(scores, self.bin_score) # 458.68 MB + scores.sync() + dual_softmax0, dual_softmax1 = nn.log_softmax(scores, 1), nn.log_softmax(scores, 2) + scores = dual_softmax0 + dual_softmax1 # 458.22 MB + scores.sync() + else: + scores = log_optimal_transport(scores, self.bin_score, iters=self.config['sinkhorn_iterations']) + + # loss = torch.stack([get_match_score(scores[b], all_matches[b]) for b in range(all_matches.shape[0])]) + + loss = get_weighted_loss_batch(scores, all_matches[i:end]) + loss.sync() + losses.append(loss) + loss = jt.concat(losses) + ''' + # Compute matching descriptor distance. + scores = nn.bmm(desc0.t(), desc1) * self.scale # 457.76 MB + scores.sync() + + # Run the optimal transport. + if self.use_dual_softmax: + scores = add_dustbin(scores, self.bin_score) # 458.68 MB + scores.sync() + dual_softmax0, dual_softmax1 = nn.log_softmax(scores, 1), nn.log_softmax(scores, 2) + scores = dual_softmax0 + dual_softmax1 # 458.22 MB + scores.sync() + else: + scores = log_optimal_transport(scores, self.bin_score, iters=self.config['sinkhorn_iterations']) + + # loss = torch.stack([get_match_score(scores[b], all_matches[b]) for b in range(all_matches.shape[0])]) + + loss = get_weighted_loss_batch(scores, all_matches) + # print(scores.shape, all_matches.shape, loss.shape) + ''' + + # matches0, matches1 = all_matches.chunk(chunks=2, dim=2) + # batchIdx = jt.arange(0, b).unsqueeze(1).repeat(1, num) + # batchIdx, matches0, matches1 = batchIdx.view(-1), matches0.view(-1), matches1.view(-1) + # validmatch = (matches0 >= 0) | (matches1 >= 0) + # batchIdx, matches0, matches1 = batchIdx[validmatch], matches0[validmatch], matches1[validmatch] + # matches0[matches0 == -1] = n + # matches1[matches1 == -1] = m + # loss_mean = -scores[(batchIdx, matches0, matches1)].mean() + # loss_mean = nn.l1_loss(loss_mean, jt.float(0.0)) + + if not data['return_match']: + return {'loss': loss} + + with jt.no_grad(): + b, n, m = scores.shape + # Get the matches with score above "match_threshold". 
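When use_dual_softmax is set, the score matrix is padded with a learnable dustbin row and column (add_dustbin) and the final log-score of a pair is the sum of a column-wise and a row-wise log-softmax. The NumPy sketch below mirrors that construction with a fixed scalar dustbin logit; dual_softmax_scores and log_softmax are illustrative names.

import numpy as np

def log_softmax(x, axis):
    x = x - x.max(axis=axis, keepdims=True)          # subtract the max for numerical stability
    return x - np.log(np.exp(x).sum(axis=axis, keepdims=True))

def dual_softmax_scores(scores, bin_score):
    # scores: (b, m, n) descriptor similarities; bin_score: scalar dustbin logit.
    b, m, n = scores.shape
    bins0 = np.full((b, m, 1), bin_score)            # dustbin column for unmatched rows
    bins1 = np.full((b, 1, n), bin_score)            # dustbin row for unmatched columns
    corner = np.full((b, 1, 1), bin_score)
    padded = np.concatenate([np.concatenate([scores, bins0], -1),
                             np.concatenate([bins1, corner], -1)], 1)   # (b, m+1, n+1)
    return log_softmax(padded, 1) + log_softmax(padded, 2)

s = np.random.default_rng(0).normal(size=(2, 4, 5)).astype(np.float32)
print(dual_softmax_scores(s, bin_score=1.0).shape)   # (2, 5, 6): one extra dustbin row and column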
+ indices0, max0 = scores[:, :-1, :-1].argmax(2) + indices1, max1 = scores[:, :-1, :-1].argmax(1) + mutual0 = jt.arange(0, n)[None] == indices1.gather(1, indices0) + mutual1 = jt.arange(0, m)[None] == indices0.gather(1, indices1) + # zero = scores.new_tensor(0) + # mscores0 = torch.where(mutual0, max0.values.exp(), zero) + mscores0 = max0.exp() + mscores0[mutual0.logical_not()] = 0 + # mscores1 = torch.where(mutual1, mscores0.gather(1, indices1), zero) + mscores1 = mscores0.gather(1, indices1) + mscores1[mutual1.logical_not()] = 0 + valid0 = mutual0 & (mscores0 > self.match_threshold) + valid1 = mutual1 & valid0.gather(1, indices1) + # indices0 = torch.where(valid0, indices0, indices0.new_tensor(-1)) + # indices1 = torch.where(valid1, indices1, indices1.new_tensor(-1)) + indices0[valid0.logical_not()] = -1 + indices1[valid1.logical_not()] = -1 + + return { + 'matches0': indices0, # use -1 for invalid match + 'matches1': indices1, # use -1 for invalid match + 'matching_scores0': mscores0, + 'matching_scores1': mscores1, + 'loss': loss, + } + + # scores big value or small value means confidence? log can't take neg value \ No newline at end of file diff --git a/python/jittor/test/perf/perf.py b/python/jittor/test/perf/perf.py index ef190017..93e77ef1 100644 --- a/python/jittor/test/perf/perf.py +++ b/python/jittor/test/perf/perf.py @@ -4,8 +4,8 @@ suffix = "" import jittor as jt import time -from pathlib import Path -home_path = str(Path.home()) +import jittor_utils as jit_utils +home_path = jit_utils.home() perf_path = os.path.join(home_path, ".cache", "jittor_perf") def main(): diff --git a/python/jittor/test/test_acl.py b/python/jittor/test/test_acl.py new file mode 100644 index 00000000..62cb147e --- /dev/null +++ b/python/jittor/test/test_acl.py @@ -0,0 +1,31 @@ +# *************************************************************** +# Copyright (c) 2021 Jittor. All Rights Reserved. +# Maintainers: Dun Liang . +# This file is subject to the terms and conditions defined in +# file 'LICENSE.txt', which is part of this source code package. +# *************************************************************** +import unittest +import jittor as jt +from .test_core import expect_error +import numpy as np + +@unittest.skipIf(not jt.compiler.has_acl, "No ACL found") +class TestACL(unittest.TestCase): + + @jt.flag_scope(use_acl=1) + def test_array(self): + print("use_acl", jt.flags.use_acl) + a = jt.array([1,2,3]) + np.testing.assert_allclose(a.numpy(), [1,2,3]) + + @jt.flag_scope(use_acl=1) + def test_add(self): + a = jt.array([1,2,3]) + b = a+a + np.testing.assert_allclose(b.numpy(), [2,4,6]) + + def test_meminfo(self): + jt.display_memory_info() + +if __name__ == "__main__": + unittest.main() diff --git a/python/jittor/test/test_benchmark.py b/python/jittor/test/test_benchmark.py new file mode 100644 index 00000000..2e7db8b7 --- /dev/null +++ b/python/jittor/test/test_benchmark.py @@ -0,0 +1,344 @@ +# *************************************************************** +# Copyright (c) 2021 Jittor. All Rights Reserved. +# Maintainers: Dun Liang . +# This file is subject to the terms and conditions defined in +# file 'LICENSE.txt', which is part of this source code package. 
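The inference block above reads matches off the non-dustbin score block by taking row-wise and column-wise argmax and keeping only mutually consistent pairs whose score clears match_threshold. A simplified one-direction NumPy version (probabilities passed in directly rather than exponentiated log-scores; extract_matches is a hypothetical name) looks like this:

import numpy as np

def extract_matches(scores, threshold=0.2):
    # scores: (m, n) matching probabilities for one pair, dustbin already removed.
    idx0 = scores.argmax(axis=1)                          # best column for every row keypoint
    idx1 = scores.argmax(axis=0)                          # best row for every column keypoint
    mutual0 = idx1[idx0] == np.arange(scores.shape[0])    # row i's pick must pick row i back
    ms0 = scores[np.arange(scores.shape[0]), idx0]
    valid0 = mutual0 & (ms0 > threshold)
    matches0 = np.where(valid0, idx0, -1)                 # -1 marks an unmatched keypoint
    return matches0, ms0 * valid0

scores = np.array([[0.9, 0.05, 0.05],
                   [0.1, 0.7,  0.2 ],
                   [0.3, 0.4,  0.3 ]])
print(extract_matches(scores))   # keypoints 0 and 1 match; keypoint 2 fails the mutual check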
+# *************************************************************** +import unittest +import jittor as jt +import numpy as np +import os + +n = 400000000 +# n = 4000000 +n = 7680000 + +def get_mem_band(): + a = jt.rand((n)).float32() + for i in range(100): + a.copy().sync() + jt.sync_all(True) + import time + t = time.time() + for i in range(1000): + a.copy().sync() + jt.sync_all(True) + dt = time.time() - t + band = a.numel() * 4 * 2000 / dt / 1024**3 + print("Mem band: ", band) + return band + +def check_simple_add_band(): + # copy: 816 + # S=1 128,1024, ILP=1 634 + # S=0 128,1024, ILP=1 734 + # S=0 128,512, ILP=1 716 + # S=0 64,1024, ILP=1 706 + # S=0 256,1024, ILP=1 706 + def test(S=0, B=128, T=1024, ILP=1): + a = jt.rand((n)).float32() + jt.sync_all(True) + jt.flags.log_silent = 1 + with jt.profile_scope(100, 1000) as rep: + b = jt.code(a.shape, a.dtype, [a], + cuda_header="#include \"type/fp16_compute.h\"", + cuda_src=f""" + __global__ void kernel(in0_type * __restrict__ a, in0_type* __restrict__ b, int num) {{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int tnum = blockDim.x * gridDim.x; + #define ILP {ILP} + for (int i=tid*ILP; i(b+i, a+i); + {"__syncthreads();" if S else ""} + }} + }} + kernel<<<{B},{T}>>>(in0_p, out0_p, in0->num); + """) + b.sync() + bw = float(rep[-1][9]) / 1024**3 + s = f"S={S}, B={B}, T={T}, ILP={ILP} BW={bw}" + print(s) + return s, bw + + def test2(S=0, B=128, T=1024, ILP=1): + a = jt.rand((n)).float32() + jt.sync_all(True) + # jt.flags.log_silent = 0 + with jt.profile_scope(10, 1000) as rep: + b = jt.code(a.shape, a.dtype, [a], + cuda_header="#include \"type/fp16_compute.h\"", + cuda_src=f""" + __global__ void kernel(float2 * __restrict__ a, float2* __restrict__ b, int num) {{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int tnum = blockDim.x * gridDim.x; + #define ILP 1 + for (int i=tid*ILP; i(b+i, a+i); + {"__syncthreads();" if S else ""} + }} + }} + kernel<<<{B},{T}>>>((float2*)in0_p, (float2*)out0_p, in0->num/2); + """) + b.sync() + bw = float(rep[-1][9]) / 1024**3 + s = f"T2: S={S}, B={B}, T={T}, ILP={ILP} BW={bw}" + print(s) + return s, bw + + + def test3(S=0, B=128, T=1024, ILP=1, C=0): + a = jt.rand((n)).float32() + b = jt.rand(B) + jt.sync_all(True) + jt.flags.log_silent = 1 + with jt.profile_scope(100, 1000) as rep: + b = jt.code(a.shape, a.dtype, [a, b], + cuda_header="#include \"type/fp16_compute.h\"", + cuda_src=f""" + __global__ void kernel(in0_type * __restrict__ a, in0_type* __restrict__ b, int num) {{ + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int tnum = blockDim.x * gridDim.x; + #define ILP {ILP} + for (int i=tid*ILP; i(b+i, a+i); + {"__syncthreads();" if S else ""} + }} + {"__syncthreads();" if C else ""} + }} + kernel<<shape[0],{T}>>>(in0_p, out0_p, in0->num); + """) + b.compile_options = {"FLAGS: -Xptxas -dlcm=ca ": C} + # b.compile_options = {"FLAGS: –Xptxas –dlcm=ca ": 1} + b.sync() + + bw = float(rep[-1][9]) / 1024**3 + s = f"T3: S={S}, B={B}, T={T}, ILP={ILP} C={C} BW={bw}" + print(s) + return s, bw + + + def test4(S=0, B=128, T=1024, ILP=1, C=0, name="b.png"): + a = jt.rand((n)).float32() + b = jt.rand(B*4).uint32() + jt.sync_all(True) + # jt.flags.log_silent = 1 + with jt.profile_scope(100, 10000) as rep: + _ = jt.code(a.shape, a.dtype, [a, b], + cuda_header="#include \"type/fp16_compute.h\"", + cuda_src=f""" + __device__ uint get_smid(void) {{ + uint ret; + asm("mov.u32 %0, %smid;" : "=r"(ret) ); + return ret; + }} + __device__ uint get_time(void) {{ + uint ret; + asm volatile("mov.u32 %0, 
%%globaltimer_lo;" : "=r"(ret)); + return ret; + }} + + __global__ void kernel(in0_type * __restrict__ a, in0_type* __restrict__ b, int num, in1_type* __restrict__ c) {{ + uint t = get_time(); + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int tnum = blockDim.x * gridDim.x; + #define ILP {ILP} + for (int i=tid*ILP; i(b+i, a+i); + {"__syncthreads();" if S else ""} + }} + {"__syncthreads();" if C else ""} + if (threadIdx.x == 0) + ((uint4* __restrict__)c)[blockIdx.x] = + uint4{{get_smid(), t, get_time(), 0}}; + }} + kernel<<shape[0]/4,{T}>>>(in0_p, out0_p, in0->num, in1_p); + """) + _.compile_options = {"FLAGS: -Xptxas -dlcm=ca ": C} + # b.compile_options = {"FLAGS: –Xptxas –dlcm=ca ": 1} + _.sync() + + bw = float(rep[-1][9]) / 1024**3 + b = b.data.reshape(-1, 4)[:,:3] + mint = b[:,1].min() + b[:,1:] -= mint + smmax = int(b[:,0].max()) + smmin = int(b[:,0].min()) + maxt = b.max() + + # print(b) + + s = f"T4: S={S}, B={B}, T={T}, ILP={ILP} C={C} BW={bw:.3f} sm={smmin},{smmax} maxt={maxt}" + print(s) + import pylab as pl + pl.figure(figsize=(16,16)) + texts = [] + pret = np.zeros(200, dtype="uint32") + for i in range(B): + smid, s, t = b[i] + pl.plot([s,t], [smid, smid], 'ro-') + texts.append((s, smid, i)) + texts.append((t, smid, i)) + + texts = sorted(texts) + for (s, smid, bid) in texts: + cpos = max(pret[smid], s) + pl.text(cpos, smid, str(bid)) + pret[smid] = cpos + maxt // 30 + + + # print("???") + # adjust_text(texts, arrowprops=dict(arrowstyle='->', color='blue')) + # print("???") + pl.savefig(name) + pl.close() + return s, bw + # test(S=0, B=128, T=1024, ILP=1) + # test(S=1, B=128, T=1024, ILP=1) + # test(S=0, B=64, T=1024, ILP=1) + # test(S=0, B=256, T=1024, ILP=1) + # test(S=1, B=128, T=512, ILP=1) + # test(S=1, B=128, T=256, ILP=1) + + # test(S=0, B=128, T=1024, ILP=2) + # test(S=0, B=128, T=1024, ILP=4) + # test(S=0, B=128, T=512, ILP=2) + # test(S=0, B=128, T=512, ILP=4) + + # test(S=1, B=128, T=1024, ILP=2) + # test(S=1, B=128, T=1024, ILP=4) + # test(S=1, B=128, T=1024, ILP=8) + # test(S=1, B=128, T=1024, ILP=16) + # test(S=1, B=128, T=512, ILP=2) + # test(S=1, B=128, T=512, ILP=4) + + # test(S=1, B=256, T=1024, ILP=2) + # test(S=1, B=512, T=1024, ILP=2) + # test(S=1, B=256, T=1024, ILP=4) + # test(S=1, B=256, T=1024, ILP=8) + # test(S=1, B=256, T=1024, ILP=16) + # test(S=1, B=256, T=512, ILP=2) + # test(S=1, B=256, T=512, ILP=4) + + # test(S=1, B=128, T=256, ILP=2) + # test(S=1, B=128, T=256, ILP=4) + # test(S=0, B=128, T=256, ILP=2) + # test(S=0, B=128, T=256, ILP=4) + + # for b in [1, 2, 4, 8, 16, 32, 64, 128,256]: + # test(S=1, B=b, T=512, ILP=2) + + import matplotlib as mpl + mpl.use('Agg') + import pylab as pl + import numpy as np + + # test4(S=1, B=82, T=1024, ILP=2, C=0, name="b.png") + # test4(S=1, B=83, T=1024, ILP=2, C=0, name="c.png") + # test4(S=1, B=82*3, T=512, ILP=2, C=0, name="d1.png") + # test4(S=1, B=82*3+1, T=512, ILP=2, C=0, name="d2.png") + # test4(S=1, B=82*6+1, T=512, ILP=2, C=0, name="d3.png") + # test4(S=0, B=82*6+1, T=512, ILP=2, C=0, name="d4.png") + + for b in range(70, 83): + test4(S=1, B=b, T=1024, ILP=2, C=0, name=f"b-{b}.png") + + # data = [] + # for b in range(32, 2000, 8): + # _, bw = test3(S=0, B=b, T=32, ILP=2) + # data.append([b, bw]) + # data = np.array(data) + # pl.plot(data[:,0], data[:,1]) + + # for t in [32, 64, 128, 256, 512, 1024]: + # data = [] + # for b in range(32, 2000, 8): + # _, bw = test3(S=1, B=b*(1024//t), T=t, ILP=2) + # data.append([b, bw]) + # data = np.array(data) + # pl.plot(data[:,0], data[:,1]) + + # for t in 
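get_mem_band above reports an effective bandwidth by charging one read plus one write of the float32 buffer to every copy, that is numel * 4 bytes * 2 per iteration, divided by the measured wall time. A condensed, reusable form of that accounting is sketched below; the helper name is illustrative and a CUDA build is assumed.

import time
import jittor as jt

def effective_bandwidth_gb_s(numel=7_680_000, iters=1000):
    # Each copy reads and writes the whole float32 buffer once.
    a = jt.rand((numel,)).float32()
    for _ in range(10):                       # warm-up: compile the kernel, settle clocks
        a.copy().sync()
    jt.sync_all(True)
    t0 = time.time()
    for _ in range(iters):
        a.copy().sync()
    jt.sync_all(True)
    return numel * 4 * 2 * iters / (time.time() - t0) / 1024**3

if jt.has_cuda:
    jt.flags.use_cuda = 1
    print("approx copy bandwidth:", effective_bandwidth_gb_s(), "GB/s")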
[1024]: + # for c in [0,1]: + # data = [] + # # for b in range(32, 1000, 8): + # for b in range(32, 33, 8): + # _, bw = test3(S=c, B=b*(1024//t), T=t, ILP=2, C=0) + # data.append([b, bw]) + # data = np.array(data) + # pl.plot(data[:,0], data[:,1]) + + # for ilp in [2]: + # for s in [1]: + # for t in [1024,512,256,128]: + # data = [] + # for b in range(32, 1100, 8): + # _, bw = test3(S=s, B=b*(1024//t), T=t, ILP=ilp) + # data.append([b, bw]) + # data = np.array(data) + # pl.plot(data[:,0], data[:,1]) + + # pl.savefig("a.png") + # pl.close() + # for b in range(80, 90, 1): + # _, bw = test3(S=1, B=b, T=1024, ILP=2) + # # 82 + # for b in range(240, 260, 1): + # _, bw = test3(S=1, B=b, T=512, ILP=2) + # # 82*3 = 246 + # for b in range(240, 500, 1): + # _, bw = test3(S=1, B=b, T=256, ILP=2) + # # 492 = 82*6 + # for b in range(240, 1000, 1): + # _, bw = test3(S=1, B=b, T=128, ILP=2) + # # 984 = 82*12 + + + # for b in [128,256]: + # test(S=1, B=b, T=1024, ILP=2) + # for b in [128,256]: + # test(S=0, B=b, T=512, ILP=2) + # for b in [128,256]: + # test(S=0, B=b, T=1024, ILP=2) + # for b in [128,256]: + # test(S=1, B=b, T=512, ILP=1) + # for b in [128,256]: + # test(S=1, B=b, T=1024, ILP=1) + # for b in [128,256]: + # test(S=0, B=b, T=512, ILP=1) + # for b in [128,256]: + # test(S=0, B=b, T=1024, ILP=1) + # test(S=1, B=128, T=512, ILP=4) + # test(S=1, B=64, T=512, ILP=2) + # test(S=1, B=80, T=512, ILP=2) + # test(S=1, B=100, T=512, ILP=2) + # test(S=1, B=110, T=512, ILP=2) + # test(S=1, B=115, T=512, ILP=2) + # test(S=1, B=120, T=512, ILP=2) + # test(S=1, B=130, T=512, ILP=2) + # test(S=1, B=140, T=512, ILP=2) + # test2(S=1, B=128, T=512, ILP=2) + # test(S=1, B=128, T=256, ILP=4) + # test(S=1, B=128, T=128, ILP=8) + # test(S=1, B=128, T=64, ILP=16) + + + +@unittest.skipIf(not jt.compiler.has_cuda, "No CUDA found") +class TestBenchmarkCUDA(unittest.TestCase): + def setUp(self): + jt.flags.use_cuda = 1 + def tearDown(self): + jt.flags.use_cuda = 0 + + def test_main(self): + return + get_mem_band() + check_simple_add_band() + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/python/jittor/test/test_binary_op.py b/python/jittor/test/test_binary_op.py index 6eec8c0f..1947fef8 100644 --- a/python/jittor/test/test_binary_op.py +++ b/python/jittor/test/test_binary_op.py @@ -19,12 +19,12 @@ def all_eq(x, y): y = convert(y) if str(x.dtype).startswith("float"): return str(y.dtype).startswith("float") and x.shape == y.shape and (x==y).all() - return x.dtype == y.dtype and x.shape == y.shape and (x==y).all() + return x.dtype == y.dtype and x.shape == y.shape and np.testing.assert_allclose(x, y) def check(op, *args): x = eval(f"np.{op}(*args)") y = eval(f"jt.{op}(*args).data") - assert all_eq(x, y), f"{x}\n{y}" + all_eq(x, y) class TestBinaryOp(unittest.TestCase): def test_binary_op(self): @@ -47,6 +47,9 @@ class TestBinaryOp(unittest.TestCase): def test_i(self): def check(op, a, b): + if isinstance(a, list): + a = np.array(a) + b = np.array(b) if jt.flags.use_cuda and op == "@": return if op=="@": @@ -65,13 +68,13 @@ class TestBinaryOp(unittest.TestCase): a = np.float32(a) ja = np.float32(ja) - assert all_eq(ja, a), (ja,a) + all_eq(ja, a) check("+", 5, 2) check("-", 5, 2) check("*", 5, 2) check("/", 5, 2) check("//", 5, 2) - check("@", [[5]], [[2]]) + # check("@", [[5]], [[2]]) check("%", 5, 2) check("**", 5, 2) check("<<", 5, 2) @@ -80,6 +83,15 @@ class TestBinaryOp(unittest.TestCase): check("^", 5, 2) check("|", 5, 2) + check("+", [5.0,6.0], [2.0,3.0]) + check("-", 
[5.0,6.0], [2.0,3.0]) + check("*", [5.0,6.0], [2.0,3.0]) + check("/", [5.0,6.0], [2.0,3.0]) + check("//", [5.0,6.0], [2.0,3.0]) + check("@", [[5,6],[7,8]], [[2,3],[4,5]]) + check("%", [5.0,6.0], [2.0,3.0]) + check("**", [5.0,6.0], [2.0,3.0]) + def test_r(self): def check(op, a, b): a = np.array(a) @@ -97,7 +109,7 @@ class TestBinaryOp(unittest.TestCase): a = eval(f"a {op} b") a = np.array(a) - assert all_eq(jc, a), f"\n{jc}\n{a}" + all_eq(jc, a) check("+", 5, 2) check("-", 5, 2) check("*", 5, 2) @@ -118,6 +130,7 @@ class TestBinaryOp(unittest.TestCase): a = np.random.rand(10) b = np.random.rand(10) c = np.random.rand(10) + tol = 1e-2 if jt.flags.amp_reg & 2 else 1e-4 for op in ops: func = lambda x: eval(f"((x[0]{op}x[1])*x[2]).sum()") x, grads = ngrad(func, [a,b,c], 1e-8) @@ -127,7 +140,7 @@ class TestBinaryOp(unittest.TestCase): jx = eval(f"(ja{op}jb)*jc") jgrads = jt.grad(jx, [ja,jb,jc]) for jd, nd in zip(jgrads, grads): - assert (np.abs(jd.data-nd)<1e-4).all(), f"\n{jd.data}\n{nd}" + np.testing.assert_allclose(jd.data, nd, atol=tol, rtol=tol) def test_mod_float(self): a = jt.random((10,)) @@ -137,7 +150,8 @@ class TestBinaryOp(unittest.TestCase): a = jt.random((10,), 'float64') b = jt.random((10,), 'float64') c = a % b - assert np.allclose(c.data, a.data % b.data) + assert np.allclose(c.data, a.data % b.data, a.data, b.data) + if jt.flags.amp_reg & 2: return a = jt.random((10,)) * 1000 b = (jt.random((10,)) * 10).int() + 1 c = a % b @@ -169,5 +183,19 @@ class TestBinaryOp(unittest.TestCase): class TestBinaryOpCuda(TestBinaryOp, test_cuda(2)): pass +class TestBinaryOpCpuFp16(TestBinaryOp): + def setUp(self): + jt.flags.amp_reg = 2 | 4 | 8 | 16 + def tearDown(self): + jt.flags.amp_reg = 0 + +class TestBinaryOpCudaFp16(TestBinaryOp): + def setUp(self): + jt.flags.amp_reg = 2 | 4 | 8 | 16 + jt.flags.use_cuda = 1 + def tearDown(self): + jt.flags.amp_reg = 0 + jt.flags.use_cuda = 0 + if __name__ == "__main__": unittest.main() \ No newline at end of file diff --git a/python/jittor/test/test_core.py b/python/jittor/test/test_core.py index f1fc1b63..fd7d888e 100644 --- a/python/jittor/test/test_core.py +++ b/python/jittor/test/test_core.py @@ -106,5 +106,18 @@ class TestCore(unittest.TestCase): a.y = 2 assert a.y == 2 + def test_modules(self): + a = jt.Module() + a.x = jt.Module() + a.y = jt.Module() + a.a = jt.array([1,2,3]) + a.b = jt.array([1,2,3]) + assert list(a._modules.keys()) == ["x", "y"] + assert a._modules['x'] is a.x + assert a._modules['y'] is a.y + assert list(a._parameters.keys()) == ['a', 'b'] + assert a._parameters['a'] is a.a + assert a._parameters['b'] is a.b + if __name__ == "__main__": unittest.main() \ No newline at end of file diff --git a/python/jittor/test/test_cuda.py b/python/jittor/test/test_cuda.py index 4c28436c..16a16546 100644 --- a/python/jittor/test/test_cuda.py +++ b/python/jittor/test/test_cuda.py @@ -101,6 +101,12 @@ class TestCuda(unittest.TestCase): assert a.shape == [3,4,5] and a.dtype == 'float' assert (-na.flatten() == range(3*4*5)).all(), na + def test_cuda_fused_op(self): + a = jt.array([1,2,3]) + a.sync() + with jt.flag_scope(use_cuda=1): + ((a+a)*2).data + @unittest.skipIf(jt.compiler.has_cuda, "Only test without CUDA") class TestNoCuda(unittest.TestCase): diff --git a/python/jittor/test/test_cudnn_op.py b/python/jittor/test/test_cudnn_op.py index 12c674a9..fcf46a9a 100644 --- a/python/jittor/test/test_cudnn_op.py +++ b/python/jittor/test/test_cudnn_op.py @@ -123,8 +123,8 @@ class TestCudnnConvOp(unittest.TestCase): logs = find_log_with_re(raw_log, 
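The Fp16 test variants above rerun the whole binary-op suite with amp_reg = 2 | 4 | 8 | 16: bit 2 prefers float16, bits 4 and 8 keep reductions and white-listed ops in fp16, and bit 16 extends the policy to array/random creation, as exercised by the fp16 tests later in this diff. A minimal standalone sketch of running one expression under the same flags, with the looser tolerance the tests switch to:

import numpy as np
import jittor as jt

with jt.flag_scope(amp_reg=2 | 4 | 8 | 16):
    a = jt.random((10,))
    b = jt.random((10,))
    c = a * b + a
    assert c.dtype == "float16", c.dtype      # creation and arithmetic stay in fp16 here
    ref = a.data.astype("float32") * b.data.astype("float32") + a.data.astype("float32")
    # fp16 arithmetic needs the looser tolerance used by the fp16 tests above
    np.testing.assert_allclose(c.data.astype("float32"), ref, rtol=1e-2, atol=1e-2)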
"(Jit op key (not )?found: cudnn_conv.*)") assert len(logs)==3 and "oihw" in logs[0][0], logs assert np.allclose(y.data, cy.data) - np.testing.assert_allclose(dx.data, cdx.data, atol=1e-2) - np.testing.assert_allclose(dw.data, cdw.data, atol=1e-2) + np.testing.assert_allclose(dx.data, cdx.data, atol=1e-2, rtol=1e-3) + np.testing.assert_allclose(dw.data, cdw.data, atol=1e-2, rtol=1e-3) if os.name == 'nt': return check([10,3,100,100], [5,3,3,3], stride=2, padding=0, dilation=1) check([10,4,40,50], [5,4,5,5], stride=1, padding=1, dilation=1) diff --git a/python/jittor/test/test_fp16.py b/python/jittor/test/test_fp16.py new file mode 100644 index 00000000..86569a59 --- /dev/null +++ b/python/jittor/test/test_fp16.py @@ -0,0 +1,347 @@ +# *************************************************************** +# Copyright (c) 2021 Jittor. All Rights Reserved. +# Maintainers: Dun Liang . +# This file is subject to the terms and conditions defined in +# file 'LICENSE.txt', which is part of this source code package. +# *************************************************************** +import unittest +import jittor as jt +import numpy as np +import os + +def transpose0231(x): + s0, s1, s2, s3 = x.shape + asize = 16 + bsize = 16 + ILP = 2 + return jt.code([s0, s2, s3, s1], x.dtype, [x], + cuda_header="#include \n#include ", + cuda_src=f""" + __global__ void kernel(in0_type* __restrict__ x, in0_type* __restrict__ y, int s0, int s1, int s2, int s3) {{ + __shared__ in0_type t[{asize*ILP}*{bsize*ILP+1}]; + int t3 = threadIdx.x % {bsize}; + int t1 = threadIdx.x / {bsize}; + int b3 = blockIdx.x; + int b2 = blockIdx.y; + int b0 = blockIdx.z; + int x3 = 1; + int x2 = s3; + int x1 = s2*x2; + int x0 = s1*x1; + int y3 = 1; + int y2 = s1; + int y1 = s3*y2; + int y0 = s2*y1; + in0_type tmp[{ILP}]; + for (int i=0; i<(s1-1)/{asize*ILP}+1; i++) + {{ + int _b3 = b3 * {bsize*ILP} + t3*{ILP}; + if (_b3 < s3) {{ + #pragma unroll + for (int j=0; j<{ILP}; j++) {{ + vload( + tmp, + &x[b0*x0+(t1*{ILP}+j+i*{asize*ILP})*x1+b2*x2+_b3*x3] + ); + #pragma unroll + for (int k=0; k<{ILP}; k++) + t[(t1*{ILP}+j)*{bsize*ILP+1}+t3*{ILP}+k] = tmp[k]; + + }} + }} + __syncthreads(); + int t3_ = threadIdx.x % {asize}; + int t1_ = threadIdx.x / {asize}; + _b3 = b3 * {bsize*ILP} + t1_*{ILP}; + if (_b3 < s3) {{ + #pragma unroll + for (int j=0; j<{ILP}; j++) {{ + #pragma unroll + for (int k=0; k<{ILP}; k++) {{ + tmp[k] = + t[(t3*{ILP}+k)*{bsize*ILP+1}+t1_*{ILP}+j]; + }} + vload( + &y[b0*y0+b2*y1+(_b3+j)*y2+((t3*{ILP})+i*{asize*ILP})*y3], + tmp + ); + }} + }} + __syncthreads(); + }} + }} + int s0, s1, s2, s3; + in0->shape.unpack(s0, s1, s2, s3); + kernel<<<{{(s3-1)/{bsize*ILP}+1, s2, s0 }}, {bsize*asize}>>> + (in0_p, out0_p, s0, s1, s2, s3); + """) + +def transpose0231_2(x): + s0, s1, s2, s3 = x.shape + asize = 16 + bsize = 8 + ILP = 2 + return jt.code([s0, s2, s3, s1], x.dtype, [x], + cuda_header="#include \n#include ", + cuda_src=f""" + __global__ __launch_bounds__({asize*bsize}) void kernel(in0_type* __restrict__ x, in0_type* __restrict__ y, int s0, int s1, int s2, int s3) {{ + __shared__ in0_type t[{asize*ILP}*{bsize*ILP+1}]; + int t3 = threadIdx.x % {bsize}; + int t1 = threadIdx.x / {bsize}; + int b3 = blockIdx.x; + int b1 = blockIdx.y; + int b2 = 0; + int b0 = blockIdx.z; + int x3 = 1; + int x2 = s3; + int x1 = s2*x2; + int x0 = s1*x1; + int y3 = 1; + int y2 = s1; + int y1 = s3*y2; + int y0 = s2*y1; + in0_type tmp[{ILP}]; + {{ + int _b3 = b3 * {bsize*ILP} + t3*{ILP}; + if (_b3 < s3) {{ + #pragma unroll + for (int j=0; j<{ILP}; j++) {{ + if 
(t1*{ILP}+j+b1*{asize*ILP} >= s1) + continue; + vload( + tmp, + &x[b0*x0+(t1*{ILP}+j+b1*{asize*ILP})*x1+b2*x2+_b3*x3] + ); + #pragma unroll + for (int k=0; k<{ILP}; k++) + t[(t1*{ILP}+j)*{bsize*ILP+1}+t3*{ILP}+k] = tmp[k]; + + }} + }} + __syncthreads(); + int t3_ = threadIdx.x % {asize}; + int t1_ = threadIdx.x / {asize}; + _b3 = b3 * {bsize*ILP} + t1_*{ILP}; + int yy3 = (t3_*{ILP})+b1*{asize*ILP}; + if (_b3 < s3 && yy3 < s1) {{ + #pragma unroll + for (int j=0; j<{ILP}; j++) {{ + #pragma unroll + for (int k=0; k<{ILP}; k++) {{ + tmp[k] = + t[(t3_*{ILP}+k)*{bsize*ILP+1}+t1_*{ILP}+j]; + }} + vload( + &y[b0*y0+b2*y1+(_b3+j)*y2+yy3*y3], + tmp + ); + // printf("%d %d %d %d %d\\n", b0*y0+b2*y1+(_b3+j)*y2+yy3*y3, + // b0, b2, (_b3+j), yy3); + }} + }} + __syncthreads(); + }} + }} + int s0, s1, s2, s3; + in0->shape.unpack(s0, s1, s2, s3); + kernel<<<{{(s3-1)/{bsize*ILP}+1, (s1-1)/{asize*ILP}+1, s0 }}, {bsize*asize}>>> + (in0_p, out0_p, s0, s1, s2, s3); + """) + +def check_share(): + return + a = jt.rand((30, 32, 4, 2000)).float32() + jt.code(a.shape, a.dtype, [a], + cuda_header="#include \n#include ", + cuda_src=""" + __global__ void kernel(in0_type* __restrict__ a, in0_type* __restrict__ b) { + __shared__ float x[32*33]; + for (int i=0; i<3; i++) { + ((float2*)&x[i])[0] = ((float2*)&a[i])[0]; + ((float2*)&b[i])[0] = ((float2*)&x[i+1])[0]; + } + } + kernel<<<1024,16*16>>>(in0_p, out0_p); + """).sync() + jt.sync_all(True) + # print(a[0]+1) + print("pass test") + +class TestFP16(unittest.TestCase): + def test_array(self): + a = np.array([1,2,3], dtype="float16") + b = jt.array(a) + np.testing.assert_allclose(a, b.data) + + def test_add(self): + a = np.array([1,2,3], dtype="float16") + b = jt.array(a) + c = b+b + np.testing.assert_allclose(c.data, a+a) + d = c.sum() + np.testing.assert_allclose(d.data, [12]) + c = c+1 + print(c) + + def test_matmul(self): + a = jt.random((100,100)).float16() + b = jt.random((100,100)).float16() + c = jt.matmul(a, b) + c.sync() + + def test_matmul_grad(self): + a = jt.random((100,100)).float16() + b = jt.random((100,100)).float16() + c = jt.matmul(a, b) + c.sync() + da, db = jt.grad(c, [a,b]) + jt.sync_all() + assert da.dtype == "float16" + assert db.dtype == "float16" + + def test_array_random_auto_cast(self): + a = jt.array([1.0,2.0]) + assert a.dtype == "float32" + with jt.flag_scope(amp_reg=2+16): + a = jt.array([1.0,2.0]) + assert a.dtype == "float16", a.dtype + + a = jt.random([10]) + assert a.dtype == "float32" + with jt.flag_scope(amp_reg=2+16): + a = jt.random([10]) + assert a.dtype == "float16", a.dtype + + def test_conv(self): + a = jt.random((3,4,5,5)).float16() + b = jt.random((4,4,3,3)).float16() + c = jt.nn.conv(a, b) + c.sync() + + def test_max(self): + a = jt.random((100,)).float16() + b = jt.random((100,)).float16() + c = a.maximum(b) + c.sync() + + def test_reduce_dtype_infer(self): + with jt.flag_scope(amp_reg=1): + a = jt.random((3,4,5,5)).float16() + b = a.sum() + b.sync() + assert b.dtype == "float32" + with jt.flag_scope(amp_reg=2): + a = jt.random((3,4,5,5)).float16() + b = a.sum() + b.sync() + assert b.dtype == "float32" + with jt.flag_scope(amp_reg=0): + a = jt.random((3,4,5,5)).float16() + b = a.sum() + b.sync() + assert b.dtype == "float32" + with jt.flag_scope(amp_reg=2+4): + a = jt.random((3,4,5,5)).float16() + b = a.sum() + b.sync() + assert b.dtype == "float16", b.dtype + + def test_white_dtype_infer(self): + with jt.flag_scope(amp_reg=1): + a = jt.random((3,4,5,5)).float16() + b = a**a + b.sync() + assert b.dtype == "float32" + with 
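transpose0231 and transpose0231_2 above (and the benchmark kernels earlier in this diff) all follow the same jt.code conventions: in0_p and out0_p are the input and output pointers, in0_type is the element type, and in0->num / in0->shape expose the input geometry. For readers new to jt.code, here is a minimal grid-stride copy kernel using those conventions; cuda_copy is an illustrative name, not part of the patch.

import numpy as np
import jittor as jt

def cuda_copy(x):
    return jt.code(x.shape, x.dtype, [x], cuda_src="""
        __global__ void kernel(in0_type* __restrict__ a, in0_type* __restrict__ b, int num) {
            int tid = threadIdx.x + blockIdx.x * blockDim.x;
            int tnum = blockDim.x * gridDim.x;
            for (int i = tid; i < num; i += tnum)   // grid-stride loop over all elements
                b[i] = a[i];
        }
        kernel<<<128, 1024>>>(in0_p, out0_p, in0->num);
    """)

if jt.has_cuda:
    with jt.flag_scope(use_cuda=1):
        a = jt.rand((1024,)).float32()
        np.testing.assert_allclose(cuda_copy(a).data, a.data)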
jt.flag_scope(amp_reg=2): + a = jt.random((3,4,5,5)).float16() + b = a**a + b.sync() + assert b.dtype == "float32" + with jt.flag_scope(amp_reg=0): + a = jt.random((3,4,5,5)).float16() + b = a**a + b.sync() + assert b.dtype == "float32" + with jt.flag_scope(amp_reg=2+8): + a = jt.random((3,4,5,5)).float16() + b = a**a + b.sync() + assert b.dtype == "float16", b.dtype + + def test_module_half(self): + a = jt.nn.Linear(10,10) + assert a.weight.dtype == "float32" + a.half() + assert a.weight.dtype == "float16" + + + +@unittest.skipIf(not jt.compiler.has_cuda, "No CUDA found") +class TestFP16CUDA(TestFP16): + def setUp(self): + jt.flags.use_cuda = 1 + def tearDown(self): + jt.flags.use_cuda = 0 + + def test_softmax(self): + a = jt.rand((120, 2000, 2000)).float16() + # a = jt.rand((1, 2000, 2000)).float32() + jt.sync_all() + with jt.profile_scope(10, 100): + a.log_softmax(-1).sync() + + def test_transpose(self): + check_share() + # return + a = jt.rand((30, 32, 4, 2000)).float32() + # a = jt.rand((1, 1024, 1, 2000)).float32() + diff = transpose0231(a).data != a.transpose((0,2,3,1)).data + print(np.where(diff)) + # return + jt.sync_all() + # with jt.profile_scope(100, 11000): + with jt.profile_scope(100, 11000): + # a.log_softmax(-1).sync() + transpose0231(a).sync() + + a.transpose((0,2,3,1)).sync() + # a.transpose((0,2,1,3)).sync() + a.fuse_transpose((0,2,1,3)).sync() + (a+1).sync() + jt.sync_all(True) + diff = transpose0231(a).data != a.transpose((0,2,3,1)).data + print(np.where(diff)) + np.testing.assert_allclose(transpose0231(a).data, a.transpose((0,2,3,1)).data) + + def test_transpose2(self): + # check_share() + # return + # a = jt.rand((30, 32, 4, 2000)).float32() + # a = jt.rand((1, 10000, 1, 2000)).float32() + a = jt.rand((1, 10000, 1, 2048)).float32() + print("transpose") + transpose0231_2(a).sync() + print("add") + (a+1).sync() + return + # a = jt.arange(32*16).reshape((1, 32, 1, 16)) + diff = transpose0231_2(a).data != a.transpose((0,2,3,1)).data + print(np.where(diff)) + # return + jt.sync_all() + # with jt.profile_scope(100, 11000): + with jt.profile_scope(100, 1100): + # a.log_softmax(-1).sync() + transpose0231_2(a).sync() + + a.transpose((0,2,3,1)).sync() + # a.transpose((0,2,1,3)).sync() + a.fuse_transpose((0,2,1,3)).sync() + (a+1).sync() + jt.sync_all(True) + diff = transpose0231_2(a).data != a.transpose((0,2,3,1)).data + print(np.where(diff)) + np.testing.assert_allclose(transpose0231_2(a).data, a.transpose((0,2,3,1)).data) + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/python/jittor/test/test_lock.py b/python/jittor/test/test_lock.py index e70486cd..f5f17b26 100644 --- a/python/jittor/test/test_lock.py +++ b/python/jittor/test/test_lock.py @@ -10,12 +10,12 @@ import unittest import os, sys import jittor as jt -from pathlib import Path +import jittor_utils as jit_utils class TestLock(unittest.TestCase): def test(self): if os.environ.get('lock_full_test', '0') == '1': - cache_path = os.path.join(str(Path.home()), ".cache", "jittor", "lock") + cache_path = os.path.join(jit_utils.home(), ".cache", "jittor", "lock") assert os.system(f"rm -rf {cache_path}") == 0 cmd = f"cache_name=lock {sys.executable} -m jittor.test.test_example" else: diff --git a/python/jittor/test/test_misc_op.py b/python/jittor/test/test_misc_op.py index 1d2bee6d..47f04762 100644 --- a/python/jittor/test/test_misc_op.py +++ b/python/jittor/test/test_misc_op.py @@ -75,6 +75,8 @@ class TestPad(unittest.TestCase): print('pass flip test ...') def test_cross(self): + def 
check_equal(a, b, tol): + np.testing.assert_allclose(a.detach().numpy(), b.numpy(), atol=1e-5) arr1 = np.random.randn(16,3,224,224,3) arr2 = np.random.randn(16,3,224,224,3) check_equal(torch.Tensor(arr1).cross(torch.Tensor(arr2), dim=1), jt.array(arr1).cross(jt.array(arr2), dim=1), 1e-1) @@ -257,5 +259,52 @@ class TestOther(unittest.TestCase): a = jt.arctan2(jt.array([1,1.0,0]), jt.array([1,0.0,-1])) np.testing.assert_allclose(a.data, [0.7853982,1.5707964,3.1415927]) + y = jt.random((100,)) + x = jt.random((100,)) + z = jt.arctan2(y, x) + z2 = np.arctan2(y.data, x.data) + np.testing.assert_allclose(z.data, z2) + + def test_code_softmax(self): + if not jt.has_cuda: return + + def softmax(x, dim = None, log=False): + if dim is None: + x = (x - x.max()).exp() + ret = x / x.sum() + else: + x = (x-x.max(dim, keepdims=True)).exp() + ret = x / x.sum(dim, keepdims=True) + if log: return ret.log() + return ret + from jittor.other.code_softmax import softmax_v1 + + with jt.flag_scope(use_cuda = 1): + shape = (120, 2000, 2000) + shape = (3,3) + for log in [0,1]: + for shape in [(3,3), + (12, 200, 2000), + (12, 200, 2048), + (12, 200, 2049)]: + print(shape) + a = jt.rand(shape) + c = jt.rand(shape) + b = softmax(a, -1, log=log) + bb = softmax_v1(a, log=log) + + err = (bb - b).abs().max() + assert err.item() < 1e-5, (err, bb, b) + + d1 = jt.grad(b*c, a) + d2 = jt.grad(bb*c, a) + err = (d1 - d2).abs().max() + + if log: + assert err.item() < 1e-2, (err.item()) + else: + assert err.item() < 1e-5, (err.item()) + + if __name__ == "__main__": unittest.main() \ No newline at end of file diff --git a/python/jittor/test/test_notebooks.py b/python/jittor/test/test_notebooks.py index 9f90b7d8..a88f0d19 100644 --- a/python/jittor/test/test_notebooks.py +++ b/python/jittor/test/test_notebooks.py @@ -8,10 +8,10 @@ import unittest, os import jittor as jt from jittor import LOG import sys -from pathlib import Path +import jittor_utils as jit_utils dirname = os.path.join(jt.flags.jittor_path, "notebook") -notebook_dir = os.path.join(str(Path.home()), ".cache","jittor","notebook") +notebook_dir = os.path.join(jit_utils.home(), ".cache","jittor","notebook") tests = [] for mdname in os.listdir(dirname): if not mdname.endswith(".src.md"): continue diff --git a/python/jittor/test/test_op_compiler.py b/python/jittor/test/test_op_compiler.py index 01bf12e8..dc203a9c 100644 --- a/python/jittor/test/test_op_compiler.py +++ b/python/jittor/test/test_op_compiler.py @@ -111,17 +111,6 @@ class TestOpCompiler(unittest.TestCase): check("@{a^b == 7}", "2") check("@{(a^b) == 7}", "1") check("@{b<. +# This file is subject to the terms and conditions defined in +# file 'LICENSE.txt', which is part of this source code package. 
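The reference softmax in test_code_softmax subtracts the per-row maximum before exponentiating, which is what keeps it finite for large logits; softmax_v1 is then compared against it. A tiny NumPy demonstration of why that max-subtraction matters (stable_softmax is an illustrative name):

import numpy as np

def stable_softmax(x, axis=-1):
    # Subtracting the row max keeps every exponent non-positive, so large logits
    # cannot overflow; the result is mathematically unchanged.
    z = x - x.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

logits = np.array([[1000.0, 1001.0, 1002.0]])
print(np.exp(logits) / np.exp(logits).sum())   # overflows: [[nan nan nan]] plus a RuntimeWarning
print(stable_softmax(logits))                  # [[0.09003057 0.24472847 0.66524096]]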
+# *************************************************************** +import unittest +import jittor as jt +import numpy as np +import os + +class TestProfiler(unittest.TestCase): + def test_profiler(self): + a = jt.rand(1000,1000) + b = jt.rand(1000,1000) + jt.sync_all() + with jt.profile_scope(10, 100, profiler_record_peek=1) as rep: + jt.matmul(a, b).sync() + x = float(rep[-1][4]) + y = float(rep[-2][4]) + assert abs(x-y)/x < 1e-3 + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/python/jittor/test/test_resnet.py b/python/jittor/test/test_resnet.py index 8defb633..8de9fc3f 100644 --- a/python/jittor/test/test_resnet.py +++ b/python/jittor/test/test_resnet.py @@ -36,19 +36,7 @@ class MnistNet(Module): return x @unittest.skipIf(skip_this_test, "skip_this_test") -class TestResnet(unittest.TestCase): - @classmethod - def setUpClass(self): - # hyper-parameters - self.batch_size = int(os.environ.get("TEST_BATCH_SIZE", "100")) - self.weight_decay = 0.0001 - self.momentum = 0.9 - self.learning_rate = 0.1 - # mnist dataset - self.train_loader = MNIST(train=True, transform=trans.Resize(224)) \ - .set_attrs(batch_size=self.batch_size, shuffle=True) - self.train_loader.num_workers = 4 - +class TestResnetFp32(unittest.TestCase): # setup random seed def setup_seed(self, seed): np.random.seed(seed) @@ -59,6 +47,19 @@ class TestResnet(unittest.TestCase): @jt.flag_scope(use_cuda=1, use_stat_allocator=1) def test_resnet(self): self.setup_seed(1) + + # hyper-parameters + self.batch_size = int(os.environ.get("TEST_BATCH_SIZE", "100")) + self.weight_decay = 0.0001 + self.momentum = 0.9 + self.learning_rate = 0.1 + if jt.flags.amp_reg: + self.learning_rate = 0.01 + # mnist dataset + self.train_loader = MNIST(train=True, transform=trans.Resize(224)) \ + .set_attrs(batch_size=self.batch_size, shuffle=True) + self.train_loader.num_workers = 4 + loss_list=[] acc_list=[] mnist_net = MnistNet() @@ -70,6 +71,7 @@ class TestResnet(unittest.TestCase): for data, target in self.train_loader: batch_id = self.train_loader.batch_id epoch_id = self.train_loader.epoch_id + data = data.float_auto() # train step # with jt.log_capture_scope( @@ -120,6 +122,8 @@ class TestResnet(unittest.TestCase): # Train Epoch: 0 [40/100 (40%)] Loss: 2.286762 Acc: 0.130000 # Train Epoch: 0 [50/100 (50%)] Loss: 2.055014 Acc: 0.290000 + if jt.flags.amp_reg: + continue if jt.in_mpi: assert jt.core.number_of_lived_vars() < 8100, jt.core.number_of_lived_vars() else: @@ -131,5 +135,14 @@ class TestResnet(unittest.TestCase): assert np.mean(loss_list[-50:])<0.5 assert np.mean(acc_list[-50:])>0.8 + +@unittest.skipIf(skip_this_test, "skip_this_test") +class TestResnetFp16(TestResnetFp32): + def setup(self): + jt.flags.auto_mixed_precision_level = 5 + + def tearDown(self): + jt.flags.auto_mixed_precision_level = 0 + if __name__ == "__main__": unittest.main() diff --git a/python/jittor/test/test_superglue.py b/python/jittor/test/test_superglue.py new file mode 100644 index 00000000..0a3c7a18 --- /dev/null +++ b/python/jittor/test/test_superglue.py @@ -0,0 +1,121 @@ +# *************************************************************** +# Copyright (c) 2021 Jittor. All Rights Reserved. +# Maintainers: Dun Liang . +# This file is subject to the terms and conditions defined in +# file 'LICENSE.txt', which is part of this source code package. 
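The ResNet test above adopts two conventions when amp is active: it lowers the learning rate and routes inputs through float_auto() so they follow the active precision policy. Below is a toy stand-in for that training-loop pattern; the model, data and hyper-parameters are placeholders, not the test's.

import jittor as jt
from jittor import nn

model = nn.Sequential(nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 1))
lr = 0.01 if jt.flags.amp_reg else 0.1                       # lower LR when amp is on, as above
opt = nn.SGD(model.parameters(), lr, momentum=0.9, weight_decay=1e-4)

for step in range(3):
    x = jt.rand((4, 10)).float_auto()                        # fp16 or fp32, per the amp flags
    y = jt.rand((4, 1)).float_auto()
    loss = ((model(x) - y) ** 2).mean()
    opt.step(loss)                                           # jittor optimizers take the loss directly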
+# *************************************************************** +import unittest +import jittor as jt +import numpy as np +import os + +from jittor.test.misc import superglue +from jittor.test.misc.superglue import SuperGlue +import time + +@jt.flag_scope(use_cuda=1) +def main(): + global superglue + superglue.split_size = int(os.environ.get("split_size", "12")) + # superglue.split_size = 1000000 + + batch = 30 + num = 2000 + dim = 128 + + # jt.display_memory_info() + # os.system("nvidia-smi") + # breakpoint() + + with jt.no_grad(): + + config = { + 'superglue': { + 'sinkhorn_iterations': 25, + 'match_threshold': 0.01, + 'keypoint_position_dim': 2, + 'descriptor_dim': dim, + 'use_dual_softmax': True, + 'GNN_layers': ['self', 'cross'] * 9, + } + } + + superglue = SuperGlue(config.get('superglue', {})) + + superglue.eval() + + data = { + 'keypoints0': jt.rand((batch, num, 2), dtype=jt.float), + 'keypoints1': jt.rand((batch, num, 2), dtype=jt.float), + 'shape0': jt.rand((batch, 2), dtype=jt.float), + 'shape1': jt.rand((batch, 2), dtype=jt.float), + 'descriptors0': jt.rand((batch, dim, num), dtype=jt.float), + 'descriptors1': jt.rand((batch, dim, num), dtype=jt.float), + 'scores0': jt.rand((batch, num), dtype=jt.float), + 'scores1': jt.rand((batch, num), dtype=jt.float), + 'all_matches': jt.randint(0, num, (batch, num, 2), dtype=jt.int), + 'return_match': False, + # 'match_num': match_num + } + + use_fp16 = int(os.environ.get("use_fp16", "0")) + if use_fp16: + jt.flags.amp_reg = 2 + for k,v in data.items(): + if isinstance(v, jt.Var) and v.dtype == "float32": + v.assign(v.float16()) + for v in superglue.parameters(): + if v.dtype == "float32": + v.assign(v.float16()) + jt.sync_all(True) + + import pickle + jt.sync_all(True) + for x in range(5): + print(x) + jt.gc() + x = superglue(data)['loss'] + x.sync() + jt.display_memory_info() + # os.system("nvidia-smi") + # breakpoint() + # print(data) + # print(x) + + # with open("/tmp/record.pkl", "wb") as f: + # pickle.dump([data, x], f, pickle.HIGHEST_PROTOCOL) + + # with jt.flag_scope(trace_py_var=3, profile_memory_enable=1): + # x = superglue(data)['loss'] + # x.sync() + # jt.get_max_memory_treemap() + # exit(0) + + jt.sync_all(True) + time0 = time.time() + jt.flags.profiler_enable = int(os.environ.get("profiler", "0")) + + for x in range(20): + print(x) + # jt.display_memory_info() + x = superglue(data)['loss'] + x.sync() + # print(x) + + jt.sync_all(True) + time1 = time.time() + print("avg time:", (time1 - time0) / 20) + return (time1 - time0) / 20 + + +class TestSuperglue(unittest.TestCase): + def test(self): + if not jt.has_cuda: return + t1 = main() + os.environ["use_fp16"] = "1" + t2 = main() + os.environ["use_fp16"] = "0" + assert t1*0.55 > t2 + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/python/jittor/test/test_unary_op.py b/python/jittor/test/test_unary_op.py index 24483dae..d7b8bc88 100644 --- a/python/jittor/test/test_unary_op.py +++ b/python/jittor/test/test_unary_op.py @@ -17,7 +17,8 @@ def check(op, *args): x = convert(x) y = convert(y) # str match nan and inf - assert x.dtype == y.dtype and x.shape == y.shape + assert x.dtype == y.dtype and x.shape == y.shape, \ + (x.dtype, y.dtype, x.shape, y.shape) for a,b in zip(x.flatten(), y.flatten()): assert str(a)[:5] == str(b)[:5], (a,b) @@ -32,9 +33,10 @@ class TestUnaryOp(unittest.TestCase): check("logical_not", a) check("bitwise_not", a) b = np.array([1.1, 2.2, 3.3, 4.4, -1, 0]) - check("log", a.astype("float32")) - check("exp", 
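The SuperGlue test times the model by running warm-up iterations, synchronizing, then averaging wall-clock time over a fixed number of steps, and finally requires the fp16 run to be roughly 1.8x faster (t1 * 0.55 > t2). The same timing pattern, factored into a small helper with an illustrative name:

import time
import jittor as jt

def avg_step_time(fn, warmup=5, iters=20):
    # Warm up first so compilation is excluded, then average over the timed steps.
    for _ in range(warmup):
        fn().sync()
    jt.sync_all(True)
    t0 = time.time()
    for _ in range(iters):
        fn().sync()
    jt.sync_all(True)
    return (time.time() - t0) / iters

a, b = jt.rand((1000, 1000)), jt.rand((1000, 1000))
print("matmul step:", avg_step_time(lambda: jt.matmul(a, b)), "s")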
a.astype("float32")) - check("sqrt", a.astype("float32")) + type = "float16" if (jt.flags.amp_reg & 2) else "float32" + check("log", a.astype(type)) + check("exp", a.astype(type)) + check("sqrt", a.astype(type)) def test_grad(self): ops = ["abs", "negative", "log", "exp", "sqrt", @@ -60,7 +62,8 @@ class TestUnaryOp(unittest.TestCase): ja = jt.array(b) jb = eval(f"jt.{op}(ja)") jda = jt.grad(jb, ja) - assert (np.allclose(jda.data, da)), (jda.data,da,op) + tol = 1e-2 if jt.flags.amp_reg & 2 else 1e-6 + assert (np.allclose(jda.data, da, atol=tol, rtol=tol)), (jda.data,da,op) def test_sigmoid(self): a = np.arange(-150,150, 10).astype("float32") @@ -92,11 +95,26 @@ class TestUnaryOp(unittest.TestCase): np.testing.assert_allclose(y.data, y2.data) d = jt.grad(x2, y2) _, (dn,) = ngrad(lambda y: special.erfinv(y).sum(), [y], 1e-8) - np.testing.assert_allclose(d.data, dn, atol=1e-6, rtol=1e-6) + tol = 1e-3 if jt.flags.amp_reg & 2 else 1e-6 + np.testing.assert_allclose(d.data, dn, atol=tol, rtol=tol) class TestUnaryOpCuda(TestUnaryOp, test_cuda(2)): pass +class TestUnaryOpCudaFp16(TestUnaryOp, test_cuda(2)): + def setUp(self): + jt.flags.amp_reg = 2 | 4 | 8 | 16 + def tearDown(self): + jt.flags.amp_reg = 0 + +class TestUnaryOpCudaFp16(TestUnaryOp, test_cuda(2)): + def setUp(self): + jt.flags.amp_reg = 2 | 4 | 8 | 16 + jt.flags.use_cuda = 1 + def tearDown(self): + jt.flags.amp_reg = 0 + jt.flags.use_cuda = 0 + if __name__ == "__main__": unittest.main() \ No newline at end of file diff --git a/python/jittor/test/test_utils.py b/python/jittor/test/test_utils.py index 16c381fe..69cbda82 100644 --- a/python/jittor/test/test_utils.py +++ b/python/jittor/test/test_utils.py @@ -15,8 +15,8 @@ def find_jittor_path(): return path[:-len(suffix)] + ".." def find_cache_path(): - from pathlib import Path - path = str(Path.home()) + import jittor_utils as jit_utils + path = jit_utils.home() dirs = [".cache", "jittor"] for d in dirs: path = os.path.join(path, d) diff --git a/python/jittor/utils/data.gz b/python/jittor/utils/data.gz index 728482b2..f59ad084 100644 Binary files a/python/jittor/utils/data.gz and b/python/jittor/utils/data.gz differ diff --git a/python/jittor/utils/polish.py b/python/jittor/utils/polish.py index a49e8a5a..77256399 100644 --- a/python/jittor/utils/polish.py +++ b/python/jittor/utils/polish.py @@ -49,8 +49,8 @@ data_files = [ name for name in files LOG.i("data_files", data_files) # compile data files -from pathlib import Path -home = str(Path.home()) +import jittor_utils as jit_utils +home = jit_utils.home() # for cc_type in ["g++", "clang"]: # for device in ["cpu", "cuda"]: diff --git a/python/jittor/utils/polish_centos.py b/python/jittor/utils/polish_centos.py index a2903775..caf6d027 100644 --- a/python/jittor/utils/polish_centos.py +++ b/python/jittor/utils/polish_centos.py @@ -7,8 +7,8 @@ # *************************************************************** import jittor as jt import os -from pathlib import Path -home_path = str(Path.home()) +import jittor_utils as jit_utils +home_path = jit_utils.home() def run_cmd(cmd): print("RUN CMD:", cmd) diff --git a/python/jittor_utils/__init__.py b/python/jittor_utils/__init__.py index ed943850..0a0a8dcf 100644 --- a/python/jittor_utils/__init__.py +++ b/python/jittor_utils/__init__.py @@ -19,10 +19,46 @@ import time from ctypes import cdll import shutil import urllib.request +import ctypes if platform.system() == 'Darwin': mp.set_start_method('fork') +from pathlib import Path +import json + + +_jittor_home = None +def home(): + global 
_jittor_home + if _jittor_home is not None: + return _jittor_home + + src_path = os.path.join(str(Path.home()),".cache","jittor") + os.makedirs(src_path,exist_ok=True) + src_path_file = os.path.join(src_path,"config.json") + data = {} + if os.path.exists(src_path_file): + with open(src_path_file,"r") as f: + data = json.load(f) + + default_path = data.get("JITTOR_HOME",str(Path.home())) + + _home_path = os.environ.get("JITTOR_HOME",default_path) + + if not os.path.exists(_home_path): + _home_path = default_path + _home_path = os.path.abspath(_home_path) + + # LOG.i(f"Use {_home_path} as Jittor Home") + + with open(src_path_file,"w") as f: + data['JITTOR_HOME'] = _home_path + json.dump(data,f) + + _jittor_home = _home_path + return _home_path + class Logwrapper: def __init__(self): self.log_silent = int(os.environ.get("log_silent", "0")) @@ -294,8 +330,7 @@ def short(s): return ss def find_cache_path(): - from pathlib import Path - path = str(Path.home()) + path = home() # jittor version key jtv = "jt"+get_jittor_version().rsplit('.', 1)[0] # cc version key @@ -386,6 +421,22 @@ def env_or_find(name, bname, silent=False): return path return find_exe(bname, silent=silent) +def env_or_try_find(name, bname): + if name in os.environ: + path = os.environ[name] + if path != "": + version = get_version(path) + LOG.i(f"Found {bname}{version} at {path}") + return path + return try_find_exe(bname) + +def try_find_exe(*args): + try: + return find_exe(*args) + except: + LOG.v(f"{args[0]} not found.") + return "" + def get_cc_type(cc_path): bname = os.path.basename(cc_path) if "clang" in bname: return "clang" @@ -491,8 +542,7 @@ LOG = Logwrapper() check_msvc_install = False msvc_path = "" if os.name == 'nt' and os.environ.get("cc_path", "")=="": - from pathlib import Path - msvc_path = os.path.join(str(Path.home()), ".cache", "jittor", "msvc") + msvc_path = os.path.join(home(), ".cache", "jittor", "msvc") cc_path = os.path.join(msvc_path, "VC", r"_\_\_\_\_\bin", "cl.exe") check_msvc_install = True else: @@ -506,7 +556,6 @@ _py3_include_path = None _py3_extension_suffix = None if os.name == 'nt': - from pathlib import Path try: import ssl ssl._create_default_https_context = ssl._create_unverified_context @@ -516,7 +565,7 @@ if os.name == 'nt': if not os.path.isfile(cc_path): from jittor_utils import install_msvc install_msvc.install(msvc_path) - mpath = os.path.join(str(Path.home()), ".cache", "jittor", "msvc") + mpath = os.path.join(home(), ".cache", "jittor", "msvc") if cc_path.startswith(mpath): msvc_path = mpath os.RTLD_NOW = os.RTLD_GLOBAL = os.RTLD_DEEPBIND = 0 @@ -526,3 +575,65 @@ if os.name == 'nt': os.environ["PATH"] = path+';'+os.environ["PATH"] if hasattr(os, "add_dll_directory"): os.add_dll_directory(path) + +backends = [] +def add_backend(mod): + backends.append(mod) + +def compile_module(source, flags): + tmp_path = os.path.join(cache_path, "tmp") + os.makedirs(tmp_path, exist_ok=True) + hash = "hash_" + get_str_hash(source) + so = get_py3_extension_suffix() + header_name = os.path.join(tmp_path, hash+".h") + source_name = os.path.join(tmp_path, hash+".cc") + lib_name = hash+so + with open(header_name, "w", encoding="utf8") as f: + f.write(source) + from jittor.pyjt_compiler import compile_single + ok = compile_single(header_name, source_name) + assert ok, "no pyjt interface found" + + entry_src = f''' +static void init_module(PyModuleDef* mdef, PyObject* m) {{ + mdef->m_doc = "generated py jittor_utils.compile_module"; + jittor::pyjt_def_{hash}(m); +}} +PYJT_MODULE_INIT({hash}); + ''' + 
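The new jittor_utils.home() resolves the Jittor home directory from the JITTOR_HOME environment variable when it names an existing directory, otherwise from the value cached in ~/.cache/jittor/config.json, otherwise from the user's home directory, and persists the result back into config.json. A short usage sketch for relocating caches; the target path is only an example.

import os

os.makedirs("/data/jittor_home", exist_ok=True)      # the directory must exist, or the default wins
os.environ["JITTOR_HOME"] = "/data/jittor_home"

import jittor_utils
print(jittor_utils.home())   # -> /data/jittor_home, also written back into config.json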
with open(source_name, "r", encoding="utf8") as f: + src = f.read() + with open(source_name, "w", encoding="utf8") as f: + f.write(src + entry_src) + jittor_path = os.path.join(os.path.dirname(__file__), "..", "jittor") + jittor_path = os.path.abspath(jittor_path) + do_compile([f"\"{cc_path}\" \"{source_name}\" \"{jittor_path}/src/pyjt/py_arg_printer.cc\" {flags} -o \"{cache_path+'/'+lib_name}\" ", + cache_path, jittor_path]) + with import_scope(os.RTLD_GLOBAL | os.RTLD_NOW): + exec(f"import {hash}") + mod = locals()[hash] + return mod + +def process_jittor_source(device_type, callback): + import jittor.compiler as compiler + import shutil + djittor = device_type + "_jittor" + djittor_path = os.path.join(compiler.cache_path, djittor) + os.makedirs(djittor_path, exist_ok=True) + + for root, dir, files in os.walk(compiler.jittor_path): + root2 = root.replace(compiler.jittor_path, djittor_path) + os.makedirs(root2, exist_ok=True) + for name in files: + fname = os.path.join(root, name) + fname2 = os.path.join(root2, name) + if fname.endswith(".h") or fname.endswith(".cc"): + with open(fname, 'r', encoding="utf8") as f: + src = f.read() + src = callback(src, name, {"fname":fname, "fname2":fname2}) + with open(fname2, 'w', encoding="utf8") as f: + f.write(src) + else: + shutil.copy(fname, fname2) + compiler.cc_flags = compiler.cc_flags.replace(compiler.jittor_path, djittor_path) + f" -I\"{djittor_path}/extern/cuda/inc\" " + compiler.jittor_path = djittor_path diff --git a/python/jittor_utils/auto_diff.py b/python/jittor_utils/auto_diff.py index 1235ca31..112366a6 100644 --- a/python/jittor_utils/auto_diff.py +++ b/python/jittor_utils/auto_diff.py @@ -1,9 +1,9 @@ import os -from pathlib import Path from collections import defaultdict import pickle import numpy as np import jittor_utils +import jittor_utils as jit_utils from jittor_utils import LOG import sys @@ -96,7 +96,7 @@ class Hook: hook_rand() self.rid = 0 self.base_name = base_name - self.base_path = os.path.join(str(Path.home()), ".cache", "jittor", "auto_diff", base_name) + self.base_path = os.path.join(jit_utils.home(), ".cache", "jittor", "auto_diff", base_name) if not os.path.exists(self.base_path): os.makedirs(self.base_path, exist_ok=True) self.mode = 'save' diff --git a/python/jittor_utils/clean_cache.py b/python/jittor_utils/clean_cache.py index 72702707..74e05d1d 100644 --- a/python/jittor_utils/clean_cache.py +++ b/python/jittor_utils/clean_cache.py @@ -5,10 +5,10 @@ # file 'LICENSE.txt', which is part of this source code package. 
# *************************************************************** import os, sys, shutil -from pathlib import Path import glob +import jittor_utils as jit_utils -cache_path = os.path.join(str(Path.home()), ".cache", "jittor") +cache_path = os.path.join(jit_utils.home(), ".cache", "jittor") def callback(func, path, exc_info): print(f"remove \"{path}\" failed.") diff --git a/python/jittor_utils/install_cuda.py b/python/jittor_utils/install_cuda.py index 4d28e69f..a2823fe9 100644 --- a/python/jittor_utils/install_cuda.py +++ b/python/jittor_utils/install_cuda.py @@ -41,7 +41,7 @@ def get_cuda_driver(): return None def has_installation(): - jtcuda_path = os.path.join(pathlib.Path.home(), ".cache", "jittor", "jtcuda") + jtcuda_path = os.path.join(jit_utils.home(), ".cache", "jittor", "jtcuda") return os.path.isdir(jtcuda_path) def install_cuda(): @@ -54,10 +54,12 @@ def install_cuda(): LOG.i("JTCUDA_VERSION: ", cuda_driver_version) if os.name == 'nt': - if cuda_driver_version >= [11,4]: - cuda_tgz = "cuda11.4_cudnn8_win.zip" - md5 = "06eed370d0d44bb2cc57809343911187" - elif cuda_driver_version >= [11,2]: + # TODO: cuda11.4 has bug fit with + # current msvc, FIXME + # if cuda_driver_version >= [11,4]: + # cuda_tgz = "cuda11.4_cudnn8_win.zip" + # md5 = "06eed370d0d44bb2cc57809343911187" + if cuda_driver_version >= [11,2]: cuda_tgz = "cuda11.2_cudnn8_win.zip" md5 = "b5543822c21bc460c1a414af47754556" elif cuda_driver_version >= [11,]: @@ -83,7 +85,7 @@ def install_cuda(): md5 = "f16d3ff63f081031d21faec3ec8b7dac" else: raise RuntimeError(f"Unsupport cuda driver version: {cuda_driver_version}, at least 10.0") - jtcuda_path = os.path.join(pathlib.Path.home(), ".cache", "jittor", "jtcuda") + jtcuda_path = os.path.join(jit_utils.home(), ".cache", "jittor", "jtcuda") nvcc_path = os.path.join(jtcuda_path, cuda_tgz[:-4], "bin", "nvcc") if os.name=='nt': nvcc_path += '.exe' nvcc_lib_path = os.path.join(jtcuda_path, cuda_tgz[:-4], "lib64")