Merge branch 'master' into fft

This commit is contained in:
cxjyxx_me 2022-03-26 23:30:35 -04:00
commit b642b8f1d1
106 changed files with 3949 additions and 710 deletions


@ -9,7 +9,7 @@
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
__version__ = '1.3.1.46'
__version__ = '1.3.1.55'
from jittor_utils import lock
with lock.lock_scope():
ori_int = int
@ -91,7 +91,7 @@ def safeunpickle(path):
import torch
except:
raise RuntimeError("pytorch need to be installed when load pth format.")
model_dict = torch.load(path, map_location=torch.device('cpu'))
model_dict = torch.load(path, map_location='cpu')
try:
for k, v in model_dict.items():
try:
@ -231,19 +231,19 @@ class profile_scope(_call_no_record_scope):
def __enter__(self):
assert not flags.profiler_enable
profiler.start(self.warmup, self.rerun)
self.report = []
try:
self.fs.__enter__()
profiler.start(self.warmup, self.rerun)
return self.report
except:
profiler.stop()
raise
def __exit__(self, *exc):
self.fs.__exit__(*exc)
profiler.stop()
self.report.extend(profiler.report())
self.fs.__exit__(*exc)
class __single_process_scope:
def __init__(self, rank=0):
@ -304,15 +304,52 @@ Var.cast = Var.cast
def array(data, dtype=None):
if isinstance(data, core.Var):
if dtype is None:
return data.clone()
return cast(data, dtype)
if dtype is not None:
ret = data.clone()
else:
ret = cast(data, dtype)
elif dtype is not None:
if isinstance(dtype, NanoString):
dtype = str(dtype)
elif callable(dtype):
dtype = dtype.__name__
return ops.array(np.array(data, dtype))
return ops.array(data)
ret = ops.array(np.array(data, dtype))
else:
ret = ops.array(data)
# TODO: move those code to core
amp_reg = jt.flags.amp_reg
if amp_reg and ret.numel() != 1 and ret.dtype.is_float():
if amp_reg & 16:
if amp_reg & 1:
if ret.dtype != "float32":
return ret.float32()
elif amp_reg & 2:
if ret.dtype != "float16":
return ret.float16()
return ret
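A minimal sketch of how the amp_reg branch above is expected to behave, assuming a default build (flag bits per the flag documentation: 1 = prefer fp32, 2 = prefer fp16, 16 = apply to array-like ops as well):
import jittor as jt
jt.flags.amp_reg = 16 | 2        # array-like ops prefer fp16
x = jt.array([1.0, 2.0, 3.0])    # multi-element float input
print(x.dtype)                   # expected: float16
jt.flags.amp_reg = 0             # restore the default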
def random(shape, dtype="float32", type="uniform"):
# TODO: move those code to core
if dtype == "float16":
# TODO: make curand support fp16
ret = ops.random(shape, "float32", type).float16()
else:
ret = ops.random(shape, dtype, type)
amp_reg = jt.flags.amp_reg
if amp_reg:
if amp_reg & 16:
if amp_reg & 1:
if ret.dtype != "float32":
return ret.float32()
elif amp_reg & 2:
if ret.dtype != "float16":
return ret.float16()
return ret
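A short usage sketch of the fp16 branch above: since curand is assumed not to support fp16 yet, values are sampled in fp32 and cast afterwards.
import jittor as jt
r = jt.random([3, 3], dtype="float16")   # sampled as float32 internally, then cast
print(r.dtype)                           # expected: float16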
def float_auto(x):
if jt.flags.amp_reg & 2:
return x.float16()
return x.float32()
Var.float_auto = float_auto
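A minimal sketch of float_auto under the amp_reg flag, assuming no other amp flags are set:
import jittor as jt
x = jt.random([4, 4])          # float32 by default
jt.flags.amp_reg = 0
print(x.float_auto().dtype)    # float32: bit 1 not set
jt.flags.amp_reg = 2           # bit 1: prefer fp16
print(x.float_auto().dtype)    # expected: float16
jt.flags.amp_reg = 0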
def array64(data, dtype=None):
with jt.flag_scope(auto_convert_64_to_32=0):
@ -920,6 +957,14 @@ class Module:
self.dfs([], "", callback, callback_leave)
return ms
@property
def _modules(self):
return { k:v for k,v in self.__dict__.items() if isinstance(v, Module) }
@property
def _parameters(self):
return { k:v for k,v in self.__dict__.items() if isinstance(v, Var) }
def requires_grad_(self, requires_grad=True):
self._requires_grad = requires_grad
self._place_hooker()
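A small sketch of the new _modules and _parameters properties; the Net class here is just an illustration:
import jittor as jt
from jittor import nn

class Net(nn.Module):
    def __init__(self):
        self.fc = nn.Linear(3, 2)      # a sub-module attribute
        self.bias = jt.zeros(2)        # a plain Var attribute
    def execute(self, x):
        return self.fc(x) + self.bias

net = Net()
print(list(net._modules.keys()))       # ['fc']
print(list(net._parameters.keys()))    # ['bias']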
@ -1187,6 +1232,33 @@ Arguments of hook are defined as::
def __getattr__(self, key):
return object.__getattribute__(self, key)
def float64(self):
'''convert all parameters to float64'''
for p in self.parameters():
if p.dtype.is_float():
p.assign(p.float64())
return self
def float16(self):
'''convert all parameters to float16'''
for p in self.parameters():
if p.dtype.is_float():
p.assign(p.float16())
return self
def half(self):
'''convert all parameters to float16'''
return self.float16()
def float_auto(self):
'''convert all parameters to float16 or float32 automatically
by jt.flags.auto_mixed_precision_level and jt.flags.amp_reg'''
for p in self.parameters():
if p.dtype.is_float():
p.assign(p.float_auto())
return self
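A usage sketch of the precision helpers above (half is an alias of float16, and float_auto follows jt.flags.amp_reg); the in-place cast via assign is the assumed behavior:
import jittor as jt
from jittor import nn

model = nn.Linear(8, 4)
model.half()                 # same as model.float16(): casts float parameters in place
print(model.weight.dtype)    # expected: float16
jt.flags.amp_reg = 2         # prefer fp16
model.float_auto()           # parameters stay float16 under this flag
jt.flags.amp_reg = 0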
class Function(Module):
''' Function Module for customized backward operations
@ -1417,18 +1489,15 @@ Var.size = size
def to_int(v):
dtype = str(v.dtype)
assert dtype.startswith("int")
assert v.dtype.is_int()
return v.item()
def to_float(v):
dtype = str(v.dtype)
assert dtype.startswith("float")
assert v.dtype.is_float()
return v.item()
def to_bool(v):
dtype = str(v.dtype)
assert dtype.startswith("int") or dtype=="bool"
assert v.dtype.is_int() or v.dtype.is_bool()
return ori_bool(v.item())
Var.__int__ = to_int
@ -1450,6 +1519,8 @@ float = float32
Var.float = Var.float32
double = float64
Var.double = Var.float64
half = float16
Var.half = Var.float16
def is_var(v):
return isinstance(v, Var)


@ -5,7 +5,7 @@ from . import attention as attention, contrib as contrib, dataset as dataset, in
from .compile_extern import cublas as cublas, cudnn as cudnn, curand as curand, cufft as cufft, mkl_ops as mkl_ops, mpi_ops as mpi_ops, world_size as world_size
from .compiler import compile_custom_op as compile_custom_op, compile_custom_ops as compile_custom_ops
from .contrib import concat as concat
from .nn import matmul as matmul
from .nn import bmm as bmm, bmm_transpose as bmm_transpose, matmul as matmul
from collections import OrderedDict as OrderedDict
from collections.abc import Mapping as Mapping
from typing import Any
@ -64,6 +64,8 @@ def clean() -> None: ...
cast = unary
def array(data, dtype: Any | None = ...): ...
def random(shape, dtype: str = ..., type: str = ...): ...
def float_auto(x): ...
def array64(data, dtype: Any | None = ...): ...
def grad(loss, targets): ...
def liveness_info(): ...
@ -85,7 +87,6 @@ origin_transpose = transpose
def transpose(x, *dim): ...
permute = transpose
def flatten(input, start_dim: int = ..., end_dim: int = ...): ...
def start_grad(x): ...
def detach(x): ...
def unsqueeze(x, dim): ...
def squeeze(x, dim): ...
@ -149,6 +150,11 @@ class Module:
is_train: bool
def is_training(self) -> bool: ...
def mpi_param_broadcast(self, root: int = ...) -> None: ...
def __setattr__(self, key, value) -> None: ...
def __getattr__(self, key): ...
def float16(self) -> None: ...
def half(self) -> None: ...
def float_auto(self) -> None: ...
class Function(Module):
input_mask: Any
@ -187,6 +193,7 @@ def to_float(v): ...
def to_bool(v): ...
def format(v, spec): ...
def get_len(var): ...
half = float16
def is_var(v): ...
from typing import List, Tuple, Callable, overload
@ -374,7 +381,7 @@ def index(shape: Tuple[int], dim: int, dtype: str="int32")-> Var:
# output: [[0,1],[0,1]]'''
...
@overload
def index(shape: Tuple[int], dtype: str="int32")-> List[Var]:
def index(shape: Tuple[int], dtype: str="int32"):
'''Document:
*
Index Operator generate index of shape.
@ -428,7 +435,7 @@ def index(a: Var, dim: int, dtype: str="int32")-> Var:
# output: [[0,1],[0,1]]'''
...
@overload
def index(a: Var, dtype: str="int32")-> List[Var]:
def index(a: Var, dtype: str="int32"):
'''Document:
*
Index Operator generate index of shape.
@ -461,7 +468,7 @@ def index_var(a: Var, dim: int, dtype: str="int32")-> Var:
jt.index_var(a, 1) similar with jt.index(a.shape, 1)'''
...
@overload
def index_var(a: Var, dtype: str="int32")-> List[Var]:
def index_var(a: Var, dtype: str="int32"):
'''Document:
* shape dependency version of index op
jt.index_var(a, 1) similar with jt.index(a.shape, 1)'''
@ -824,7 +831,7 @@ def bitwise_xor(x: Var, y: Var)-> Var:
...
def tape(x: Var)-> Var:
...
def where(cond: Var, dtype: str="int32")-> List[Var]:
def where(cond: Var, dtype: str="int32"):
'''Document:
*
Where Operator generate index of true condition.
@ -838,9 +845,9 @@ def where(cond: Var, dtype: str="int32")-> List[Var]:
Example::
jt.where([[0,0,1],[1,0,0]])
# return ( [0,2], [1,0] )'''
# return [jt.Var([0 1], dtype=int32), jt.Var([2 0], dtype=int32)]'''
...
def argsort(x: Var, dim: int=-1, descending: bool=False, dtype: str="int32")-> List[Var]:
def argsort(x: Var, dim: int=-1, descending: bool=False, dtype: str="int32"):
'''Document:
*
Argsort Operator Perform an indirect sort by given key or compare function.
@ -883,7 +890,7 @@ def argsort(x: Var, dim: int=-1, descending: bool=False, dtype: str="int32")-> L
...
def fetch(inputs: List[Var], func: Callable)-> Var:
...
def arg_reduce(x: Var, op: str, dim: int, keepdims: bool)-> List[Var]:
def arg_reduce(x: Var, op: str, dim: int, keepdims: bool):
'''Document:
*
Returns the indices of the maximum / minimum of the input across a dimension.
@ -908,7 +915,7 @@ def arg_reduce(x: Var, op: str, dim: int, keepdims: bool)-> List[Var]:
>>> jt.arg_reduce(x, 'max', dim=1, keepdims=False)
[jt.Var([2 1], dtype=int32), jt.Var([5 7], dtype=int32)]
>>> jt.arg_reduce(x, 'min', dim=1, keepdims=False)
[jt.Var([1 2], dtype=int32), jt.Var([5 7], dtype=int32)]'''
[jt.Var([1 2], dtype=int32), jt.Var([2 1], dtype=int32)]'''
...
def random(shape: Tuple[int], dtype: str="float32", type: str="uniform")-> Var:
...
@ -2278,6 +2285,8 @@ def uint32(x: Var)-> Var:
...
def uint64(x: Var)-> Var:
...
def float16(x: Var)-> Var:
...
def float32(x: Var)-> Var:
...
def float64(x: Var)-> Var:
@ -2870,6 +2879,8 @@ def erf(x: Var)-> Var:
>>> jt.erf(a)
jt.Var([ 0.51559156 0.45739546 -0.85728306 -0.9258883 ], dtype=float32)'''
...
def erfinv(x: Var)-> Var:
...
def transpose(x: Var, axes: Tuple[int]=())-> Var:
...
def fuse_transpose(x: Var, axes: Tuple[int]=())-> Var:
@ -3005,7 +3016,7 @@ def numpy_code(shape: Tuple[int], dtype: str, inputs: List[Var], forward: Callab
)'''
...
@overload
def numpy_code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var], forward: Callable, backward: List[Callable])-> List[Var]:
def numpy_code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var], forward: Callable, backward: List[Callable]):
'''Document:
*
Numpy Code Operator for easily customized op.
@ -3151,7 +3162,7 @@ def numpy_code(shape: Tuple[int], dtype: str, inputs: List[Var], forward: Callab
)'''
...
@overload
def numpy_code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var], forward: Callable)-> List[Var]:
def numpy_code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var], forward: Callable):
'''Document:
*
Numpy Code Operator for easily customized op.
@ -3345,6 +3356,23 @@ def code(shape: Tuple[int], dtype: str, inputs: List[Var]={}, cpu_src: str="", c
assert (b.data == [5,3,1]).all()
assert (c.data == [-4,-2]).all()
Example-5::
# This example shows how to customize code op
# compilation flags, such as adding an include search
# path, adding definitions, or any other command line options
a = jt.random([10])
b = jt.code(a.shape, a.dtype, [a],
cpu_src="""
@out0(0) = HAHAHA;
""")
# HAHAHA is defined in flags below
# /any/include/path can be changed to any path you want to include
b.compile_options = {"FLAGS: -DHAHAHA=233 -I/any/include/path ": 1}
print(b[0])
# will output 233
CUDA Example-1::
@ -3435,7 +3463,7 @@ def code(shape: Tuple[int], dtype: str, inputs: List[Var]={}, cpu_src: str="", c
print(jt.grad(c, [a, b]))'''
...
@overload
def code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var]={}, cpu_src: str="", cpu_grad_src: List[str]={}, cpu_header: str="", cuda_src: str="", cuda_grad_src: List[str]={}, cuda_header: str="")-> List[Var]:
def code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var]={}, cpu_src: str="", cpu_grad_src: List[str]={}, cpu_header: str="", cuda_src: str="", cuda_grad_src: List[str]={}, cuda_header: str=""):
'''Document:
*
Code Operator for easily customized op.
@ -3556,6 +3584,23 @@ def code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var]={}, cpu_
assert (b.data == [5,3,1]).all()
assert (c.data == [-4,-2]).all()
Example-5::
# This example shows how to customize code op
# compilation flags, such as adding an include search
# path, adding definitions, or any other command line options
a = jt.random([10])
b = jt.code(a.shape, a.dtype, [a],
cpu_src="""
@out0(0) = HAHAHA;
""")
# HAHAHA is defined in flags below
# /any/include/path can be changed to any path you want to include
b.compile_options = {"FLAGS: -DHAHAHA=233 -I/any/include/path ": 1}
print(b[0])
# will output 233
CUDA Example-1::
@ -3646,7 +3691,7 @@ def code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var]={}, cpu_
print(jt.grad(c, [a, b]))'''
...
@overload
def code(inputs: List[Var], outputs: List[Var], cpu_src: str="", cpu_grad_src: List[str]={}, cpu_header: str="", cuda_src: str="", cuda_grad_src: List[str]={}, cuda_header: str="")-> List[Var]:
def code(inputs: List[Var], outputs: List[Var], cpu_src: str="", cpu_grad_src: List[str]={}, cpu_header: str="", cuda_src: str="", cuda_grad_src: List[str]={}, cuda_header: str=""):
'''Document:
*
Code Operator for easily customized op.
@ -3767,6 +3812,23 @@ def code(inputs: List[Var], outputs: List[Var], cpu_src: str="", cpu_grad_src: L
assert (b.data == [5,3,1]).all()
assert (c.data == [-4,-2]).all()
Example-5::
# This example shows how to customize code op
# compilation flags, such as adding an include search
# path, adding definitions, or any other command line options
a = jt.random([10])
b = jt.code(a.shape, a.dtype, [a],
cpu_src="""
@out0(0) = HAHAHA;
""")
# HAHAHA is defined in flags below
# /any/include/path can be changed to any path you want to include
b.compile_options = {"FLAGS: -DHAHAHA=233 -I/any/include/path ": 1}
print(b[0])
# will output 233
CUDA Example-1::
@ -4239,7 +4301,7 @@ class Var:
# output: [[0,1],[0,1]]'''
...
@overload
def index(self, dtype: str="int32")-> List[Var]:
def index(self, dtype: str="int32"):
'''Document:
*
Index Operator generate index of shape.
@ -4272,7 +4334,7 @@ class Var:
jt.index_var(a, 1) similar with jt.index(a.shape, 1)'''
...
@overload
def index_var(self, dtype: str="int32")-> List[Var]:
def index_var(self, dtype: str="int32"):
'''Document:
* shape dependency version of index op
jt.index_var(a, 1) similar with jt.index(a.shape, 1)'''
@ -4633,7 +4695,7 @@ class Var:
* [in] y: the second input, jt.Var (integal or boolean).'''
...
def tape(self)-> Var: ...
def where(self, dtype: str="int32")-> List[Var]:
def where(self, dtype: str="int32"):
'''Document:
*
Where Operator generate index of true condition.
@ -4647,9 +4709,9 @@ class Var:
Example::
jt.where([[0,0,1],[1,0,0]])
# return ( [0,2], [1,0] )'''
# return [jt.Var([0 1], dtype=int32), jt.Var([2 0], dtype=int32)]'''
...
def argsort(self, dim: int=-1, descending: bool=False, dtype: str="int32")-> List[Var]:
def argsort(self, dim: int=-1, descending: bool=False, dtype: str="int32"):
'''Document:
*
Argsort Operator Perform an indirect sort by given key or compare function.
@ -4691,7 +4753,7 @@ class Var:
# return [[0 1 0],[1 0 1]], [[11 11 12],[12 13 13]]'''
...
def fetch(self, func: Callable)-> Var: ...
def arg_reduce(self, op: str, dim: int, keepdims: bool)-> List[Var]:
def arg_reduce(self, op: str, dim: int, keepdims: bool):
'''Document:
*
Returns the indices of the maximum / minimum of the input across a dimension.
@ -6059,6 +6121,7 @@ class Var:
def uint16(self)-> Var: ...
def uint32(self)-> Var: ...
def uint64(self)-> Var: ...
def float16(self)-> Var: ...
def float32(self)-> Var: ...
def float64(self)-> Var: ...
def abs(self)-> Var:
@ -6649,6 +6712,7 @@ class Var:
>>> jt.erf(a)
jt.Var([ 0.51559156 0.45739546 -0.85728306 -0.9258883 ], dtype=float32)'''
...
def erfinv(self)-> Var: ...
def transpose(self, axes: Tuple[int]=())-> Var: ...
def fuse_transpose(self, axes: Tuple[int]=())-> Var: ...
def safe_clip(self, left: float, right: float)-> Var:
@ -6705,7 +6769,7 @@ class Var:
# x[y[0], 1] <= x[y[1], 1] and x[y[1], 1] <= x[y[2], 1] and ... and x[y[m-2], 1] <= x[y[m-1], 1]'''
...
@overload
def code(self, outputs: List[Var], cpu_src: str="", cpu_grad_src: List[str]={}, cpu_header: str="", cuda_src: str="", cuda_grad_src: List[str]={}, cuda_header: str="")-> List[Var]:
def code(self, outputs: List[Var], cpu_src: str="", cpu_grad_src: List[str]={}, cpu_header: str="", cuda_src: str="", cuda_grad_src: List[str]={}, cuda_header: str=""):
'''Document:
*
Code Operator for easily customized op.
@ -6826,6 +6890,23 @@ class Var:
assert (b.data == [5,3,1]).all()
assert (c.data == [-4,-2]).all()
Example-5::
# This example shows how to customize code op
# compilation flags, such as adding an include search
# path, adding definitions, or any other command line options
a = jt.random([10])
b = jt.code(a.shape, a.dtype, [a],
cpu_src="""
@out0(0) = HAHAHA;
""")
# HAHAHA is defined in flags below
# /any/include/path can be changed to any path you want to include
b.compile_options = {"FLAGS: -DHAHAHA=233 -I/any/include/path ": 1}
print(b[0])
# will output 233
CUDA Example-1::
@ -7177,6 +7258,11 @@ class Var:
*
* return True if operator fusion is stopped.'''
...
def start_grad(self)-> Var:
'''Document:
*
* enable the gradient calculation for the Var.'''
...
def item(self)-> float | int | bool:
'''Document:
*
@ -7266,7 +7352,13 @@ class Var:
[1 8 1 1 2 2]], dtype=int32)'''
...
def permute(self, x: Var, axes: Tuple[int]=())-> Var: ...
def detach_inplace(self)-> Var:
'''Document:
*
* enable the gradient calculation for the Var.'''
...
def astype(self, x: Var, op: str)-> Var: ...
def half(self, x: Var)-> Var: ...
def expand_as(self, x: Var, y: Var, dims: Tuple[int]=())-> Var:
'''Document:
*
@ -7310,8 +7402,12 @@ class Flags:
'''A set of flags to configure jittor running behaviors'''
addr2line_path: str
'''Path of addr2line. Default: ""'''
amp_reg: int
'''Auto mixed-precision control registers, bit 0: prefer 32; bit 1: prefer 16; bit 2: keep reduce type; bit 3: keep white list type; bit 4: array-like ops prefer it too. Default: 0'''
auto_convert_64_to_32: int
'''auto convert 64bit numpy array into 32bit jittor array. Default: 1'''
auto_mixed_precision_level: int
'''Auto mixed-precision optimization level, 0: do not use fp16; 1-3: preserve levels, do not use fp16 for now; 4: prefer fp16, but some ops use fp32, e.g. sum, exp; 5: similar to 4, and array ops automatically convert to fp16; 6: all ops prefer fp16. Default: 0'''
cache_path: str
'''Cache path of jittor. Default: ""'''
cc_flags: str
@ -7324,10 +7420,12 @@ class Flags:
'''Unify graph sanity check. Default: 0'''
compile_options: Any
'''Override the default loop transform options. Default: {}'''
disable_lock: bool
'''Disable file lock. Default: 0'''
enable_tuner: int
'''Enable tuner. Default: 1'''
exclude_pass: str
'''Don't run certian pass. Default: ""'''
'''Don't run certain pass. Default: ""'''
extra_gdb_cmd: str
'''Extra commands passed to GDB, separated by ";". Default: ""'''
gdb_attach: int
@ -7352,6 +7450,8 @@ class Flags:
'''Default enabled; if disabled, use immediate eager execution rather than lazy execution. This flag makes error messages and traceback information better, but it raises memory consumption and lowers performance. Default: 1'''
log_file: str
'''log to file, mpi env will add $OMPI_COMM_WORLD_RANK suffix. Default: ""'''
log_op_hash: str
'''Output compiler pass result of certain hash of op. Default: ""'''
log_silent: int
'''The log will be completely silent. Default: 0'''
log_sync: int
@ -7376,6 +7476,10 @@ class Flags:
'''Enable profiler. Default: 0'''
profiler_hide_relay: int
'''Profiler hide relayed op. Default: 0'''
profiler_record_peek: int
'''Profiler record peek mem bandwidth. Default: 0'''
profiler_record_shape: int
'''Profiler record shape for op. Default: 0'''
profiler_rerun: int
'''Profiler rerun. Default: 0'''
profiler_warmup: int
@ -7402,8 +7506,12 @@ class Flags:
'''If not overflow, try to use 32 bit type as index type. Default: 0'''
update_queue_auto_flush_delay: int
'''When the size of an update queue is greater than this value, the update queue triggers an auto flush. Default: 2'''
use_acl: int
'''Use the ACL (Ascend) backend or not; alias of use_cuda. 1 for trying to use it, 2 for forcing it. Default: 0'''
use_cuda: int
'''Use cuda or not. 1 for trying to use cuda, 2 for forcing to use cuda. Default: 0'''
use_device: int
'''Select the accelerator device; alias of use_cuda (and use_acl). 1 for trying to use the device, 2 for forcing it. Default: 0'''
use_nfef_allocator: int
'''Enable never free exact fit allocator. Default: 0'''
use_parallel_op_compiler: int
@ -7414,5 +7522,7 @@ class Flags:
'''Enable stat allocator. Default: 0'''
use_temp_allocator: int
'''Enable temp allocator. Default: 1'''
use_tensorcore: int
'''use tensor core. Default: 0'''
flags: Flags
'''Jittor running time flags instance'''
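A hedged sketch combining the flags documented above (amp_reg, use_tensorcore, and the use_device/use_acl aliases of use_cuda); the float16 result is the expected outcome, not a guarantee on every backend:
import jittor as jt

jt.flags.use_cuda = jt.has_cuda       # use_device / use_acl alias the same value
jt.flags.use_tensorcore = 1           # allow tensor-core paths in cuBLAS/cuDNN
jt.flags.amp_reg = 2                  # bit 1: prefer float16
a = jt.random([16, 16]).float_auto()
b = jt.random([16, 16]).float_auto()
print(jt.matmul(a, b).dtype)          # expected: float16
jt.flags.amp_reg = 0
jt.flags.use_tensorcore = 0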


@ -9,6 +9,7 @@ import platform
from .compiler import *
from jittor_utils import run_cmd, get_version, get_int_version
from jittor_utils.misc import download_url_to_local
import jittor_utils as jit_utils
def search_file(dirs, name, prefer_version=()):
if os.name == 'nt':
@ -110,8 +111,7 @@ def setup_mkl():
LOG.v("setup mkl...")
# mkl_path = os.path.join(cache_path, "mkl")
# mkl_path decouple with cc_path
from pathlib import Path
mkl_path = os.path.join(str(Path.home()), ".cache", "jittor", "mkl")
mkl_path = os.path.join(jit_utils.home(), ".cache", "jittor", "mkl")
make_cache_dir(mkl_path)
install_mkl(mkl_path)
@ -141,12 +141,12 @@ def setup_mkl():
elif platform.system() == 'Darwin':
mkl_lib_paths = [
"/usr/local/lib/libmkldnn.dylib", # x86_64
"/opt/homebrew/lib/libmkldnn.dylib", # arm64
"/usr/local/lib/libdnnl.dylib", # x86_64
"/opt/homebrew/lib/libdnnl.dylib", # arm64
]
if not any([os.path.exists(lib) for lib in mkl_lib_paths]):
raise RuntimeError("onednn not found, please install it with the command 'brew install onednn'")
extra_flags = f" -lmkldnn "
extra_flags = f" -ldnnl "
mkl_op_dir = os.path.join(jittor_path, "extern", "mkl", "ops")
mkl_op_files = [os.path.join(mkl_op_dir, name) for name in os.listdir(mkl_op_dir)]
@ -178,8 +178,7 @@ def install_cub(root_folder):
def setup_cub():
global cub_home
cub_home = ""
from pathlib import Path
cub_path = os.path.join(str(Path.home()), ".cache", "jittor", "cub")
cub_path = os.path.join(jit_utils.home(), ".cache", "jittor", "cub")
cuda_version = int(get_version(nvcc_path)[1:-1].split('.')[0])
extra_flags = ""
if cuda_version < 11:
@ -221,6 +220,12 @@ def setup_cuda_extern():
LOG.w(f"CUDA found but cub is not loaded:\n{line}")
libs = ["cublas", "cudnn", "curand", "cufft"]
# in cuda 11.4, module memory consumptions:
# default context: 259 MB
# cublas: 340 MB
# cudnn: 340 MB
if int(os.environ.get("conv_opt", "0")):
libs = ["cublas", "curand"]
for lib_name in libs:
try:
setup_cuda_lib(lib_name, extra_flags=link_cuda_extern)
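As a sketch, the conv_opt switch above is an ordinary environment variable read at import time, so it has to be set before jittor is imported:
import os
os.environ["conv_opt"] = "1"   # only set up cublas/curand, skip cudnn/cufft
import jittor as jt            # must come after setting the variable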
@ -320,7 +325,11 @@ def install_cutt(root_folder):
if md5 != true_md5:
os.remove(fullname)
shutil.rmtree(dirname)
if not os.path.isfile(os.path.join(cache_path, "libcutt"+so)):
CUTT_PATH = os.environ.get("CUTT_PATH", "")
if not os.path.isfile(os.path.join(cache_path, "libcutt"+so)) or CUTT_PATH:
if CUTT_PATH:
dirname = CUTT_PATH
else:
LOG.i("Downloading cutt...")
download_url_to_local(url, filename, root_folder, true_md5)
@ -335,7 +344,8 @@ def install_cutt(root_folder):
zf.close()
LOG.i("installing cutt...")
arch_flag = ""
# -Xptxas -dlcm=ca actually does not work
arch_flag = " -Xptxas -dlcm=ca "
if len(flags.cuda_archs):
arch_flag = f" -arch=compute_{min(flags.cuda_archs)} "
arch_flag += ''.join(map(lambda x:f' -code=sm_{x} ', flags.cuda_archs))
@ -365,8 +375,7 @@ def setup_cutt():
if cutt_lib_path is None or cutt_include_path is None:
LOG.v("setup cutt...")
# cutt_path decouple with cc_path
from pathlib import Path
cutt_path = os.path.join(str(Path.home()), ".cache", "jittor", "cutt")
cutt_path = os.path.join(jit_utils.home(), ".cache", "jittor", "cutt")
make_cache_dir(cutt_path)
install_cutt(cutt_path)
@ -442,8 +451,7 @@ def setup_nccl():
if nccl_lib_path is None or nccl_include_path is None:
LOG.v("setup nccl...")
# nccl_path decouple with cc_path
from pathlib import Path
nccl_path = os.path.join(str(Path.home()), ".cache", "jittor", "nccl")
nccl_path = os.path.join(jit_utils.home(), ".cache", "jittor", "nccl")
make_cache_dir(nccl_path)
nccl_home = install_nccl(nccl_path)


@ -19,7 +19,7 @@ from ctypes import cdll
from ctypes.util import find_library
import jittor_utils as jit_utils
from jittor_utils import LOG, run_cmd, cache_path, find_exe, cc_path, cc_type, cache_path
from jittor_utils import LOG, run_cmd, find_exe, cc_path, cc_type, cache_path
from . import pyjt_compiler
from jittor_utils import lock
from jittor_utils import install_cuda
@ -228,13 +228,20 @@ def gen_jit_flags():
continue
visit[name] = 1
jit_declares.append(f"DECLARE_FLAG({type}, {name});")
alias = []
if name == "use_cuda":
alias = ["use_device", "use_acl"]
elif name == "auto_mixed_precision_level":
alias = ["amp_level"]
get_names = ",".join(["__get__"+a for a in [name]+alias])
set_names = ",".join(["__set__"+a for a in [name]+alias])
flags_defs.append(f"""
/* {name}(type:{type}, default:{default}): {doc} */
// @pyjt(__get__{name})
// @pyjt({get_names})
{type} _get_{name}() {{ return {name}; }}
// @pyjt(__set__{name})
// @pyjt({set_names})
void _set_{name}({type} v) {{ set_{name}(v); }}
{f'''// @pyjt(__set__{name})
{f'''// @pyjt({set_names})
void _set_{name}(bool v) {{ set_{name}(v); }}
''' if type=="int" else ""}
""")
@ -843,7 +850,7 @@ def check_cuda():
# this nvcc is install by package manager
cuda_lib = "/usr/lib/x86_64-linux-gnu"
cuda_include2 = os.path.join(jittor_path, "extern","cuda","inc")
cc_flags += f" -DHAS_CUDA -I\"{cuda_include}\" -I\"{cuda_include2}\" "
cc_flags += f" -DHAS_CUDA -DIS_CUDA -I\"{cuda_include}\" -I\"{cuda_include2}\" "
if os.name == 'nt':
cuda_lib = os.path.abspath(os.path.join(cuda_dir, "..", "lib", "x64"))
# cc_flags += f" \"{cuda_lib}\\cudart.lib\" "
@ -1212,6 +1219,14 @@ if has_cuda:
return nvcc_flags
nvcc_flags = convert_nvcc_flags(nvcc_flags)
# from .acl_compiler import check_acl
from .extern.acl import acl_compiler
jit_utils.add_backend(acl_compiler)
for mod in jit_utils.backends:
if mod.check():
break
# build core
gen_jit_flags()
gen_jit_tests()
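A small sketch of the intended effect of the backend registration and flag-alias generation above (gen_jit_flags emits the use_device/use_acl/amp_level aliases): the aliased names read and write the same underlying flag.
import jittor as jt
assert jt.flags.use_device == jt.flags.use_cuda   # use_device / use_acl alias use_cuda
jt.flags.amp_level = 3                            # alias of auto_mixed_precision_level
assert jt.flags.auto_mixed_precision_level == 3
jt.flags.amp_level = 0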
@ -1236,6 +1251,8 @@ files4 = [ f[len(jittor_path)+1:] for f in files4 ]
# files4 = run_cmd('find -L src | grep '+grep_args, jittor_path).splitlines()
at_beginning = [
"src/ops/op_utils.cc",
"src/ops/op_register.cc",
"src/init.cc",
"src/event_queue.cc",
"src/mem/allocator/sfrl_allocator.cc",
"src/mem/allocator.cc",


@ -21,8 +21,9 @@ import signal
from jittor_utils import LOG
import jittor as jt
import time
import jittor_utils as jit_utils
dataset_root = os.path.join(pathlib.Path.home(), ".cache", "jittor", "dataset")
dataset_root = os.path.join(jit_utils.home(), ".cache", "jittor", "dataset")
mp_log_v = os.environ.get("mp_log_v", 0)
mpi = jt.mpi
img_open_hook = HookTimer(Image, "open")


@ -0,0 +1,107 @@
import jittor as jt
from jittor import nn
import numpy as np
# import pylab as pl
# length of the latent vector
latent_dim = 100
# number of classes
n_classes = 10
# image size
img_size = 32
# number of image channels
channels = 1
# shape of the image tensor
img_shape = (channels, img_size, img_size)
class Generator(nn.Module):
def __init__(self):
super(Generator, self).__init__()
self.label_emb = nn.Embedding(n_classes, n_classes)
def block(in_feat, out_feat, normalize=True):
layers = [nn.Linear(in_feat, out_feat)]
if normalize:
layers.append(nn.BatchNorm1d(out_feat, 0.8))
layers.append(nn.LeakyReLU(0.2))
return layers
self.model = nn.Sequential(
*block((latent_dim + n_classes), 128, normalize=False),
*block(128, 256),
*block(256, 512),
*block(512, 1024),
nn.Linear(1024, int(np.prod(img_shape))),
nn.Tanh())
def execute(self, noise, labels):
gen_input = jt.contrib.concat((self.label_emb(labels), noise), dim=1)
img = self.model(gen_input)
img = img.view((img.shape[0], *img_shape))
return img
class Discriminator(nn.Module):
def __init__(self):
super(Discriminator, self).__init__()
self.label_embedding = nn.Embedding(n_classes, n_classes)
self.model = nn.Sequential(
nn.Linear((n_classes + int(np.prod(img_shape))), 512),
nn.LeakyReLU(0.2),
nn.Linear(512, 512),
nn.Dropout(0.4),
nn.LeakyReLU(0.2),
nn.Linear(512, 512),
nn.Dropout(0.4),
nn.LeakyReLU(0.2),
nn.Linear(512, 1))
def execute(self, img, labels):
d_in = jt.contrib.concat((img.view((img.shape[0], (- 1))), self.label_embedding(labels)), dim=1)
validity = self.model(d_in)
return validity
# define the models
generator = Generator()
discriminator = Discriminator()
generator.eval()
discriminator.eval()
# load the pretrained parameters
generator.load('https://cg.cs.tsinghua.edu.cn/jittor/assets/build/generator_last.pkl')
discriminator.load('https://cg.cs.tsinghua.edu.cn/jittor/assets/build/discriminator_last.pkl')
def gen_img(number):
print(number, type(number))
n_row = len(number)
z = jt.array(np.random.normal(0, 1, (n_row, latent_dim))).float32().stop_grad()
labels = jt.array(np.array([int(number[num]) for num in range(n_row)])).float32().stop_grad()
gen_imgs = generator(z,labels)
gen_imgs = gen_imgs.transpose((1,2,0,3)).reshape(gen_imgs.shape[2], -1)
gen_imgs = gen_imgs[:,:,None].broadcast(gen_imgs.shape+(3,)) # .uint8()
gen_imgs = (gen_imgs - gen_imgs.min()) / (gen_imgs.max() - gen_imgs.min()) * 255
gen_imgs = gen_imgs.uint8()
# print(gen_imgs.shape, gen_imgs.max(), gen_imgs.min())
return gen_imgs.numpy()
# gen_imgs = gen_imgs.data.transpose((1,2,0,3))[0].reshape((gen_imgs.shape[2], -1))
# print(gen_imgs.shape)
return gen_imgs[:,:,None]
from PIL import Image
import pywebio as pw
# define a string of digits
number = "201962517"
# gen_img(number)
Image.fromarray(gen_img(number))
# pl.imshow()
# pl.show()
# print("done")
def web_server():
pw.pin.put_input("number", label="Enter the digits to generate (powered by Jittor)")
pw.output.put_buttons(['Gen image'],
lambda _: pw.output.put_image(Image.fromarray(gen_img(pw.pin.pin.number))))
pw.start_server(web_server, port=8123)


@ -0,0 +1,54 @@
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Maintainers: Dun Liang <randonlang@gmail.com>.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
has_acl = 0
cc_flags = ""
tikcc_path = env_or_try_find('tikcc_path', 'tikcc')
dlopen_flags = os.RTLD_NOW | os.RTLD_GLOBAL
def install():
import jittor.compiler as compiler
global has_acl, cc_flags
acl_compiler_home = os.path.dirname(__file__)
cc_files = sorted(glob.glob(acl_compiler_home+"/**/*.cc", recursive=True))
cc_flags += f" -DHAS_CUDA -DIS_ACL -I/usr/local/Ascend/runtime/include -I/usr/local/Ascend/driver/include -L/usr/local/Ascend/compiler/lib64 -L/usr/local/Ascend/runtime/lib64 -I{acl_compiler_home} -ltikc_runtime -lascendcl "
ctypes.CDLL("libascendcl.so", dlopen_flags)
jittor_utils.LOG.i("ACL detected")
mod = jittor_utils.compile_module('''
#include "common.h"
namespace jittor {
// @pyjt(process)
string process_acl(const string& src, const string& name, const map<string,string>& kargs);
}''', compiler.cc_flags + " " + " ".join(cc_files) + cc_flags)
jittor_utils.process_jittor_source("acl", mod.process)
has_acl = 1
def check():
import jittor.compiler as compiler
global has_acl, cc_flags
if tikcc_path:
try:
install()
except Exception as e:
jittor_utils.LOG.w(f"load ACL failed, exception: {e}")
has_acl = 0
compiler.has_acl = has_acl
compiler.tikcc_path = tikcc_path
if not has_acl: return False
compiler.cc_flags += cc_flags
compiler.nvcc_path = tikcc_path
compiler.nvcc_flags = compiler.cc_flags.replace("-std=c++14","")
return True
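A hedged sketch of enabling this backend from Python; the tikcc path below is an assumed install location and must point to a real Ascend toolchain for check() to succeed:
import os
os.environ["tikcc_path"] = "/usr/local/Ascend/bin/tikcc"   # assumed, adjust to your install
import jittor as jt
import jittor.compiler as compiler
if compiler.has_acl:           # set by check() above
    jt.flags.use_acl = 1       # alias of use_cuda, selects the ACL device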


@ -0,0 +1,228 @@
// ***************************************************************
// Copyright (c) 2021 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#include "common.h"
using std::string;
using std::unordered_map;
typedef int aclError;
static inline unordered_map<aclError,string> gen_map(string s) {
unordered_map<aclError,string> smap;
for (int i=0; i<s.size(); i++) {
if (s[i] == ';') {
int j=s.rfind(" ", i);
int code = std::stoi(s.substr(j+1, i-j-1));
int k = s.rfind(" ", j-1);
int l = s.rfind(" ACL_", k-1);
smap[code] = s.substr(l+1, k-l-1);
}
}
return smap;
}
string acl_error_to_string(aclError error) {
static unordered_map<aclError,string> acl_error_map = gen_map(R"(
// from acl_base.h
static const int ACL_ERROR_INVALID_PARAM = 100000;
static const int ACL_ERROR_UNINITIALIZE = 100001;
static const int ACL_ERROR_REPEAT_INITIALIZE = 100002;
static const int ACL_ERROR_INVALID_FILE = 100003;
static const int ACL_ERROR_WRITE_FILE = 100004;
static const int ACL_ERROR_INVALID_FILE_SIZE = 100005;
static const int ACL_ERROR_PARSE_FILE = 100006;
static const int ACL_ERROR_FILE_MISSING_ATTR = 100007;
static const int ACL_ERROR_FILE_ATTR_INVALID = 100008;
static const int ACL_ERROR_INVALID_DUMP_CONFIG = 100009;
static const int ACL_ERROR_INVALID_PROFILING_CONFIG = 100010;
static const int ACL_ERROR_INVALID_MODEL_ID = 100011;
static const int ACL_ERROR_DESERIALIZE_MODEL = 100012;
static const int ACL_ERROR_PARSE_MODEL = 100013;
static const int ACL_ERROR_READ_MODEL_FAILURE = 100014;
static const int ACL_ERROR_MODEL_SIZE_INVALID = 100015;
static const int ACL_ERROR_MODEL_MISSING_ATTR = 100016;
static const int ACL_ERROR_MODEL_INPUT_NOT_MATCH = 100017;
static const int ACL_ERROR_MODEL_OUTPUT_NOT_MATCH = 100018;
static const int ACL_ERROR_MODEL_NOT_DYNAMIC = 100019;
static const int ACL_ERROR_OP_TYPE_NOT_MATCH = 100020;
static const int ACL_ERROR_OP_INPUT_NOT_MATCH = 100021;
static const int ACL_ERROR_OP_OUTPUT_NOT_MATCH = 100022;
static const int ACL_ERROR_OP_ATTR_NOT_MATCH = 100023;
static const int ACL_ERROR_OP_NOT_FOUND = 100024;
static const int ACL_ERROR_OP_LOAD_FAILED = 100025;
static const int ACL_ERROR_UNSUPPORTED_DATA_TYPE = 100026;
static const int ACL_ERROR_FORMAT_NOT_MATCH = 100027;
static const int ACL_ERROR_BIN_SELECTOR_NOT_REGISTERED = 100028;
static const int ACL_ERROR_KERNEL_NOT_FOUND = 100029;
static const int ACL_ERROR_BIN_SELECTOR_ALREADY_REGISTERED = 100030;
static const int ACL_ERROR_KERNEL_ALREADY_REGISTERED = 100031;
static const int ACL_ERROR_INVALID_QUEUE_ID = 100032;
static const int ACL_ERROR_REPEAT_SUBSCRIBE = 100033;
static const int ACL_ERROR_STREAM_NOT_SUBSCRIBE = 100034;
static const int ACL_ERROR_THREAD_NOT_SUBSCRIBE = 100035;
static const int ACL_ERROR_WAIT_CALLBACK_TIMEOUT = 100036;
static const int ACL_ERROR_REPEAT_FINALIZE = 100037;
static const int ACL_ERROR_NOT_STATIC_AIPP = 100038;
static const int ACL_ERROR_COMPILING_STUB_MODE = 100039;
static const int ACL_ERROR_GROUP_NOT_SET = 100040;
static const int ACL_ERROR_GROUP_NOT_CREATE = 100041;
static const int ACL_ERROR_PROF_ALREADY_RUN = 100042;
static const int ACL_ERROR_PROF_NOT_RUN = 100043;
static const int ACL_ERROR_DUMP_ALREADY_RUN = 100044;
static const int ACL_ERROR_DUMP_NOT_RUN = 100045;
static const int ACL_ERROR_PROF_REPEAT_SUBSCRIBE = 148046;
static const int ACL_ERROR_PROF_API_CONFLICT = 148047;
static const int ACL_ERROR_INVALID_MAX_OPQUEUE_NUM_CONFIG = 148048;
static const int ACL_ERROR_INVALID_OPP_PATH = 148049;
static const int ACL_ERROR_OP_UNSUPPORTED_DYNAMIC = 148050;
static const int ACL_ERROR_RELATIVE_RESOURCE_NOT_CLEARED = 148051;
static const int ACL_ERROR_BAD_ALLOC = 200000;
static const int ACL_ERROR_API_NOT_SUPPORT = 200001;
static const int ACL_ERROR_INVALID_DEVICE = 200002;
static const int ACL_ERROR_MEMORY_ADDRESS_UNALIGNED = 200003;
static const int ACL_ERROR_RESOURCE_NOT_MATCH = 200004;
static const int ACL_ERROR_INVALID_RESOURCE_HANDLE = 200005;
static const int ACL_ERROR_FEATURE_UNSUPPORTED = 200006;
static const int ACL_ERROR_PROF_MODULES_UNSUPPORTED = 200007;
static const int ACL_ERROR_STORAGE_OVER_LIMIT = 300000;
static const int ACL_ERROR_INTERNAL_ERROR = 500000;
static const int ACL_ERROR_FAILURE = 500001;
static const int ACL_ERROR_GE_FAILURE = 500002;
static const int ACL_ERROR_RT_FAILURE = 500003;
static const int ACL_ERROR_DRV_FAILURE = 500004;
static const int ACL_ERROR_PROFILING_FAILURE = 500005;
// from ge_error_codes.h
static const uint32_t ACL_ERROR_GE_PARAM_INVALID = 145000U;
static const uint32_t ACL_ERROR_GE_EXEC_NOT_INIT = 145001U;
static const uint32_t ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID = 145002U;
static const uint32_t ACL_ERROR_GE_EXEC_MODEL_ID_INVALID = 145003U;
static const uint32_t ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID = 145006U;
static const uint32_t ACL_ERROR_GE_EXEC_MODEL_ADDR_INVALID = 145007U;
static const uint32_t ACL_ERROR_GE_EXEC_MODEL_QUEUE_ID_INVALID = 145008U;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_MODEL_REPEATED = 145009U;
static const uint32_t ACL_ERROR_GE_DYNAMIC_INPUT_ADDR_INVALID = 145011U;
static const uint32_t ACL_ERROR_GE_DYNAMIC_INPUT_LENGTH_INVALID = 145012U;
static const uint32_t ACL_ERROR_GE_DYNAMIC_BATCH_SIZE_INVALID = 145013U;
static const uint32_t ACL_ERROR_GE_AIPP_BATCH_EMPTY = 145014U;
static const uint32_t ACL_ERROR_GE_AIPP_NOT_EXIST = 145015U;
static const uint32_t ACL_ERROR_GE_AIPP_MODE_INVALID = 145016U;
static const uint32_t ACL_ERROR_GE_OP_TASK_TYPE_INVALID = 145017U;
static const uint32_t ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID = 145018U;
static const uint32_t ACL_ERROR_GE_PLGMGR_PATH_INVALID = 145019U;
static const uint32_t ACL_ERROR_GE_FORMAT_INVALID = 145020U;
static const uint32_t ACL_ERROR_GE_SHAPE_INVALID = 145021U;
static const uint32_t ACL_ERROR_GE_DATATYPE_INVALID = 145022U;
static const uint32_t ACL_ERROR_GE_MEMORY_ALLOCATION = 245000U;
static const uint32_t ACL_ERROR_GE_MEMORY_OPERATE_FAILED = 245001U;
static const uint32_t ACL_ERROR_GE_INTERNAL_ERROR = 545000U;
static const uint32_t ACL_ERROR_GE_LOAD_MODEL = 545001U;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_MODEL_PARTITION_FAILED = 545002U;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_WEIGHT_PARTITION_FAILED = 545003U;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_TASK_PARTITION_FAILED = 545004U;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_KERNEL_PARTITION_FAILED = 545005U;
static const uint32_t ACL_ERROR_GE_EXEC_RELEASE_MODEL_DATA = 545006U;
static const uint32_t ACL_ERROR_GE_COMMAND_HANDLE = 545007U;
static const uint32_t ACL_ERROR_GE_GET_TENSOR_INFO = 545008U;
static const uint32_t ACL_ERROR_GE_UNLOAD_MODEL = 545009U;
static const int32_t ACL_ERROR_RT_PARAM_INVALID = 107000; // param invalid
static const int32_t ACL_ERROR_RT_INVALID_DEVICEID = 107001; // invalid device id
static const int32_t ACL_ERROR_RT_CONTEXT_NULL = 107002; // current context null
static const int32_t ACL_ERROR_RT_STREAM_CONTEXT = 107003; // stream not in current context
static const int32_t ACL_ERROR_RT_MODEL_CONTEXT = 107004; // model not in current context
static const int32_t ACL_ERROR_RT_STREAM_MODEL = 107005; // stream not in model
static const int32_t ACL_ERROR_RT_EVENT_TIMESTAMP_INVALID = 107006; // event timestamp invalid
static const int32_t ACL_ERROR_RT_EVENT_TIMESTAMP_REVERSAL = 107007; // event timestamp reversal
static const int32_t ACL_ERROR_RT_ADDR_UNALIGNED = 107008; // memory address unaligned
static const int32_t ACL_ERROR_RT_FILE_OPEN = 107009; // open file failed
static const int32_t ACL_ERROR_RT_FILE_WRITE = 107010; // write file failed
static const int32_t ACL_ERROR_RT_STREAM_SUBSCRIBE = 107011; // error subscribe stream
static const int32_t ACL_ERROR_RT_THREAD_SUBSCRIBE = 107012; // error subscribe thread
static const int32_t ACL_ERROR_RT_GROUP_NOT_SET = 107013; // group not set
static const int32_t ACL_ERROR_RT_GROUP_NOT_CREATE = 107014; // group not create
static const int32_t ACL_ERROR_RT_STREAM_NO_CB_REG = 107015; // callback not register to stream
static const int32_t ACL_ERROR_RT_INVALID_MEMORY_TYPE = 107016; // invalid memory type
static const int32_t ACL_ERROR_RT_INVALID_HANDLE = 107017; // invalid handle
static const int32_t ACL_ERROR_RT_INVALID_MALLOC_TYPE = 107018; // invalid malloc type
static const int32_t ACL_ERROR_RT_WAIT_TIMEOUT = 107019; // wait timeout
static const int32_t ACL_ERROR_RT_FEATURE_NOT_SUPPORT = 207000; // feature not support
static const int32_t ACL_ERROR_RT_MEMORY_ALLOCATION = 207001; // memory allocation error
static const int32_t ACL_ERROR_RT_MEMORY_FREE = 207002; // memory free error
static const int32_t ACL_ERROR_RT_AICORE_OVER_FLOW = 207003; // aicore over flow
static const int32_t ACL_ERROR_RT_NO_DEVICE = 207004; // no device
static const int32_t ACL_ERROR_RT_RESOURCE_ALLOC_FAIL = 207005; // resource alloc fail
static const int32_t ACL_ERROR_RT_NO_PERMISSION = 207006; // no permission
static const int32_t ACL_ERROR_RT_NO_EVENT_RESOURCE = 207007; // no event resource
static const int32_t ACL_ERROR_RT_NO_STREAM_RESOURCE = 207008; // no stream resource
static const int32_t ACL_ERROR_RT_NO_NOTIFY_RESOURCE = 207009; // no notify resource
static const int32_t ACL_ERROR_RT_NO_MODEL_RESOURCE = 207010; // no model resource
static const int32_t ACL_ERROR_RT_NO_CDQ_RESOURCE = 207011; // no cdq resource
static const int32_t ACL_ERROR_RT_OVER_LIMIT = 207012; // over limit
static const int32_t ACL_ERROR_RT_QUEUE_EMPTY = 207013; // queue is empty
static const int32_t ACL_ERROR_RT_QUEUE_FULL = 207014; // queue is full
static const int32_t ACL_ERROR_RT_REPEATED_INIT = 207015; // repeated init
static const int32_t ACL_ERROR_RT_AIVEC_OVER_FLOW = 207016; // aivec over flow
static const int32_t ACL_ERROR_RT_INTERNAL_ERROR = 507000; // runtime internal error
static const int32_t ACL_ERROR_RT_TS_ERROR = 507001; // ts internal error
static const int32_t ACL_ERROR_RT_STREAM_TASK_FULL = 507002; // task full in stream
static const int32_t ACL_ERROR_RT_STREAM_TASK_EMPTY = 507003; // task empty in stream
static const int32_t ACL_ERROR_RT_STREAM_NOT_COMPLETE = 507004; // stream not complete
static const int32_t ACL_ERROR_RT_END_OF_SEQUENCE = 507005; // end of sequence
static const int32_t ACL_ERROR_RT_EVENT_NOT_COMPLETE = 507006; // event not complete
static const int32_t ACL_ERROR_RT_CONTEXT_RELEASE_ERROR = 507007; // context release error
static const int32_t ACL_ERROR_RT_SOC_VERSION = 507008; // soc version error
static const int32_t ACL_ERROR_RT_TASK_TYPE_NOT_SUPPORT = 507009; // task type not support
static const int32_t ACL_ERROR_RT_LOST_HEARTBEAT = 507010; // ts lost heartbeat
static const int32_t ACL_ERROR_RT_MODEL_EXECUTE = 507011; // model execute failed
static const int32_t ACL_ERROR_RT_REPORT_TIMEOUT = 507012; // report timeout
static const int32_t ACL_ERROR_RT_SYS_DMA = 507013; // sys dma error
static const int32_t ACL_ERROR_RT_AICORE_TIMEOUT = 507014; // aicore timeout
static const int32_t ACL_ERROR_RT_AICORE_EXCEPTION = 507015; // aicore exception
static const int32_t ACL_ERROR_RT_AICORE_TRAP_EXCEPTION = 507016; // aicore trap exception
static const int32_t ACL_ERROR_RT_AICPU_TIMEOUT = 507017; // aicpu timeout
static const int32_t ACL_ERROR_RT_AICPU_EXCEPTION = 507018; // aicpu exception
static const int32_t ACL_ERROR_RT_AICPU_DATADUMP_RSP_ERR = 507019; // aicpu datadump response error
static const int32_t ACL_ERROR_RT_AICPU_MODEL_RSP_ERR = 507020; // aicpu model operate response error
static const int32_t ACL_ERROR_RT_PROFILING_ERROR = 507021; // profiling error
static const int32_t ACL_ERROR_RT_IPC_ERROR = 507022; // ipc error
static const int32_t ACL_ERROR_RT_MODEL_ABORT_NORMAL = 507023; // model abort normal
static const int32_t ACL_ERROR_RT_KERNEL_UNREGISTERING = 507024; // kernel unregistering
static const int32_t ACL_ERROR_RT_RINGBUFFER_NOT_INIT = 507025; // ringbuffer not init
static const int32_t ACL_ERROR_RT_RINGBUFFER_NO_DATA = 507026; // ringbuffer no data
static const int32_t ACL_ERROR_RT_KERNEL_LOOKUP = 507027; // kernel lookup error
static const int32_t ACL_ERROR_RT_KERNEL_DUPLICATE = 507028; // kernel register duplicate
static const int32_t ACL_ERROR_RT_DEBUG_REGISTER_FAIL = 507029; // debug register failed
static const int32_t ACL_ERROR_RT_DEBUG_UNREGISTER_FAIL = 507030; // debug unregister failed
static const int32_t ACL_ERROR_RT_LABEL_CONTEXT = 507031; // label not in current context
static const int32_t ACL_ERROR_RT_PROGRAM_USE_OUT = 507032; // program register num use out
static const int32_t ACL_ERROR_RT_DEV_SETUP_ERROR = 507033; // device setup error
static const int32_t ACL_ERROR_RT_VECTOR_CORE_TIMEOUT = 507034; // vector core timeout
static const int32_t ACL_ERROR_RT_VECTOR_CORE_EXCEPTION = 507035; // vector core exception
static const int32_t ACL_ERROR_RT_VECTOR_CORE_TRAP_EXCEPTION = 507036; // vector core trap exception
static const int32_t ACL_ERROR_RT_CDQ_BATCH_ABNORMAL = 507037; // cdq alloc batch abnormal
static const int32_t ACL_ERROR_RT_DIE_MODE_CHANGE_ERROR = 507038; // can not change die mode
static const int32_t ACL_ERROR_RT_DIE_SET_ERROR = 507039; // single die mode can not set die
static const int32_t ACL_ERROR_RT_INVALID_DIEID = 507040; // invalid die id
static const int32_t ACL_ERROR_RT_DIE_MODE_NOT_SET = 507041; // die mode not set
static const int32_t ACL_ERROR_RT_DRV_INTERNAL_ERROR = 507899; // drv internal error
static const int32_t ACL_ERROR_RT_AICPU_INTERNAL_ERROR = 507900; // aicpu internal error
static const int32_t ACL_ERROR_RT_SOCKET_CLOSE = 507901; // hdc disconnect
)");
if (acl_error_map.count(error))
return acl_error_map[error];
return "unknown " + std::to_string((int)error);
}

python/jittor/extern/acl/acl_jittor.cc

@ -0,0 +1,186 @@
// ***************************************************************
// Copyright (c) 2021 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#include "acl_jittor.h"
#include "utils/str_utils.h"
#include <chrono>
#include <thread>
namespace jittor {
uint64_t acl_jittor_tid;
int acl_jittor_thread_running=0;
aclrtContext acl_jittor_context;
#define CHECK_ACL(x) ASSERTop(x,==,0)
static void* acl_jittor_process_callback(void*) {
acl_jittor_thread_running = 1;
int deviceId = 0;
CHECK_ACL(aclrtSetCurrentContext(acl_jittor_context));
while (acl_jittor_thread_running) {
// LOGir << "acl_jittor_process_callback";
auto ret = aclrtProcessReport(1000);
if (ret) {
if (acl_jittor_thread_running && ret != ACL_ERROR_RT_REPORT_TIMEOUT)
LOGir << "aclrtProcessReport:" << ret << acl_error_to_string(ret);
break;
}
}
acl_jittor_thread_running = 0;
return (void*)0;
}
// void aaa(void*) {
// LOGir << "haha";
// }
struct acl_jittor_initer {
acl_jittor_initer() {
CHECK_ACL(aclInit(nullptr));
uint device_count = 0;
// get the number of available devices
CHECK_ACL(aclrtGetDeviceCount(&device_count));
LOGi << "Found ACL device number:" << device_count;
CHECK_ACL(aclrtSetDevice(0));
CHECK_ACL(aclrtCreateContext(&acl_jittor_context, 0));
CHECK_ACL(aclrtSetCurrentContext(acl_jittor_context));
pthread_create(&acl_jittor_tid, nullptr, acl_jittor_process_callback, 0);
// subscribe for default stream
CHECK_ACL(aclrtSubscribeReport(acl_jittor_tid,0));
// simple callback test
// aclrtStream stream;
// CHECK_ACL(aclrtCreateStream(&stream));
// CHECK_ACL(aclrtSubscribeReport(acl_jittor_tid,stream));
// CHECK_ACL(aclrtLaunchCallback((aclrtCallback)&aaa, 0, ACL_CALLBACK_NO_BLOCK, stream));
// CHECK_ACL(aclrtLaunchCallback((aclrtCallback)&aaa, 0, ACL_CALLBACK_NO_BLOCK, 0));
}
~acl_jittor_initer() {
acl_jittor_thread_running = 0;
CHECK_ACL(aclrtUnSubscribeReport(acl_jittor_tid,0));
CHECK_ACL(aclrtDestroyContext(acl_jittor_context));
CHECK_ACL(aclFinalize());
}
} _acl_jittor_initer;
string process_acl(const string& src, const string& name, const map<string,string>& kargs) {
auto tokens = token_split(src);
int edit = 0;
for (int i=0; i<tokens.size(); i++) {
auto& token = tokens[i];
if (token == "cuda_runtime") token = "acl_jittor", edit ++; else
if (token == "CUDA") token = "ACL", edit ++; else
if (startswith(token, "cuda")) {
if (token.size()>=5 && token[4] >= 'A' && token[4] <= 'Z') {
if (token == "cudaGetDeviceCount") {
token_replace(tokens, i, "($1);", "((uint*)$1);");
} else if (token == "cudaLaunchHostFunc") {
// ACL_CALLBACK_BLOCK for 310
token_replace(tokens, i, "LaunchHostFunc($1,$2,$3)",
"LaunchCallback($2,$3,ACL_CALLBACK_NO_BLOCK,$1)");
} else if (token == "cudaMemcpy")
token_replace(tokens, i, "cudaMemcpy($1,$2,$3,",
"aclrtMemcpy($1,$3,$2,$3,");
else if (token == "cudaMemcpyAsync")
token_replace(tokens, i, "cudaMemcpyAsync($1,$2,$3,",
"aclrtMemcpyAsync($1,$3,$2,$3,");
else if (token == "cudaMemcpyDeviceToHost") token = "ACL_MEMCPY_DEVICE_TO_HOST";
else if (token == "cudaMemcpyHostToDevice") token = "ACL_MEMCPY_HOST_TO_DEVICE";
else if (token == "cudaMemcpyDeviceToDevice") token = "ACL_MEMCPY_DEVICE_TO_DEVICE";
else if (token == "cudaMallocManaged" || token == "cudaMalloc") {
// unified address not supported
token = "aclrtMalloc";
token_replace(tokens, i, "($1,$2)",
"($1,$2,ACL_MEM_MALLOC_HUGE_FIRST)");
} else if (token == "cudaMemGetInfo")
token_replace(tokens, i, "cudaMemGetInfo($1,$2)",
"aclrtGetMemInfo(ACL_DDR_MEM,$1,$2)");
else if (token == "cudaGetLastError")
token_replace(tokens, i, "cudaGetLastError()", "0");
else if (token == "cudaStreamCreateWithFlags")
token_replace(tokens, i-1,
"(cudaStreamCreateWithFlags($1,$2));",
"(aclrtCreateStream($1)); checkAclErrors(aclrtSubscribeReport(acl_jittor_tid,*$1));");
else if (token == "cudaEventCreate")
token_replace(tokens, i,
"cudaEventCreate($1,$2)",
"aclrtCreateEvent($1)");
else if (token == "cudaDeviceSynchronize")
token = "aclrtSynchronizeDevice";
else if (token == "cudaStreamDestroy")
token_replace(tokens, i, "cudaStreamDestroy($1)",
"(aclrtUnSubscribeReport(acl_jittor_tid,$1), aclrtDestroyStream($1))");
else if (token == "cudaEventDestroy")
token = "aclrtDestroyEvent";
else if (token == "cudaEventRecord")
token = "aclrtRecordEvent";
else if (token == "cudaStreamWaitEvent")
token_replace(tokens, i,
"cudaStreamWaitEvent($1,$2,$3)",
"aclrtStreamWaitEvent($1,$2)");
if (token.size() && token[0] == 'c')
token = "aclrt" + token.substr(4);
if (endswith(token, "_t"))
token = token.substr(0, token.size()-2);
edit ++;
}
} else
if (token == "_cudaGetErrorEnum") {
token_replace(tokens, i, "_cudaGetErrorEnum($1)", "(acl_error_to_string($1))");
edit ++;
} else
if (token == "checkCudaErrors")
token = "checkAclErrors";
else if (token == "JPU") {
edit ++;
string new_code;
if (tokens[i+2] == "op_compiler")
token_replace(tokens, i,
"JPU(op_compiler($1,$2,$3))",
"acl_jittor_op_compiler($1,$2,$3)");
else if (tokens[i+2] == "header")
new_code = "#include \"acl_jittor.h\"";
if (new_code.size())
token_replace(tokens, i, "JPU($1)", new_code);
} else if (token == "use_cuda_managed_allocator" && tokens[i+1][0]==',') {
tokens[i+2] = "0"; // disable unified address
}
}
if (!edit) return src;
return join(tokens, "");
}
void acl_jittor_op_compiler(string& filename, string& src, bool is_acl) {
if (!is_acl) return;
filename = replace(filename, ".cc", ".tikcc");
// LOGir << filename;
string new_src = process_acl(src, "", {});
new_src = replace(new_src, R"(#include "misc/cuda_atomic.h")", "");
new_src = replace(new_src, R"(#include "misc/cuda_limits.h")", "");
new_src = replace(new_src, "__global__", "__ai_device_entry__");
new_src = token_replace(new_src, "__launch_bounds__($1)", "");
new_src = token_replace(new_src, "int thread_num = $1;", "int thread_num = 1;");
new_src = token_replace(new_src, "tn0=std::max(tn0, $1);", "");
new_src = token_replace(new_src, "<<<$1,$2>>>", "<<<1,0>>>");
new_src = token_replace(new_src, "int thread_id = $1;", "int thread_id = 1;");
// for inc error
new_src = token_replace(new_src, "for ($1+=$2)", "for ($1++)");
// bit op error
new_src = token_replace(new_src, "int tnum$1;", "");
new_src = token_replace(new_src, "int tid$1=$2;", "int tid$1=0;");
src = new_src;
// auto tokens = token_split(new_src);
}
}

python/jittor/extern/acl/acl_jittor.h

@ -0,0 +1,19 @@
// ***************************************************************
// Copyright (c) 2021 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#pragma once
#include "common.h"
#include <acl/acl.h>
std::string acl_error_to_string(aclError error);
namespace jittor {
EXTERN_LIB uint64_t acl_jittor_tid;
void acl_jittor_op_compiler(string& filename, string& src, bool is_acl);
}


@ -23,8 +23,8 @@ EXTERN_LIB cublasHandle_t cublas_handle;
static inline cudaDataType get_dtype(NanoString dtype) {
if (dtype == ns_float32) return CUDA_R_32F;
// if (dtype == ns_float64) return CUDA_R_64F;
// if (dtype == ns_float16) return CUDA_R_16F;
if (dtype == ns_float64) return CUDA_R_64F;
if (dtype == ns_float16) return CUDA_R_16F;
LOGf << "not support type" << dtype;
return CUDA_R_32F;
}


@ -89,7 +89,7 @@ void CublasBatchedMatmulOp::jit_prepare(JK& jk) {
jk << _CS("[T:") << a->dtype();
jk << _CS("][Trans_a:") << (trans_a ? 'T' : 'N');
jk << _CS("][Trans_b:") << (trans_b ? 'T' : 'N');
jk << _CS("][op:") << (a->dtype().dsize() == 4 ? 'S' : 'D');
jk << _CS("][op:") << (a->dtype().dsize() == 2? 'H' : (a->dtype().dsize() == 4 ? 'S' : 'D'));
jk << ']';
}
@ -124,6 +124,22 @@ void CublasBatchedMatmulOp::jit_run() {
if (use_tensorcore) {
computeType = CUBLAS_COMPUTE_32F_FAST_16F;
}
if (a->dtype() == ns_float16
|| b->dtype() == ns_float16 || c->dtype() == ns_float16) {
computeType = CUBLAS_COMPUTE_16F;
}
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT;
cudaDataType_t computeType = CUDA_R_32F;
if (use_tensorcore) {
algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP;
}
if (a->dtype() == ns_float16
|| b->dtype() == ns_float16 || c->dtype() == ns_float16) {
computeType = CUDA_R_16F;
algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP;
}
#endif
checkCudaErrors(cublasGemmStridedBatchedEx(handle_,
CUBLAS_OP_@Trans_b, CUBLAS_OP_@Trans_a,
k, n, m, &alpha,
@ -131,15 +147,13 @@ void CublasBatchedMatmulOp::jit_run() {
a->ptr<T>(),get_dtype(a->dtype()), '@Trans_a' == 'N' ? m : n, n * m, &beta,
c->ptr<T>(),get_dtype(c->dtype()), k, k * n,
batch_size,computeType,algo));
#else
checkCudaErrors(cublas@op@@gemmStridedBatched(handle_,
CUBLAS_OP_@Trans_b, CUBLAS_OP_@Trans_a,
k, n, m, &alpha,
b->ptr<T>(), '@Trans_b' == 'N' ? k : m, k * m,
a->ptr<T>(), '@Trans_a' == 'N' ? m : n, n * m, &beta,
c->ptr<T>(), k, k * n,
batch_size));
#endif
// checkCudaErrors(cublas@op@@gemmStridedBatched(handle_,
// CUBLAS_OP_@Trans_b, CUBLAS_OP_@Trans_a,
// k, n, m, &alpha,
// b->ptr<T>(), '@Trans_b' == 'N' ? k : m, k * m,
// a->ptr<T>(), '@Trans_a' == 'N' ? m : n, n * m, &beta,
// c->ptr<T>(), k, k * n,
// batch_size));
}
#endif
#endif // JIT
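A sketch exercising the fp16 compute-type path added above from Python; this assumes a CUDA build where bmm dispatches to the batched cuBLAS op:
import jittor as jt
if jt.has_cuda:
    jt.flags.use_cuda = 1
    a = jt.random([2, 16, 16]).float16()
    b = jt.random([2, 16, 16]).float16()
    c = jt.bmm(a, b)             # batched matmul, computeType fp16 on this path
    print(c.dtype, c.shape)      # expected: float16 [2,16,16,]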


@ -50,7 +50,7 @@ void CublasMatmulOp::jit_prepare(JK& jk) {
jk << _CS("[T:") << a->dtype();
jk << _CS("][Trans_a:") << (trans_a ? 'T' : 'N');
jk << _CS("][Trans_b:") << (trans_b ? 'T' : 'N');
jk << _CS("][op:") << (a->dtype().dsize() == 4 ? 'S' : 'D');
jk << _CS("][op:") << (a->dtype().dsize() == 2? 'H' : (a->dtype().dsize() == 4 ? 'S' : 'D'));
jk << ']';
}
@ -81,6 +81,22 @@ void CublasMatmulOp::jit_run() {
if (use_tensorcore) {
computeType = CUBLAS_COMPUTE_32F_FAST_16F;
}
if (a->dtype() == ns_float16
|| b->dtype() == ns_float16 || c->dtype() == ns_float16) {
computeType = CUBLAS_COMPUTE_16F;
}
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT;
cudaDataType_t computeType = CUDA_R_32F;
if (use_tensorcore) {
algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP;
}
if (a->dtype() == ns_float16
|| b->dtype() == ns_float16 || c->dtype() == ns_float16) {
computeType = CUDA_R_16F;
algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP;
}
#endif
checkCudaErrors(cublasGemmEx(handle_,
CUBLAS_OP_@Trans_b, CUBLAS_OP_@Trans_a,
k, n, m, &alpha,
@ -88,15 +104,13 @@ void CublasMatmulOp::jit_run() {
a->ptr<T>(),get_dtype(a->dtype()), '@Trans_a' == 'N' ? m : n, &beta,
c->ptr<T>(),get_dtype(c->dtype()), k,
computeType, algo));
#else
checkCudaErrors(cublas@op@@gemm(handle_,
CUBLAS_OP_@Trans_b, CUBLAS_OP_@Trans_a,
k, n, m, &alpha,
b->ptr<T>(), '@Trans_b' == 'N' ? k : m,
a->ptr<T>(), '@Trans_a' == 'N' ? m : n, &beta,
c->ptr<T>(), k));
// checkCudaErrors(cublas@op@@gemm(handle_,
// CUBLAS_OP_@Trans_b, CUBLAS_OP_@Trans_a,
// k, n, m, &alpha,
// b->ptr<T>(), '@Trans_b' == 'N' ? k : m,
// a->ptr<T>(), '@Trans_a' == 'N' ? m : n, &beta,
// c->ptr<T>(), k));
#endif
}
#endif // JIT


@ -175,6 +175,11 @@ void CudnnConvOp::jit_run() {
checkCudaErrors( cudnnSetConvolutionMathType(cudnnConvDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION) );
}
if (x->dtype() == ns_float16
|| y->dtype() == ns_float16 || w->dtype() == ns_float16) {
checkCudaErrors( cudnnSetConvolutionMathType(cudnnConvDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION) );
}
int dimY[] = {
(int)y->shape[findc("@YFORMAT", 'a')], // n
(int)y->shape[findc("@YFORMAT", 'b')], // c
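A sketch of an fp16 convolution that would take the math-type branch added above; this assumes a CUDA build with cuDNN available:
import jittor as jt
from jittor import nn
if jt.has_cuda:
    jt.flags.use_cuda = 1
    conv = nn.Conv2d(3, 8, 3, padding=1).float16()   # cast parameters to fp16
    x = jt.random([1, 3, 32, 32]).float16()
    y = conv(x)
    print(y.dtype)                                   # expected: float16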


@ -90,7 +90,7 @@ void CuttTransposeOp::jit_run() {
for (int i=0; i<dim; i++)
x_shape[i] = new_shape[dim-1-i];
if (dim == 1 || x->num==1) {
checkCudaErrors(cudaMemcpyAsync(yp, xp, x->size, cudaMemcpyDefault, 0));
checkCudaErrors(cudaMemcpyAsync(yp, xp, x->size, cudaMemcpyDeviceToDevice, 0));
return;
}
JK& jk = get_jk();


@ -25,7 +25,9 @@
#include <stdlib.h>
#include <string.h>
#ifdef IS_CUDA
#include <helper_string.h>
#endif
#ifndef EXIT_WAIVED
#define EXIT_WAIVED 2
@ -129,6 +131,9 @@ void check(T result, char const *const func, const char *const file,
}
}
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
#define peekCudaErrors(val) peek((val), #val, __FILE__, __LINE__)
#ifdef __DRIVER_TYPES_H__
// This will output the proper CUDA error strings in the event
// that a CUDA host call returns an error

View File

@ -47,9 +47,9 @@
#include <unordered_map>
#include <vector>
#include <mkldnn.hpp>
#include <dnnl.hpp>
using namespace mkldnn;
using namespace dnnl;
using namespace std;
@ -159,8 +159,8 @@ void simple_net(int times = 100) {
if (conv1_prim_desc.src_desc() != user_src_memory.get_desc()) {
conv1_src_memory = memory(conv1_prim_desc.src_desc(), eng);
net.push_back(reorder(user_src_memory, conv1_src_memory));
net_args.push_back({ { MKLDNN_ARG_FROM, user_src_memory },
{ MKLDNN_ARG_TO, conv1_src_memory } });
net_args.push_back({ { DNNL_ARG_FROM, user_src_memory },
{ DNNL_ARG_TO, conv1_src_memory } });
}
auto conv1_weights_memory = user_weights_memory;
@ -181,10 +181,10 @@ void simple_net(int times = 100) {
/// @snippet cpu_cnn_inference_f32.cpp Create memory for output
//[Create convolution primitive]
net.push_back(convolution_forward(conv1_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv1_src_memory },
{ MKLDNN_ARG_WEIGHTS, conv1_weights_memory },
{ MKLDNN_ARG_BIAS, conv1_user_bias_memory },
{ MKLDNN_ARG_DST, conv1_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv1_src_memory },
{ DNNL_ARG_WEIGHTS, conv1_weights_memory },
{ DNNL_ARG_BIAS, conv1_user_bias_memory },
{ DNNL_ARG_DST, conv1_dst_memory } });
//[Create convolution primitive]
// AlexNet: relu1
@ -204,8 +204,8 @@ void simple_net(int times = 100) {
auto relu1_prim_desc = eltwise_forward::primitive_desc(relu1_desc, eng);
net.push_back(eltwise_forward(relu1_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv1_dst_memory },
{ MKLDNN_ARG_DST, conv1_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv1_dst_memory },
{ DNNL_ARG_DST, conv1_dst_memory } });
//[Create relu primitive]
// AlexNet: lrn1
@ -226,8 +226,8 @@ void simple_net(int times = 100) {
auto lrn1_dst_memory = memory(lrn1_prim_desc.dst_desc(), eng);
net.push_back(lrn_forward(lrn1_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv1_dst_memory },
{ MKLDNN_ARG_DST, lrn1_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv1_dst_memory },
{ DNNL_ARG_DST, lrn1_dst_memory } });
// AlexNet: pool1
// {batch, 96, 55, 55} -> {batch, 96, 27, 27}
@ -255,8 +255,8 @@ void simple_net(int times = 100) {
auto pool1_dst_memory = memory(pool1_pd.dst_desc(), eng);
net.push_back(pooling_forward(pool1_pd));
net_args.push_back({ { MKLDNN_ARG_SRC, lrn1_dst_memory },
{ MKLDNN_ARG_DST, pool1_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, lrn1_dst_memory },
{ DNNL_ARG_DST, pool1_dst_memory } });
//[Create pooling primitive]
// AlexNet: conv2
@ -296,8 +296,8 @@ void simple_net(int times = 100) {
if (conv2_prim_desc.src_desc() != conv2_src_memory.get_desc()) {
conv2_src_memory = memory(conv2_prim_desc.src_desc(), eng);
net.push_back(reorder(pool1_dst_memory, conv2_src_memory));
net_args.push_back({ { MKLDNN_ARG_FROM, pool1_dst_memory },
{ MKLDNN_ARG_TO, conv2_src_memory } });
net_args.push_back({ { DNNL_ARG_FROM, pool1_dst_memory },
{ DNNL_ARG_TO, conv2_src_memory } });
}
auto conv2_weights_memory = conv2_user_weights_memory;
@ -312,10 +312,10 @@ void simple_net(int times = 100) {
// create convolution primitive and add it to net
net.push_back(convolution_forward(conv2_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv2_src_memory },
{ MKLDNN_ARG_WEIGHTS, conv2_weights_memory },
{ MKLDNN_ARG_BIAS, conv2_user_bias_memory },
{ MKLDNN_ARG_DST, conv2_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv2_src_memory },
{ DNNL_ARG_WEIGHTS, conv2_weights_memory },
{ DNNL_ARG_BIAS, conv2_user_bias_memory },
{ DNNL_ARG_DST, conv2_dst_memory } });
// AlexNet: relu2
// {batch, 256, 27, 27} -> {batch, 256, 27, 27}
@ -328,8 +328,8 @@ void simple_net(int times = 100) {
auto relu2_prim_desc = eltwise_forward::primitive_desc(relu2_desc, eng);
net.push_back(eltwise_forward(relu2_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv2_dst_memory },
{ MKLDNN_ARG_DST, conv2_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv2_dst_memory },
{ DNNL_ARG_DST, conv2_dst_memory } });
// AlexNet: lrn2
// {batch, 256, 27, 27} -> {batch, 256, 27, 27}
@ -349,8 +349,8 @@ void simple_net(int times = 100) {
auto lrn2_dst_memory = memory(lrn2_prim_desc.dst_desc(), eng);
net.push_back(lrn_forward(lrn2_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv2_dst_memory },
{ MKLDNN_ARG_DST, lrn2_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv2_dst_memory },
{ DNNL_ARG_DST, lrn2_dst_memory } });
// AlexNet: pool2
// {batch, 256, 27, 27} -> {batch, 256, 13, 13}
@ -372,8 +372,8 @@ void simple_net(int times = 100) {
// create pooling primitive and add it to net
net.push_back(pooling_forward(pool2_pd));
net_args.push_back({ { MKLDNN_ARG_SRC, lrn2_dst_memory },
{ MKLDNN_ARG_DST, pool2_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, lrn2_dst_memory },
{ DNNL_ARG_DST, pool2_dst_memory } });
// AlexNet: conv3
// {batch, 256, 13, 13} (x) {384, 256, 3, 3}; -> {batch, 384, 13, 13};
@ -412,8 +412,8 @@ void simple_net(int times = 100) {
if (conv3_prim_desc.src_desc() != conv3_src_memory.get_desc()) {
conv3_src_memory = memory(conv3_prim_desc.src_desc(), eng);
net.push_back(reorder(pool2_dst_memory, conv3_src_memory));
net_args.push_back({ { MKLDNN_ARG_FROM, pool2_dst_memory },
{ MKLDNN_ARG_TO, conv3_src_memory } });
net_args.push_back({ { DNNL_ARG_FROM, pool2_dst_memory },
{ DNNL_ARG_TO, conv3_src_memory } });
}
auto conv3_weights_memory = conv3_user_weights_memory;
@ -428,10 +428,10 @@ void simple_net(int times = 100) {
// create convolution primitive and add it to net
net.push_back(convolution_forward(conv3_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv3_src_memory },
{ MKLDNN_ARG_WEIGHTS, conv3_weights_memory },
{ MKLDNN_ARG_BIAS, conv3_user_bias_memory },
{ MKLDNN_ARG_DST, conv3_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv3_src_memory },
{ DNNL_ARG_WEIGHTS, conv3_weights_memory },
{ DNNL_ARG_BIAS, conv3_user_bias_memory },
{ DNNL_ARG_DST, conv3_dst_memory } });
// AlexNet: relu3
// {batch, 384, 13, 13} -> {batch, 384, 13, 13}
@ -444,8 +444,8 @@ void simple_net(int times = 100) {
auto relu3_prim_desc = eltwise_forward::primitive_desc(relu3_desc, eng);
net.push_back(eltwise_forward(relu3_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv3_dst_memory },
{ MKLDNN_ARG_DST, conv3_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv3_dst_memory },
{ DNNL_ARG_DST, conv3_dst_memory } });
// AlexNet: conv4
// {batch, 384, 13, 13} (x) {2, 192, 192, 3, 3}; ->
@ -485,8 +485,8 @@ void simple_net(int times = 100) {
if (conv4_prim_desc.src_desc() != conv4_src_memory.get_desc()) {
conv4_src_memory = memory(conv4_prim_desc.src_desc(), eng);
net.push_back(reorder(conv3_dst_memory, conv4_src_memory));
net_args.push_back({ { MKLDNN_ARG_FROM, conv3_dst_memory },
{ MKLDNN_ARG_TO, conv4_src_memory } });
net_args.push_back({ { DNNL_ARG_FROM, conv3_dst_memory },
{ DNNL_ARG_TO, conv4_src_memory } });
}
auto conv4_weights_memory = conv4_user_weights_memory;
@ -501,10 +501,10 @@ void simple_net(int times = 100) {
// create convolution primitive and add it to net
net.push_back(convolution_forward(conv4_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv4_src_memory },
{ MKLDNN_ARG_WEIGHTS, conv4_weights_memory },
{ MKLDNN_ARG_BIAS, conv4_user_bias_memory },
{ MKLDNN_ARG_DST, conv4_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv4_src_memory },
{ DNNL_ARG_WEIGHTS, conv4_weights_memory },
{ DNNL_ARG_BIAS, conv4_user_bias_memory },
{ DNNL_ARG_DST, conv4_dst_memory } });
// AlexNet: relu4
// {batch, 384, 13, 13} -> {batch, 384, 13, 13}
@ -517,8 +517,8 @@ void simple_net(int times = 100) {
auto relu4_prim_desc = eltwise_forward::primitive_desc(relu4_desc, eng);
net.push_back(eltwise_forward(relu4_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv4_dst_memory },
{ MKLDNN_ARG_DST, conv4_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv4_dst_memory },
{ DNNL_ARG_DST, conv4_dst_memory } });
// AlexNet: conv5
// {batch, 384, 13, 13} (x) {2, 128, 192, 3, 3}; -> {batch, 256, 13, 13};
@ -557,8 +557,8 @@ void simple_net(int times = 100) {
if (conv5_prim_desc.src_desc() != conv5_src_memory.get_desc()) {
conv5_src_memory = memory(conv5_prim_desc.src_desc(), eng);
net.push_back(reorder(conv4_dst_memory, conv5_src_memory));
net_args.push_back({ { MKLDNN_ARG_FROM, conv4_dst_memory },
{ MKLDNN_ARG_TO, conv5_src_memory } });
net_args.push_back({ { DNNL_ARG_FROM, conv4_dst_memory },
{ DNNL_ARG_TO, conv5_src_memory } });
}
auto conv5_weights_memory = conv5_user_weights_memory;
@ -573,10 +573,10 @@ void simple_net(int times = 100) {
// create convolution primitive and add it to net
net.push_back(convolution_forward(conv5_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv5_src_memory },
{ MKLDNN_ARG_WEIGHTS, conv5_weights_memory },
{ MKLDNN_ARG_BIAS, conv5_user_bias_memory },
{ MKLDNN_ARG_DST, conv5_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv5_src_memory },
{ DNNL_ARG_WEIGHTS, conv5_weights_memory },
{ DNNL_ARG_BIAS, conv5_user_bias_memory },
{ DNNL_ARG_DST, conv5_dst_memory } });
// AlexNet: relu5
// {batch, 256, 13, 13} -> {batch, 256, 13, 13}
@ -589,8 +589,8 @@ void simple_net(int times = 100) {
auto relu5_prim_desc = eltwise_forward::primitive_desc(relu5_desc, eng);
net.push_back(eltwise_forward(relu5_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv5_dst_memory },
{ MKLDNN_ARG_DST, conv5_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv5_dst_memory },
{ DNNL_ARG_DST, conv5_dst_memory } });
// AlexNet: pool5
// {batch, 256, 13, 13} -> {batch, 256, 6, 6}
@ -615,8 +615,8 @@ void simple_net(int times = 100) {
// create pooling primitive and add it to net
net.push_back(pooling_forward(pool5_pd));
net_args.push_back({ { MKLDNN_ARG_SRC, conv5_dst_memory },
{ MKLDNN_ARG_DST, pool5_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv5_dst_memory },
{ DNNL_ARG_DST, pool5_dst_memory } });
// fc6 inner product {batch, 256, 6, 6} (x) {4096, 256, 6, 6}-> {batch,
@ -651,8 +651,8 @@ void simple_net(int times = 100) {
if (fc6_prim_desc.src_desc() != fc6_src_memory.get_desc()) {
fc6_src_memory = memory(fc6_prim_desc.src_desc(), eng);
net.push_back(reorder(pool5_dst_memory, fc6_src_memory));
net_args.push_back({ { MKLDNN_ARG_FROM, pool5_dst_memory },
{ MKLDNN_ARG_TO, fc6_src_memory } });
net_args.push_back({ { DNNL_ARG_FROM, pool5_dst_memory },
{ DNNL_ARG_TO, fc6_src_memory } });
}
auto fc6_weights_memory = fc6_user_weights_memory;
@ -666,10 +666,10 @@ void simple_net(int times = 100) {
// create inner product primitive and add it to net
net.push_back(inner_product_forward(fc6_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, fc6_src_memory },
{ MKLDNN_ARG_WEIGHTS, fc6_weights_memory },
{ MKLDNN_ARG_BIAS, fc6_user_bias_memory },
{ MKLDNN_ARG_DST, fc6_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, fc6_src_memory },
{ DNNL_ARG_WEIGHTS, fc6_weights_memory },
{ DNNL_ARG_BIAS, fc6_user_bias_memory },
{ DNNL_ARG_DST, fc6_dst_memory } });
// fc7 inner product {batch, 4096} (x) {4096, 4096}-> {batch, 4096}
@ -708,10 +708,10 @@ void simple_net(int times = 100) {
// create inner product primitive and add it to net
net.push_back(inner_product_forward(fc7_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, fc6_dst_memory },
{ MKLDNN_ARG_WEIGHTS, fc7_weights_memory },
{ MKLDNN_ARG_BIAS, fc7_user_bias_memory },
{ MKLDNN_ARG_DST, fc7_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, fc6_dst_memory },
{ DNNL_ARG_WEIGHTS, fc7_weights_memory },
{ DNNL_ARG_BIAS, fc7_user_bias_memory },
{ DNNL_ARG_DST, fc7_dst_memory } });
// fc8 inner product {batch, 4096} (x) {1000, 4096}-> {batch, 1000}
memory::dims fc8_weights_tz = { 1000, 4096 };
@ -750,17 +750,17 @@ void simple_net(int times = 100) {
// create inner product primitive and add it to net
net.push_back(inner_product_forward(fc8_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, fc7_dst_memory },
{ MKLDNN_ARG_WEIGHTS, fc8_weights_memory },
{ MKLDNN_ARG_BIAS, fc8_user_bias_memory },
{ MKLDNN_ARG_DST, fc8_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, fc7_dst_memory },
{ DNNL_ARG_WEIGHTS, fc8_weights_memory },
{ DNNL_ARG_BIAS, fc8_user_bias_memory },
{ DNNL_ARG_DST, fc8_dst_memory } });
// create reorder between internal and user data if it is needed and
// add it to net after pooling
if (fc8_dst_memory != user_dst_memory) {
net.push_back(reorder(fc8_dst_memory, user_dst_memory));
net_args.push_back({ { MKLDNN_ARG_FROM, fc8_dst_memory },
{ MKLDNN_ARG_TO, user_dst_memory } });
net_args.push_back({ { DNNL_ARG_FROM, fc8_dst_memory },
{ DNNL_ARG_TO, user_dst_memory } });
}
/// @page cpu_cnn_inference_f32_cpp

View File

@ -13,9 +13,9 @@
#include "var.h"
#include "mkl_conv_backward_w_op.h"
#include <mkldnn.hpp>
#include <dnnl.hpp>
using namespace mkldnn;
using namespace dnnl;
using namespace std;
namespace jittor {
@ -143,8 +143,8 @@ void MklConvBackwardWOp::jit_run() {
if (conv_pd.src_desc() != conv_user_src_memory.get_desc()) {
conv_src_memory = memory(conv_pd.src_desc(), eng);
net_bwd.push_back(reorder(conv_user_src_memory, conv_src_memory));
net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_user_src_memory},
{MKLDNN_ARG_TO, conv_src_memory}});
net_bwd_args.push_back({{DNNL_ARG_FROM, conv_user_src_memory},
{DNNL_ARG_TO, conv_src_memory}});
}
auto conv_user_diff_dst_memory
@ -169,8 +169,8 @@ void MklConvBackwardWOp::jit_run() {
if (conv_bwd_weights_pd.src_desc() != conv_src_memory.get_desc()) {
conv_bwd_src_memory = memory(conv_bwd_weights_pd.src_desc(), eng);
net_bwd.push_back(reorder(conv_src_memory, conv_bwd_src_memory));
net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_src_memory},
{MKLDNN_ARG_TO, conv_bwd_src_memory}});
net_bwd_args.push_back({{DNNL_ARG_FROM, conv_src_memory},
{DNNL_ARG_TO, conv_bwd_src_memory}});
}
auto conv_diff_dst_memory = conv_user_diff_dst_memory;
@ -178,13 +178,13 @@ void MklConvBackwardWOp::jit_run() {
!= conv_user_diff_dst_memory.get_desc()) {
conv_diff_dst_memory = memory(conv_bwd_weights_pd.diff_dst_desc(), eng);
net_bwd.push_back(reorder(conv_user_diff_dst_memory, conv_diff_dst_memory));
net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_user_diff_dst_memory},
{MKLDNN_ARG_TO, conv_diff_dst_memory}});
net_bwd_args.push_back({{DNNL_ARG_FROM, conv_user_diff_dst_memory},
{DNNL_ARG_TO, conv_diff_dst_memory}});
}
net_bwd.push_back(convolution_backward_weights(conv_bwd_weights_pd));
net_bwd_args.push_back({{MKLDNN_ARG_SRC, conv_bwd_src_memory},
{MKLDNN_ARG_DIFF_DST, conv_diff_dst_memory}});
net_bwd_args.push_back({{DNNL_ARG_SRC, conv_bwd_src_memory},
{DNNL_ARG_DIFF_DST, conv_diff_dst_memory}});
auto conv_diff_weights_memory = conv_user_diff_weights_memory;
if (conv_bwd_weights_pd.diff_weights_desc()
@ -192,15 +192,15 @@ void MklConvBackwardWOp::jit_run() {
conv_diff_weights_memory
= memory(conv_bwd_weights_pd.diff_weights_desc(), eng);
net_bwd_args.back().insert(
{MKLDNN_ARG_DIFF_WEIGHTS, conv_diff_weights_memory});
{DNNL_ARG_DIFF_WEIGHTS, conv_diff_weights_memory});
net_bwd.push_back(reorder(
conv_diff_weights_memory, conv_user_diff_weights_memory));
net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_diff_weights_memory},
{MKLDNN_ARG_TO, conv_user_diff_weights_memory}});
net_bwd_args.push_back({{DNNL_ARG_FROM, conv_diff_weights_memory},
{DNNL_ARG_TO, conv_user_diff_weights_memory}});
} else {
net_bwd_args.back().insert(
{MKLDNN_ARG_DIFF_WEIGHTS, conv_diff_weights_memory});
{DNNL_ARG_DIFF_WEIGHTS, conv_diff_weights_memory});
}
ASSERTop(net_bwd.size(),==,net_bwd_args.size());

View File

@ -13,9 +13,9 @@
#include "var.h"
#include "mkl_conv_backward_x_op.h"
#include <mkldnn.hpp>
#include <dnnl.hpp>
using namespace mkldnn;
using namespace dnnl;
using namespace std;
namespace jittor {
@ -142,8 +142,8 @@ void MklConvBackwardXOp::jit_run() {
conv_weights_memory = memory(conv_pd.weights_desc(), eng);
net_bwd.push_back(
reorder(conv_user_weights_memory, conv_weights_memory));
net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_user_weights_memory},
{MKLDNN_ARG_TO, conv_weights_memory}});
net_bwd_args.push_back({{DNNL_ARG_FROM, conv_user_weights_memory},
{DNNL_ARG_TO, conv_weights_memory}});
}
auto conv_user_diff_dst_memory
@ -168,21 +168,21 @@ void MklConvBackwardXOp::jit_run() {
!= conv_user_diff_dst_memory.get_desc()) {
conv_diff_dst_memory = memory(conv_bwd_data_pd.diff_dst_desc(), eng);
net_bwd.push_back(reorder(conv_user_diff_dst_memory, conv_diff_dst_memory));
net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_user_diff_dst_memory},
{MKLDNN_ARG_TO, conv_diff_dst_memory}});
net_bwd_args.push_back({{DNNL_ARG_FROM, conv_user_diff_dst_memory},
{DNNL_ARG_TO, conv_diff_dst_memory}});
}
auto conv_bwd_weights_memory = conv_weights_memory;
if (conv_bwd_data_pd.weights_desc() != conv_weights_memory.get_desc()) {
conv_bwd_weights_memory = memory(conv_bwd_data_pd.weights_desc(), eng);
net_bwd.push_back(reorder(conv_weights_memory, conv_bwd_weights_memory));
net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_weights_memory},
{MKLDNN_ARG_TO, conv_bwd_weights_memory}});
net_bwd_args.push_back({{DNNL_ARG_FROM, conv_weights_memory},
{DNNL_ARG_TO, conv_bwd_weights_memory}});
}
net_bwd.push_back(convolution_backward_data(conv_bwd_data_pd));
net_bwd_args.push_back({{MKLDNN_ARG_WEIGHTS, conv_bwd_weights_memory},
{MKLDNN_ARG_DIFF_DST, conv_diff_dst_memory}});
net_bwd_args.push_back({{DNNL_ARG_WEIGHTS, conv_bwd_weights_memory},
{DNNL_ARG_DIFF_DST, conv_diff_dst_memory}});
auto conv_diff_src_memory = conv_user_diff_src_memory;
if (conv_bwd_data_pd.diff_src_desc()
@ -190,15 +190,15 @@ void MklConvBackwardXOp::jit_run() {
conv_diff_src_memory
= memory(conv_bwd_data_pd.diff_src_desc(), eng);
net_bwd_args.back().insert(
{MKLDNN_ARG_DIFF_SRC, conv_diff_src_memory});
{DNNL_ARG_DIFF_SRC, conv_diff_src_memory});
net_bwd.push_back(reorder(
conv_diff_src_memory, conv_user_diff_src_memory));
net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_diff_src_memory},
{MKLDNN_ARG_TO, conv_user_diff_src_memory}});
net_bwd_args.push_back({{DNNL_ARG_FROM, conv_diff_src_memory},
{DNNL_ARG_TO, conv_user_diff_src_memory}});
} else {
net_bwd_args.back().insert(
{MKLDNN_ARG_DIFF_SRC, conv_diff_src_memory});
{DNNL_ARG_DIFF_SRC, conv_diff_src_memory});
}
ASSERTop(net_bwd.size(),==,net_bwd_args.size());

View File

@ -7,12 +7,12 @@
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#include <mkldnn.hpp>
#include <dnnl.hpp>
#include "var.h"
#include "mkl_conv_op.h"
using namespace mkldnn;
using namespace dnnl;
using namespace std;
namespace jittor {
@ -110,7 +110,7 @@ void MklConvOp::jit_run() {
auto n = ws[3];
auto k = xs[3];
// x: [m,k], w: [k,n], y: [m,n]
ASSERTop(0,==,mkldnn_sgemm('N', 'N', m, n, k,
ASSERTop(0,==,dnnl_sgemm('N', 'N', m, n, k,
1.f, x->ptr<float32>(), k,
w->ptr<float32>(), n,
0.f, y->ptr<float32>(), n));
@ -162,27 +162,27 @@ void MklConvOp::jit_run() {
if (conv1_prim_desc.src_desc() != user_src_memory.get_desc()) {
conv1_src_memory = memory(conv1_prim_desc.src_desc(), eng);
net.push_back(reorder(user_src_memory, conv1_src_memory));
net_args.push_back({ { MKLDNN_ARG_FROM, user_src_memory },
{ MKLDNN_ARG_TO, conv1_src_memory } });
net_args.push_back({ { DNNL_ARG_FROM, user_src_memory },
{ DNNL_ARG_TO, conv1_src_memory } });
}
auto conv1_weights_memory = user_weights_memory;
if (conv1_prim_desc.weights_desc() != user_weights_memory.get_desc()) {
conv1_weights_memory = memory(conv1_prim_desc.weights_desc(), eng);
net.push_back(reorder(user_weights_memory, conv1_weights_memory));
net_args.push_back({ { MKLDNN_ARG_FROM, user_weights_memory }, { MKLDNN_ARG_TO, conv1_weights_memory } });
net_args.push_back({ { DNNL_ARG_FROM, user_weights_memory }, { DNNL_ARG_TO, conv1_weights_memory } });
}
auto conv1_dst_memory = memory(conv1_prim_desc.dst_desc(), eng);
net.push_back(convolution_forward(conv1_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv1_src_memory },
{ MKLDNN_ARG_WEIGHTS, conv1_weights_memory },
{ MKLDNN_ARG_DST, conv1_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv1_src_memory },
{ DNNL_ARG_WEIGHTS, conv1_weights_memory },
{ DNNL_ARG_DST, conv1_dst_memory } });
if (conv1_dst_memory != user_dst_memory) {
net.push_back(reorder(conv1_dst_memory, user_dst_memory));
net_args.push_back({ { MKLDNN_ARG_FROM, conv1_dst_memory },{ MKLDNN_ARG_TO, user_dst_memory } });
net_args.push_back({ { DNNL_ARG_FROM, conv1_dst_memory },{ DNNL_ARG_TO, user_dst_memory } });
}
ASSERTop(net.size(),==,net_args.size());

View File

@ -7,12 +7,12 @@
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#include <mkldnn.hpp>
#include <dnnl.hpp>
#include "var.h"
#include "mkl_matmul_op.h"
using namespace mkldnn;
using namespace dnnl;
using namespace std;
namespace jittor {
@ -66,7 +66,7 @@ void MklMatmulOp::jit_run() {
k = bs[0];
}
// a: [n,m], b: [m,k], c: [n,k]
ASSERTop(0,==,mkldnn_sgemm('@Trans_a', '@Trans_b', n, k, m,
ASSERTop(0,==,dnnl_sgemm('@Trans_a', '@Trans_b', n, k, m,
1.f, a->ptr<T>(), '@Trans_a'=='N'? m : n,
b->ptr<T>(), '@Trans_b' == 'N' ? k : m,
0.f, c->ptr<T>(), k));

View File

@ -706,7 +706,7 @@ def _no_grad_trunc_normal_(var, mean, std, a, b):
"The distribution of values may be incorrect.",
stacklevel=2)
with jt.no_grad():
# Values are generated by using a truncated uniform distribution and
# then using the inverse CDF for the normal distribution.
# Get upper and lower cdf values
@ -716,16 +716,16 @@ def _no_grad_trunc_normal_(var, mean, std, a, b):
# Uniformly fill tensor with values from [l, u], then translate to
# [2l-1, 2u-1].
# var.uniform(2 * l - 1, 2 * u - 1)
jt.init.uniform_(var, low=2 * l - 1, high=2 * u - 1)
var.uniform_(low=2 * l - 1, high=2 * u - 1)
# Use inverse cdf transform for normal distribution to get truncated
# standard normal
var.erfinv()
var = var.erfinv()
# Transform to proper mean, std
var.multiply(std * math.sqrt(2.))
var.add(mean)
var = var.multiply(std * math.sqrt(2.))
var = var.add(mean)
# Clamp to ensure it's in the proper range
var.clamp(min_v=a, max_v=b)
var = var.clamp(min_v=a, max_v=b)
return var
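The point of the reassignments above, sketched: jt.Var math ops such as erfinv and clamp return new Vars rather than mutating in place, so dropping the result silently loses the update (uniform_ is assumed to be the one in-place, PyTorch-style initializer here):

    import jittor as jt
    v = jt.random([4])
    v.erfinv()                        # result discarded, v is unchanged
    v = v.erfinv()                    # the pattern the patch switches to
    v.uniform_(low=-1.0, high=1.0)    # in-place initializer, no reassignment needed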

View File

@ -488,18 +488,11 @@ def arctan2(y,x):
angle = jt.zeros(x.shape,dtype=x.dtype)
x = (x!=0.0).ternary(x, x+1e-30)
angle = (y/x).arctan()
mask = (y<0) & (x<0)
if angle[mask].numel()>0:
angle[mask] -= np.pi
mask = (y>=0) &(x<0)
if angle[mask].numel()>0:
angle[mask] +=np.pi
mask = (y<0) | ((y==0) & (x<0))
angle = angle + mask*np.pi
return angle
def nonzero(x):
r'''
Return the index of the elements of input tensor which are not equal to zero.

View File

@ -143,7 +143,7 @@ class ResNet(nn.Module):
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = self.avgpool(x).float_auto()
x = jt.reshape(x, (x.shape[0], -1))
x = self.fc(x)
return x
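A sketch of what float_auto does to the pooled feature map, assuming it follows the global amp preference (float16 when half precision is preferred, float32 otherwise):

    import jittor as jt
    x = jt.random([1, 512, 1, 1])
    print(x.float_auto().dtype)            # float32 with amp disabled (the default)
    with jt.flag_scope(amp_reg=2):         # prefer fp16
        print(x.float_auto().dtype)        # float16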

View File

@ -37,6 +37,7 @@ def matmul_transpose(a, b):
assert len(a.shape) == 2 and len(b.shape) == 2
shape = list(a.shape)[:-1] + list(b.shape)
with jt.flag_scope(amp_reg = jt.flags.amp_reg | 4):
a = a.broadcast(shape, [len(shape)-2])
b = b.broadcast(shape)
return (a*b).sum(len(shape)-1)
@ -108,6 +109,7 @@ Example::
c = jt.matmul(a, b)
assert c.shape == [8, 10, 3, 5]
'''
with jt.flag_scope(amp_reg = jt.flags.amp_reg | 4):
len_a = len(a.shape)
len_b = len(b.shape)
if len_b == 1:
@ -488,19 +490,22 @@ class BCEWithLogitsLoss(Module):
def execute(self, output, target):
return binary_cross_entropy_with_logits(output,target,self.weight,self.pos_weight,self.size_average)
def softmax(x, dim = None):
def softmax(x, dim=None, log=False):
import jittor.other.code_softmax as code_softmax
if code_softmax.can_softmax_v1(x, dim):
return code_softmax.softmax_v1(x, log)
if dim is None:
x = (x - x.max()).exp()
ret = x / x.sum()
else:
x = (x-x.max(dim, keepdims=True)).exp()
ret = x / x.sum(dim, keepdims=True)
if log: return ret.log()
return ret
jt.Var.softmax = softmax
def log_softmax(x,dim=None):
x = softmax(x,dim=dim)
return jt.log(x)
return softmax(x,dim=dim, log=True)
jt.Var.log_softmax = log_softmax
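A quick check of the new log path: softmax(..., log=True) and log_softmax should agree with taking the log afterwards (a sketch; the fused CUDA kernel in code_softmax is only used when its constraints are met):

    import jittor as jt
    from jittor import nn
    x = jt.random([4, 1000])
    p = nn.softmax(x, dim=-1)
    logp = nn.log_softmax(x, dim=-1)
    print((logp - p.log()).abs().max())    # expected to be ~0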
def log_sigmoid(x):
@ -829,6 +834,7 @@ class Conv(Module):
oh = (H+self.padding[0]*2-Kh*self.dilation[0]+self.dilation[0]-1)//self.stride[0]+1
ow = (W+self.padding[1]*2-Kw*self.dilation[1]+self.dilation[1]-1)//self.stride[1]+1
assert oh>0 and ow>0
with jt.flag_scope(amp_reg = jt.flags.amp_reg | 4):
xx = x.reindex([N,self.out_channels,C,oh,ow,Kh,Kw], [
'i0', # Nid
'i2', # Cid
@ -1005,6 +1011,18 @@ class Conv3d(Module):
def execute(self, x):
return conv3d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
class Conv1d_sp(Linear):
def __init__(self, inchannels, outchannels, kernel_size=1, bias=True):
super().__init__(inchannels, outchannels, bias=bias)
assert kernel_size == 1
def execute(self, x):
x = x.transpose(0, 2, 1)
x = super().execute(x)
x = x.transpose(0, 2, 1)
return x
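Usage sketch for Conv1d_sp, a 1x1 1-D convolution expressed as a Linear over the channel dimension (assuming it is reachable as jittor.nn.Conv1d_sp):

    import jittor as jt
    from jittor import nn
    conv = nn.Conv1d_sp(16, 32)            # kernel_size must be 1
    x = jt.random([2, 16, 100])            # (N, C_in, L)
    y = conv(x)
    print(y.shape)                         # expected [2, 32, 100]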
def conv2d(x, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
''' Applies a 2D convolution over an input signal composed of several input planes.
@ -1045,6 +1063,7 @@ def conv2d(x, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
Kh, Kw = weight.shape[-2:]
oh = (H+padding[0]*2-Kh*dilation[0]+dilation[0]-1)//stride[0]+1
ow = (W+padding[1]*2-Kw*dilation[1]+dilation[1]-1)//stride[1]+1
with jt.flag_scope(amp_reg = jt.flags.amp_reg | 4):
xx = x.reindex([N,out_channels,C,oh,ow,Kh,Kw], [
'i0', # Nid
'i2', # Cid

View File

@ -1,7 +1,7 @@
#!python3
import os, json
from pathlib import Path
notebook_dir = os.path.join(str(Path.home()), ".cache","jittor","notebook")
import jittor_utils as jit_utils
notebook_dir = os.path.join(jit_utils.home(), ".cache","jittor","notebook")
if not os.path.isdir(notebook_dir):
os.mkdir(notebook_dir)
dirname = os.path.dirname(__file__)

View File

@ -0,0 +1,130 @@
import jittor as jt
from jittor import nn
def can_softmax_v1(a, dim):
if not jt.flags.use_cuda:
return False
if dim != -1 and dim != len(a.shape)-1:
return False
if a.shape[len(a.shape)-1] > 10000:
return False
return True
def softmax_v1(a, log=False):
assert can_softmax_v1(a, -1)
length = a.shape[-1]
# tnum = 1024
tnum = 500 if length % 500 == 0 else 512
tnum = 125 if length % 125 == 0 else 128
# tnum = 125
# tnum = 1000 if length % 1000 == 0 else 1024
# tnum = 250
per_thread = (length-1) // tnum + 1
ILP = 1
for ilp in [8,4,2]:
if length % tnum == 0 and per_thread % ilp == 0:
ILP = ilp
per_thread //= ILP
break
for_loop = f"""
#pragma unroll
for (int i=0; i<{per_thread}; i++)
"""
if length % tnum != 0:
for_loop += f"if ((i*{tnum}+threadIdx.x)*{ILP} < len)\n"
return jt.code(a.shape, a.dtype, [a], cuda_header=f'''
#include <{jt.compile_extern.cub_home}cub/cub.cuh>
#include <type/fp16_compute.h>
''', cuda_src=f'''
__global__ void kernel(in0_type* x, out0_type* y, int len) {{
typedef cub::BlockReduce<float, {tnum}> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
int id = blockIdx.x * len;
in0_type v[{per_thread}][{ILP}];
{for_loop}
vload<sizeof(in0_type)*{ILP}>(v[i], &x[id+(i*{tnum}+threadIdx.x)*{ILP}]);
// v[i] = x[id+i*{tnum}+threadIdx.x];
float v1 = -1e30;
{for_loop}
#pragma unroll
for (int j=0; j<{ILP}; j++) {{
v1 = max(v1, float(v[i][j]));
}}
__shared__ float vmax;
auto tmp = BlockReduce(temp_storage).Reduce(v1, cub::Max());
if (threadIdx.x == 0)
vmax = tmp;
__syncthreads();
v1 = 0;
{for_loop}
#pragma unroll
for (int j=0; j<{ILP}; j++) {{
v[i][j] = expf(float(v[i][j]) - vmax);
v1 += float(v[i][j]);
}}
tmp = BlockReduce(temp_storage).Sum(v1);
__shared__ float vsum;
if (threadIdx.x == 0)
vsum = tmp;
__syncthreads();
{for_loop}
#pragma unroll
for (int j=0; j<{ILP}; j++)
v[i][j] = {
"@expand_op(log,@in0_type,float(v[i][j])/vsum)" if log
else "float(v[i][j])/vsum"
};
{for_loop}
vload<sizeof(in0_type)*{ILP}>(&y[id+(i*{tnum}+threadIdx.x)*{ILP}], v[i]);
}}
int len = in0->shape[in0->shape.size()-1];
int bnum = in0->numel() / len;
cudaGetLastError();
kernel<<<bnum, {tnum}>>>(in0_p, out0_p, len);
CHECK(0 == cudaGetLastError());
''', cuda_grad_src=[f"""
__global__ void kernel(pout0_type* x, dout_type* y, out0_type* z, int len) {{
int id = blockIdx.x * len;
in0_type vx[{per_thread}][{ILP}];
in0_type vy[{per_thread}][{ILP}];
{for_loop} {{
vload<sizeof(in0_type)*{ILP}>(vx[i], &x[id+(i*{tnum}+threadIdx.x)*{ILP}]);
vload<sizeof(in0_type)*{ILP}>(vy[i], &y[id+(i*{tnum}+threadIdx.x)*{ILP}]);
}}
float v1 = 0;
{for_loop}
#pragma unroll
for (int j=0; j<{ILP}; j++)
v1 += {"float(vy[i][j]);" if log else "float(vx[i][j]*vy[i][j]);"}
typedef cub::BlockReduce<float, {tnum}> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
auto tmp = BlockReduce(temp_storage).Sum(v1);
__shared__ float reduce_var;
if (threadIdx.x == 0)
reduce_var = tmp;
__syncthreads();
{for_loop}
#pragma unroll
for (int j=0; j<{ILP}; j++)
vx[i][j] = {
"vy[i][j] - in0_type(expf(vx[i][j]) * reduce_var);" if log
else "vx[i][j] * (vy[i][j] - in0_type(reduce_var));"
}
{for_loop}
vload<sizeof(in0_type)*{ILP}>(&z[id+(i*{tnum}+threadIdx.x)*{ILP}],
vx[i]);
}}
int len = in0->shape[in0->shape.size()-1];
int bnum = in0->numel() / len;
cudaGetLastError();
kernel<<<bnum, {tnum}>>>(pout0_p, dout_p, out0_p, len);
CHECK(0 == cudaGetLastError());
"""])

View File

@ -60,14 +60,14 @@ class Pool(Module):
'''
if not self.return_indices:
forward_body += f'''
@out(i0, i1, i2, i3) = init_{self.op}(out_type);
@out(i0, i1, i2, i3) = @expand_op(init_{self.op}, @out_type);
for (int p = k2; p < k2_; ++p)
for (int q = k3; q < k3_; ++q)
@out(i0, i1, i2, i3) = {self.op}(out_type, @out(i0, i1, i2, i3), @in0(i0, i1, p, q));
@out(i0, i1, i2, i3) = @expand_op({self.op}, @out_type, @out(i0, i1, i2, i3), @out_type, @in0(i0, i1, p, q), @in0_type);
'''
else:
forward_body += f'''
auto out_value = init_{self.op}(out_type);
auto out_value = @expand_op(init_{self.op}, @out_type);
int out_index = -1;
for (int p = k2; p < k2_; ++p)
for (int q = k3; q < k3_; ++q)
@ -105,7 +105,6 @@ class Pool(Module):
return_dtypes = x.dtype
out = jt.code(return_shapes, return_dtypes, [x],
cuda_header="""
#include <ops/binary_op_defs.h>
#include <misc/cuda_limits.h>
""",
cuda_src=f'''
@ -121,8 +120,8 @@ class Pool(Module):
for (int i2 = p2; i2 < out_shape2; i2 += s2)
{{ {forward_body} }}
}}
int tx = min(1024, out_shape3);
int ty = min(1024 / tx, out_shape2);
int tx = std::min(1024, out_shape3);
int ty = std::min(1024 / tx, out_shape2);
int bx = (out_shape2 - 1) / ty + 1;
int by = out_shape1;
int bz = out_shape0;
@ -144,8 +143,8 @@ class Pool(Module):
{{ {backward_body} }}
}}
cudaMemsetAsync(out_p, 0, out->size);
int tx = min(1024, pout_shape3);
int ty = min(1024 / tx, pout_shape2);
int tx = std::min(1024, pout_shape3);
int ty = std::min(1024 / tx, pout_shape2);
int bx = (pout_shape2 - 1) / ty + 1;
int by = pout_shape1;
int bz = pout_shape0;
@ -153,7 +152,7 @@ class Pool(Module):
dim3 s2_(tx, ty);
kernel3<<<s1_, s2_>>>(@ARGS);
'''],
cpu_header='#include <ops/binary_op_defs.h>',
cpu_header='',
cpu_src=f'''
using namespace std;
for (int i0=0; i0<out_shape0; i0++)
@ -242,15 +241,15 @@ class Pool3d(Module):
'''
if not self.return_indices:
forward_body += f'''
@out(i0, i1, i2, i3, i4) = init_{self.op}(out_type);
@out(i0, i1, i2, i3, i4) = @expand_op(init_{self.op}, @out_type);
for (int p = k2; p < k2_; ++p)
for (int q = k3; q < k3_; ++q)
for (int r = k4; r < k4_; ++r)
@out(i0, i1, i2, i3, i4) = {self.op}(out_type, @out(i0, i1, i2, i3, i4), @in0(i0, i1, p, q, r));
@out(i0, i1, i2, i3, i4) = @expand_op({self.op}, @out_type, @out(i0, i1, i2, i3, i4), @out_type, @in0(i0, i1, p, q, r), @in0_type);
'''
else:
forward_body += f'''
auto out_value = init_{self.op}(out_type);
auto out_value = @expand_op(init_{self.op}, @out_type);
int out_index = -1;
for (int p = k2; p < k2_; ++p)
for (int q = k3; q < k3_; ++q)
@ -293,7 +292,6 @@ class Pool3d(Module):
return_dtypes = x.dtype
out = jt.code(return_shapes, return_dtypes, [x],
cuda_header="""
#include <ops/binary_op_defs.h>
#include <misc/cuda_limits.h>
""",
cuda_src=f'''
@ -312,9 +310,9 @@ class Pool3d(Module):
for (int i2 = p2; i2 < out_shape2; i2 += s2)
{{ {forward_body} }}
}}
int tx = min(1024, out_shape4);
int ty = min(1024 / tx, out_shape3);
int tz = min(1024 / tx / ty, out_shape2);
int tx = std::min(1024, out_shape4);
int ty = std::min(1024 / tx, out_shape3);
int tz = std::min(1024 / tx / ty, out_shape2);
int bx = (out_shape2 - 1) / tz + 1;
int by = out_shape1;
int bz = out_shape0;
@ -339,9 +337,9 @@ class Pool3d(Module):
{{ {backward_body} }}
}}
cudaMemsetAsync(out_p, 0, out->size);
int tx = min(1024, pout_shape4);
int ty = min(1024 / tx, pout_shape3);
int tz = min(1024 / tx / ty, pout_shape2);
int tx = std::min(1024, pout_shape4);
int ty = std::min(1024 / tx, pout_shape3);
int tz = std::min(1024 / tx / ty, pout_shape2);
int bx = (pout_shape2 - 1) / tz + 1;
int by = pout_shape1;
int bz = pout_shape0;
@ -349,7 +347,7 @@ class Pool3d(Module):
dim3 s2(tx, ty, tz);
kernel3<<<s1, s2>>>(@ARGS);
'''],
cpu_header='#include <ops/binary_op_defs.h>',
cpu_header='',
cpu_src=f'''
using namespace std;
for (int i0=0; i0<out_shape0; i0++)

View File

@ -509,6 +509,11 @@ void Executor::run_sync(vector<Var*> vars, bool device_sync) {
var->alloc(cpu_allocator);
}
}
} else {
for (Var* v : op->inputs()) {
if (!v->allocator->is_cuda())
migrate_to_gpu(v, allocator);
}
}
#endif
#ifdef NODE_MEMCHECK

View File

@ -22,6 +22,10 @@ struct Executor {
Allocator* temp_allocator;
bool last_is_cuda = false;
void run_sync(vector<Var*> vars, bool device_sync);
inline Allocation alloc_temp(size_t size) {
return Allocation(temp_allocator, size);
}
};
EXTERN_LIB Executor exe;

View File

@ -9,6 +9,7 @@
#include "op_compiler.h"
#include "profiler/profiler.h"
#include "misc/fast_shared_ptr.h"
#include "misc/cuda_flags.h"
namespace jittor {
@ -42,6 +43,7 @@ void FusedOp::update_ops() {
loop_options_tuned.clear();
loop_options = loop_options_origin = nullptr;
_inputs.clear();
_outputs.clear();
for (Op* op : ops) {
for (Var* o : op->outputs()) {
@ -101,6 +103,7 @@ void FusedOp::update_ops() {
if (!(c&2)) {
c += 2 + vars.size()*4;
vars.push_back({i, 0});
_inputs.emplace_back((Node*)i);
}
}
for (Var* o : opi->outputs()) {
@ -135,6 +138,7 @@ FusedOp::FusedOp(const FusedOp& other) {
}
FusedOp::~FusedOp() {
_inputs.clear();
_outputs.clear();
Op::number_of_lived_ops++;
}
@ -159,20 +163,15 @@ void FusedOp::statistics(uint64_t& in, uint64_t& out, uint64_t& compute) {
void FusedOp::do_jit_prepare(JK& jk) {
jk.clear();
int8 flags = 3;
for (uint i=0; i<ops.size(); i++) {
Op* op = ops[i];
jk << "[opkey" << i << JK::val;
op->do_jit_prepare(jk);
jk << op->name();
op->jit_prepare(jk);
jk << JK::end;
if (op->flags.get(NodeFlags::_cpu))
flags &= 1; // only cpu
else
flags &= 2; // only gpu
}
ASSERT(flags) << "FusedOp cannot contain both cpu and cuda ops.";
jk << _CS("[JIT:1]");
if (flags==1) {
if (!use_cuda) {
// only cpu
jk << _CS("[JIT_cpu:1]");
this->flags.set(NodeFlags::_cuda, 0);
@ -189,9 +188,17 @@ void FusedOp::do_jit_prepare(JK& jk) {
jk << JK::hex2(i) << JK::hex1(j) << JK::hex2(k) << JK::hex1(l) << ',';
}
jk << _CS("][var_info:") << JK::val;
for (auto& vi : vars)
bool use_int64_t = false;
for (auto& vi : vars) {
jk << JK::hex1(vi.type) << JK::hex1(vi.var->shape.size());
if (vi.type != 1 && vi.var->num >= std::numeric_limits<int32_t>::max())
use_int64_t = true;
}
jk << JK::end;
if (use_int64_t)
jk << _CS("[index_t:int64]");
else
jk << _CS("[index_t:int32]");
if (loop_options->size()) {
if (get_loop_option("compile_shapes")) {
jk << _CS("[shapes:");

View File

@ -39,11 +39,24 @@ template<class T> struct StackIniter {
#define STACK_ALLOC2(T, a, n) T a[n]
#endif
struct AmpGradGuard {
int amp_reg_bk;
AmpGradGuard(Op* op) {
amp_reg_bk = amp_reg;
amp_reg |= (op->flags.flags >> NodeFlags::_prefer_32);
}
~AmpGradGuard() {
amp_reg = amp_reg_bk;
}
};
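A sketch of the behaviour this guard provides: ops record their amp bits (amp_reg & 7) at creation, and the guard re-applies those bits while their grad ops are built, so a backward pass started outside the flag scope still sees the forward precision policy (the dtype expectation below assumes the inference rules shown later in this diff):

    import jittor as jt
    with jt.flag_scope(amp_reg=2):         # prefer fp16 in the forward pass
        x = jt.random([8, 8]).float16()
        y = (x * x).sum()
    gx = jt.grad(y, [x])[0]                # grad built outside the scope
    print(gx.dtype)                        # expected float16: backward ops inherit the bits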
VarPtr make_grad(Op* op, Var* out, Var* dout, Var* x, int x_index) {
if (dout == nullptr) return nullptr;
if (x_index<0) return nullptr;
LOGvvvv << "Make grad op:" >> op->name() << "inputs:" >> op->inputs()
<< "out:" >> out << "dout:" >> dout << "x:" >> x << "xid:" >> x_index;
AmpGradGuard agg(op);
auto dx = op->grad(out, dout, x, x_index);
if (x->loop_options)
dx->loop_options = x->loop_options;
@ -182,7 +195,10 @@ vector<VarPtr> grad(Var* loss, vector<Var*> targets) {
douts[i] = nullptr;
}
trace_grad_op = op;
{
AmpGradGuard agg(op);
op->grads(douts, dins);
}
// dump "for (Var* in : op->inputs())"
for (int i=0; i<n_i; i++,j++) {
auto id = id_buffer[j].second;

View File

@ -43,7 +43,7 @@ void cleanup() {
}
static void init_cuda_devices() {
#ifdef HAS_CUDA
#ifdef IS_CUDA
if (cuda_archs.size()) return;
int count=0;
cudaGetDeviceCount(&count);

View File

@ -20,6 +20,7 @@
#include "utils/flags.h"
#include "fused_op.h"
#include "utils/str_utils.h"
JPU(header)
namespace jittor {
@ -204,6 +205,8 @@ jit_op_entry_t compile(const string& jit_key, const string& src, const bool is_c
// compiler do not allowed filename too long
CHECK(cc_path.size());
string jit_src_path = Op::get_filename_from_jit_key(jit_key, ".cc");
string* src2 = (string*)&src;
JPU(op_compiler(jit_src_path, *src2, is_cuda_op));
#ifdef _WIN32
string jit_lib_path = Op::get_filename_from_jit_key(jit_key, ".dll");
string jit_src_path2 = _to_winstr(jit_src_path);

View File

@ -167,7 +167,7 @@ inline JK& operator<<(JK& jk, int64 c) {
}
#ifdef __linux__
inline JK& operator<<(JK& jk, long long int c) {
inline JK& operator<<(JK& jk, int64_t c) {
return jk << (int64)c;
}
#endif

View File

@ -107,7 +107,7 @@ void migrate_to_cpu(Var* var, Allocator* allocator) {
if (!use_cuda_managed_allocator) {
// must be a device allocator
Allocation a(allocator, var->size);
checkCudaErrors(cudaMemcpy(a.ptr, var->mem_ptr, var->size, cudaMemcpyDefault));
checkCudaErrors(cudaMemcpy(a.ptr, var->mem_ptr, var->size, cudaMemcpyDeviceToHost));
var->allocator->free(var->mem_ptr, var->size, var->allocation);
var->mem_ptr = a.ptr;
var->allocation = a.allocation;

View File

@ -62,7 +62,7 @@ void display_memory_info(const char* fileline, bool dump_var, bool red_color) {
log << "\n=== display_memory_info ===\n";
log << "total_cpu_ram:" <<
FloatOutput{(double)mem_info.total_cpu_ram, " KMG", 1024, "B"};
log << "total_cuda_ram:" <<
log << "total_device_ram:" <<
FloatOutput{(double)mem_info.total_cuda_ram, " KMG", 1024, "B"} >> "\n";
log << "hold_vars:" << hold_vars.size()
<< "lived_vars:" << Var::number_of_lived_vars
@ -105,7 +105,7 @@ void display_memory_info(const char* fileline, bool dump_var, bool red_color) {
auto total = a->used_memory + a->unused_memory;
all_total += total;
a->is_cuda() ? gpu_total += total : cpu_total += total;
log << "name:" << a->name() << "is_cuda:" << a->is_cuda()
log << "name:" << a->name() << "is_device:" << a->is_cuda()
<< "used:" << FloatOutput{(double)a->used_memory, " KMG", 1024, "B"}
>> "(" >> std::setprecision(p) >> a->used_memory*100.0 / total >> "%)"
<< "unused:" << FloatOutput{(double)a->unused_memory, " KMG", 1024, "B"}
@ -117,7 +117,7 @@ void display_memory_info(const char* fileline, bool dump_var, bool red_color) {
auto total = a->used_memory + a->unused_memory;
all_total += total;
a->is_cuda() ? gpu_total += total : cpu_total += total;
log << "name:" << a->name() << "is_cuda:" << a->is_cuda()
log << "name:" << a->name() << "is_device:" << a->is_cuda()
<< "used:" << FloatOutput{(double)a->used_memory, " KMG", 1024, "B"}
>> "(" >> std::setprecision(p) >> a->used_memory*100.0 / total >> "%)"
<< "unused:" << FloatOutput{(double)a->unused_memory, " KMG", 1024, "B"}
@ -227,9 +227,9 @@ MemInfo::MemInfo() {
total_cuda_ram = 0;
#ifdef HAS_CUDA
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
total_cuda_ram = prop.totalGlobalMem;
size_t gpu_free = 0, _gpu_total = 0;
cudaMemGetInfo(&gpu_free, &_gpu_total);
total_cuda_ram = _gpu_total;
#endif
sigquit_callback.push_back(&meminfo_callback);
}

View File

@ -24,7 +24,7 @@ inline int get_device_count() {
} // jittor
#if CUDART_VERSION < 10000
#if defined(CUDART_VERSION) && CUDART_VERSION < 10000
#define _cudaLaunchHostFunc(a,b,c) \
cudaStreamAddCallback(a,b,c,0)
#define CUDA_HOST_FUNC_ARGS cudaStream_t stream, cudaError_t status, void*

View File

@ -13,7 +13,8 @@ namespace jittor {
struct Deleter {
std::function<void()> del;
inline Deleter(std::function<void()>&& func) : del(move(func)) {}
inline ~Deleter() { del(); }
inline Deleter() {}
inline ~Deleter() { if (del) del(); }
};
} // jittor

View File

@ -17,7 +17,7 @@
namespace jittor {
#ifdef HAS_CUDA
#ifdef IS_CUDA
EXTERN_LIB void check_nan_float32(float32* ptr, int64 num);
EXTERN_LIB void check_nan_float64(float64* ptr, int64 num);
#endif
@ -28,7 +28,7 @@ bool check_nan(Var* v) {
v->input()->name() == string("empty") ||
v->input()->name() == string("setitem")))
return true;
#ifdef HAS_CUDA
#ifdef IS_CUDA
if (v->allocator->is_cuda()) {
if (v->dtype() == ns_float32) {
check_nan_float32((float32*)v->mem_ptr, v->num);

View File

@ -9,6 +9,17 @@
namespace jittor {
DEFINE_FLAG(int, amp_reg, 0, "Auto mixed-precision control registers, bit 0: prefer 32; bit 1: prefer 16; bit 2: keep reduce type; bit 3: keep white list type; bit 4: array-like ops apply the preference too");
DEFINE_FLAG_WITH_SETTER(int, auto_mixed_precision_level, 0, "Auto mixed-precision optimization level, 0: do not use fp16; 1-3: preserve level, do not use fp16 for now; 4: prefer fp16, but some ops use fp32, e.g. sum, exp; 5: similar to 4, and array ops will automatically convert to fp16; 6: all ops prefer fp16");
void setter_auto_mixed_precision_level(int value) {
if (value <= 3) amp_reg = 0; else
if (value == 4) amp_reg = amp_prefer16; else
if (value == 5) amp_reg = amp_prefer16 | amp_array_prefer; else
if (value == 6) amp_reg = amp_prefer16 | amp_array_prefer | amp_keep_reduce | amp_keep_white;
}
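A small sketch of the mapping, using the amp_* bit values defined later in this diff (amp_prefer16 = 2, amp_array_prefer = 16):

    import jittor as jt
    jt.flags.auto_mixed_precision_level = 5
    print(jt.flags.amp_reg)                # expected 18 == amp_prefer16 | amp_array_prefer
    jt.flags.auto_mixed_precision_level = 0
    print(jt.flags.amp_reg)                # back to 0, pure float32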
#define FOR_ALL_TYPES(m) \
m(bool) \
m(int8) \
@ -89,15 +100,18 @@ static unordered_set<string> unary_ops = {
"erfinv"
};
static unordered_set<string> unary_float_ops = {
static unordered_set<string> float_ops = {
"log",
"exp",
"sqrt",
"mean",
"divide",
};
static unordered_set<string> unary_int_ops = {
static unordered_set<string> int_ops = {
"round_int",
"floor_int",
"ceil_int",
"floor_divide",
};
static unordered_set<string> binary_ops = {
@ -127,6 +141,13 @@ static unordered_set<string> binary_ops = {
"mean",
};
static unordered_set<string> white_ops = {
// "log",
"exp",
"pow",
};
#define DEFINE_NS(T) NanoString ns_##T;
FOR_ALL_NS(DEFINE_NS);
@ -135,6 +156,9 @@ char __ns_to_string[ns_max_size*ns_max_len];
int __ns_len[ns_max_size];
static void init_ns() {
dsize_map["float16"] = 1;
is_float_map["float16"] = 1;
is_unsigned["float16"] = 0;
NanoString::ns_t i=0;
auto func = [&](const char* name, NanoString& ns) {
ns.set(NanoString::_index, i++, NanoString::_index_nbits);
@ -149,13 +173,16 @@ static void init_ns() {
if (unary_ops.count(name)) {
ns.set(NanoString::_type, NanoString::_unary, NanoString::_type_nbits);
ns.set(NanoString::_bool, is_bool.count(name));
ns.set(NanoString::_int, unary_int_ops.count(name));
ns.set(NanoString::_float, unary_float_ops.count(name));
ns.set(NanoString::_int, int_ops.count(name));
ns.set(NanoString::_float, float_ops.count(name));
} else
if (binary_ops.count(name)) {
ns.set(NanoString::_type, NanoString::_binary, NanoString::_type_nbits);
ns.set(NanoString::_bool, is_bool.count(name));
ns.set(NanoString::_int, int_ops.count(name));
ns.set(NanoString::_float, float_ops.count(name));
}
ns.set(NanoString::_white_list, white_ops.count(name));
__string_to_ns[name] = ns;
auto name2 = ns.to_cstring();
int len=0;
@ -171,6 +198,7 @@ static void init_ns() {
__string_to_ns["sum"] = ns_add;
__string_to_ns["min"] = ns_minimum;
__string_to_ns["max"] = ns_maximum;
__string_to_ns["half"] = ns_float16;
__string_to_ns["float"] = ns_float32;
__string_to_ns["double"] = ns_float64;
__string_to_ns["int"] = ns_int32;

View File

@ -24,6 +24,7 @@ constexpr int ns_max_len = 16;
m(uint16) \
m(uint32) \
m(uint64) \
m(float16) \
m(float32) \
m(float64) \
\
@ -100,7 +101,7 @@ struct NanoString {
typedef uint16 ns_t;
enum Flags {
// bit0~7: index
_index=0, _index_nbits=8,
_index=0, _index_nbits=7,
_n=_index_nbits,
// bit0-1: type
@ -116,6 +117,8 @@ struct NanoString {
_float=_n+5,
// bit6-7: dsize(1,2,4,8 byte)
_dsize=_n+6, _dsize_nbits=2,
// bit8: white list
_white_list=_n+8,
};
ns_t data=0;
@ -130,11 +133,16 @@ struct NanoString {
inline ns_t index() const { return get(_index, _index_nbits); }
inline int len() const { return __ns_len[index()]; }
inline ns_t type() const { return get(_type, _type_nbits); }
inline ns_t is_bool() const { return get(_bool); }
inline ns_t is_int() const { return get(_int); }
inline ns_t is_unsigned() const { return get(_unsigned); }
inline ns_t is_float() const { return get(_float); }
// @pyjt(is_bool)
inline bool is_bool() const { return get(_bool); }
// @pyjt(is_int)
inline bool is_int() const { return get(_int); }
inline bool is_unsigned() const { return get(_unsigned); }
// @pyjt(is_float)
inline bool is_float() const { return get(_float); }
inline ns_t is_white() const { return get(_white_list); }
inline ns_t dsize() const { return 1<<get(_dsize, _dsize_nbits); }
inline ns_t dsize_() const { return get(_dsize, _dsize_nbits); }
inline ns_t is_dtype() const { return get(_type, _type_nbits)==_dtype; }
inline ns_t is_binary() const { return get(_type, _type_nbits)==_binary; }
inline ns_t is_unary() const { return get(_type, _type_nbits)==_unary; }
@ -156,28 +164,6 @@ struct NanoString {
{ return __ns_to_string+index()*ns_max_len; }
};
// force_type = 1 for int, 2 for float
inline
NanoString dtype_infer(NanoString v1, NanoString v2, int force_type=0, NanoString op=ns_void) {
bool is_float = v1.is_float() || v2.is_float();
int dsize = std::max(v1.dsize(), v2.dsize());
if (force_type == 1)
is_float = false;
else if (force_type == 2)
is_float = true;
if (is_float) {
if (dsize==4) return ns_float32;
return ns_float64;
} else {
if (dsize==8) return ns_int64;
if (dsize==4) return ns_int32;
if (dsize==2) return ns_int16;
if (op.data == ns_add.data || op.data == ns_subtract.data)
return ns_int8;
return v1;
}
}
// @pyjt(NanoString.__eq__)
inline bool eq(const NanoString& a, const NanoString& b) {
return a.data == b.data;
@ -199,4 +185,72 @@ inline std::ostream& operator<<(std::ostream& os, const NanoString& v) {
return os << v.to_cstring();
}
EXTERN_LIB int amp_reg;
constexpr int amp_prefer32 = 1;
constexpr int amp_prefer16 = 2;
constexpr int amp_keep_reduce = 4;
constexpr int amp_keep_white = 8;
constexpr int amp_array_prefer = 16;
inline NanoString float_dtype(int dsize_) {
if (amp_reg & amp_prefer32) return ns_float32;
if (amp_reg & amp_prefer16) return ns_float16;
return (dsize_ == 3) ? ns_float64 :
(dsize_ == 2 ) ? ns_float32 : ns_float16;
}
inline NanoString int_dtype(int dsize_) {
return (dsize_ == 3) ? ns_int64 :
(dsize_ == 2) ? ns_int32 :
(dsize_ == 1) ? ns_int16 : ns_int8;
}
inline NanoString dtype_infer(NanoString x, NanoString y) {
int dsize_ = std::max(x.dsize_(), y.dsize_());
bool is_float = x.is_float() || y.is_float();
if (is_float)
return float_dtype(dsize_);
else {
return int_dtype(dsize_);
}
}
inline NanoString binary_dtype_infer(NanoString op, NanoString x, NanoString y) {
if (op.is_bool()) return ns_bool;
int dsize_ = std::max(x.dsize_(), y.dsize_());
bool is_float = !op.is_int() &&
(x.is_float() || y.is_float() || op.is_float());
if (is_float) {
if (op.is_white() && !(amp_reg & amp_keep_white))
return (dsize_ == 3) ? ns_float64 : ns_float32;
return float_dtype(dsize_);
} else {
return int_dtype(dsize_);
}
}
inline NanoString unary_dtype_infer(NanoString op, NanoString x) {
if (op.is_bool()) return ns_bool;
int dsize_ = x.dsize_();
if (op.is_float()) {
if (op.is_white() && !(amp_reg & amp_keep_white))
return (dsize_ == 3) ? ns_float64 : ns_float32;
return float_dtype(dsize_);
}
if (op.is_int()) return int_dtype(dsize_);
return x;
}
inline NanoString reduce_dtype_infer(NanoString op, NanoString x) {
bool is_float = x.is_float() || op.is_float();
int dsize_ = x.dsize_();
if (is_float) {
if (amp_reg & amp_keep_reduce)
return float_dtype(dsize_);
return (dsize_ == 3) ? ns_float64 : ns_float32;
} else {
return x;
}
}
}
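A rough Python-level check of the inference rules above, assuming default flags (amp_reg == 0) and that the Python dtypes mirror this inference:

    import jittor as jt
    a = jt.random([8, 8]).float16()
    b = jt.random([8, 8]).float16()
    print((a + b).dtype)      # float16: a plain binary op keeps the half type
    print(a.exp().dtype)      # float32: exp is white-listed, promoted unless amp_keep_white is set
    print(a.sum().dtype)      # float32: reductions promote unless amp_keep_reduce is set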

View File

@ -16,12 +16,12 @@
namespace jittor {
#if defined(__clang__)
#if __cplusplus < 201400L || defined(IS_ACL)
using string_view = string;
#elif defined(__clang__)
using std::string_view;
#elif defined(__GNUC__)
using std::experimental::string_view;
#elif __cplusplus < 201400L
using string_view = string;
#else
using std::string_view;
#endif

View File

@ -51,8 +51,14 @@ struct NodeFlags {
_grads=_n+6,
// bit7: has graph optimize
_has_gopt=_n+7,
// bit7: has vary input
// bit8: has vary input
_has_vary_input=_n+8,
// bit9: prefer 32 bit
_prefer_32=_n+9,
// bit10: force 16 bit
_prefer_16=_n+10,
// bit11: reduce keep type unchange
_reduce_keep=_n+11,
};
inline void set(Flags f, int a=1, int nbits=1) {
@ -90,7 +96,7 @@ struct Node {
operator Var*() { return (Var*)node; }
operator var_output_t() { return {(Op*)node, index}; }
};
static int64_t tflag_count;
static int64 tflag_count;
NodeFlags flags;
NanoString ns;
inline bool is_var() const { return flags.get(NodeFlags::_var); }

View File

@ -25,11 +25,12 @@ DEFINE_FLAG(int, try_use_32bit_index, 0,
string_view_map<jit_op_entry_t> jit_ops;
string_view_map<string> jit_key_mapper;
int64_t Op::number_of_lived_ops = 0;
int64 Op::number_of_lived_ops = 0;
Op::Op() {
flags.set(NodeFlags::_var, 0);
flags.set(NodeFlags::_cpu, 1);
flags.flags |= ((amp_reg & 7) << NodeFlags::_prefer_32);
number_of_lived_ops++;
if (PREDICT_BRANCH_NOT_TAKEN(trace_py_var)) trace_data.record_node(this);
}
@ -122,43 +123,24 @@ void Op::do_jit_prepare(JK& jk) {
if (has_cuda && has_cpu && !use_cuda)
flags.set(NodeFlags::_cuda, 0);
} else {
// check use int64_t as index_t if array is too big
int in_id=0, out_id=0;
bool use_int64_t = false;
// TODO: fused op do not have inputs,
// check use_cuda_op from outputs may not be enough
bool use_cuda_op = use_cuda;
for (Var* var : inputs()) {
if (var->mem_ptr) {
/* jit key don't include here, because
parallel compiler don't known
jk << JK::key << "alloc_i" << JK::hex1(in_id)
<< JK::hex1(var->allocator->flags()) << JK::end;
*/
use_cuda_op &= var->allocator->is_cuda();
}
if (var->num >= std::numeric_limits<int32_t>::max())
use_int64_t = true;
in_id ++;
}
for (Var* var : outputs()) {
if (var->mem_ptr) {
/*
jk << JK::key << "alloc_o" << JK::hex1(in_id)
<< JK::hex1(var->allocator->flags()) << JK::end;
*/
use_cuda_op &= var->allocator->is_cuda();
}
if (var->num >= std::numeric_limits<int32_t>::max())
use_int64_t = true;
out_id ++;
}
jk << _CS("[JIT:1]");
if (use_cuda_op && flags.get(NodeFlags::_cuda)) {
jk << _CS("[JIT_cuda:1]");
flags.set(NodeFlags::_cpu, 0);
// TODO: 64bit index in CUDA
use_int64_t = false;
// use_int64_t = false;
} else {
if (use_cuda==2) {
if (flags.get(NodeFlags::_cuda))
@ -268,11 +250,15 @@ void Op::jit_run(JK& jk) {
void Op::statistics(uint64_t& in, uint64_t& out, uint64_t& compute) {
in = out = compute = 0;
for (Var* var : inputs()) {
for (auto& e : _inputs) {
auto var = e.node->var();
if (e.back->index<0) continue;
in += var->size;
compute = std::max(compute, (uint64_t)var->num);
}
for (Var* var : outputs()) {
for (auto& e : _outputs) {
auto var = e.node->var();
if (e.index<0) continue;
out += var->size;
compute = std::max(compute, (uint64_t)var->num);
}

View File

@ -15,7 +15,7 @@ namespace jittor {
enum OpType {other=0, element=1, broadcast=2, reduce=3};
struct Op : Node {
vector<VarPtr> outputs_holder;
static int64_t number_of_lived_ops;
static int64 number_of_lived_ops;
inline Caster<Var*, Node::input_t> inputs() { CHECK_EXIST; return &_inputs; }
inline Caster<Var*, Node::output_t> outputs() { CHECK_EXIST; return &_outputs; }

View File

@ -112,7 +112,7 @@ int OpCompiler::total_member_count() {
return member_count;
}
int64_t OpCompiler::eval(const string& expr, const unordered_map<string,string>& vars) {
int64 OpCompiler::eval(const string& expr, const unordered_map<string,string>& vars) {
if (expr.find("@") != string::npos) {
string new_expr;
for (size_t i=0; i<expr.size(); i++) {
@ -223,6 +223,16 @@ void load_macros(const string& src, unordered_map<string,string>& macros) {
}
}
string expand_op_search(const vector<string>& args) {
for (auto op_type : op_types) {
string ret = op_type->expand_op(args);
if (ret.size())
return ret;
}
LOGf << "No expand op pattern found for args:" << args;
return "";
}
void expand_macro(const string& macro, const vector<string>& args, string& new_src) {
LOGvvvv << "expand_macro" << macro << "args:" << args;
if (macro.size() == 0 || macro[0] != '<') {
@ -434,6 +444,7 @@ string precompile(unordered_map<string,string> defs, string src, unordered_map<s
vector<string> args;
size_t l = k+1;
if (expr == "for" || expr == "if" || expr == "expand_macro" ||
expr == "expand_op" ||
expr == "is_def" || expr == "python" ||
(k<src.size() && src[k]=='(')) {
ASSERT(src[k] == '(');
@ -555,6 +566,18 @@ string precompile(unordered_map<string,string> defs, string src, unordered_map<s
i = l-1;
continue;
} else
if (expr == "expand_op") {
// syntax: @expand_op(args)
for (auto& arg : args) {
uint p=0;
while (p<arg.size() && arg[p] == ' ') p++;
arg = precompile(defs, arg.substr(p), macros);
}
string ns = expand_op_search(args);
new_src += precompile(defs, ns, macros);
i = l-1;
continue;
} else
if (expr == "define") {
// syntax: @define(macro, value)
// ij k l
@ -846,6 +869,9 @@ string OpCompiler::__get_fused_src(
};
auto not_change = [&](const string& s) -> bool {
if (unchanged.count(s)) return true;
for (auto op_type : op_types)
if (op_type->types.count(s))
return true;
return (s.find("::") != string::npos) || (s.find("LOG") != string::npos);
};
// regex find XxxXxxOp::jit_run
@ -1043,7 +1069,7 @@ jit_op_entry_t OpCompiler::compile(const string& jit_key, const string& src) {
if (v->loop_options)
for (auto& kv : v->loop_options.data()) {
if (kv.second && startswith(kv.first, "FLAGS:"))
extra_flags += " "+kv.first.substr(6)+" ";
extra_flags += " " + kv.first.substr(6) + " ";
}
return jit_compiler::compile(jit_key, src, is_cuda, extra_flags);
}
@ -1052,6 +1078,8 @@ jit_op_entry_t OpCompiler::do_compile(Op* op) {
jittor::lock_guard lg;
OpCompiler oc(op);
string* src = &oc.src;
for (auto op_type : op_types)
op_type->post_pass(&oc);
string src_after_passes;
// if is fused op
if (oc.op) {

View File

@ -8,7 +8,6 @@
#include "var.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/binary_op_defs.h"
#include "ops/op_register.h"
namespace jittor {
@ -419,21 +418,13 @@ unordered_set<string> binary_ops = {
"bitwise_xor",
};
NanoString binary_dtype_infer(NanoString op, Var* x, Var* y) {
if (op == ns_mean) return dtype_infer(x->ns, y->ns, 2); // force float
int force_type=0;
if (op == ns_divide) force_type=2; // force float
if (op == ns_floor_divide) force_type=1; // force int
return op.is_bool() ? ns_bool : dtype_infer(x->ns, y->ns, force_type, op);
}
BinaryOp::BinaryOp(Var* x, Var* y, NanoString op) : x(x), y(y) {
flags.set(NodeFlags::_cpu);
flags.set(NodeFlags::_cuda);
set_type(OpType::element);
ns = op;
ASSERT(ns.is_binary());
z = create_output(nullptr, binary_dtype_infer(op, x, y));
z = create_output(nullptr, binary_dtype_infer(op, x->ns, y->ns));
}
VarPtr dirty_clone_broadcast(Var* v) {
@ -554,7 +545,7 @@ void BinaryOp::jit_run() {
auto* __restrict__ zp = z->ptr<Tz>();
index_t num = z->num;
for (index_t i=0; i<num; i++)
zp[i] = @expand_macro(@OP, Tz, xp[i], yp[i]);
zp[i] = @expand_op(@OP, @Tz, xp[i], @Tx, yp[i], @Ty);
}
#endif // JIT

View File

@ -1,61 +0,0 @@
// ***************************************************************
// Copyright (c) 2021 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#pragma once
#include "common.h"
namespace jittor {
#ifdef JIT_cuda
#define pow(T,a,b) ::pow(a,b)
#define maximum(T,a,b) ::max(T(a), T(b))
#define minimum(T,a,b) ::min(T(a), T(b))
#define mod(T,a,b) @if(@strcmp(@T,float32)==0,(a-::floorf((a)/(b))*(b)),@if(@strcmp(@Tx,float64)==0,(a-::floor((a)/(b))*(b)),(a%b)))
#else // JIT_cpu
#define pow(T,a,b) std::pow(a,b)
#define maximum(T,a,b) std::max(T(a), T(b))
#define minimum(T,a,b) std::min(T(a), T(b))
#define mod(T,a,b) @if(@strcmp(@T,float32)==0,(a-std::floor((a)/(b))*(b)),@if(@strcmp(@Tx,float64)==0,(a-std::floor((a)/(b))*(b)),(a%b)))
#endif
#define add(T,a,b) ((a)+(b))
#define subtract(T,a,b) ((a)-(b))
#define multiply(T,a,b) ((a)*(b))
#define divide(T,a,b) (T((T(a))/(T(b))))
#define floor_divide(T,a,b) (T((T(a))/(T(b))))
#define less(T,a,b) ((a)<(b))
#define less_equal(T,a,b) ((a)<=(b))
#define greater(T,a,b) ((a)>(b))
#define greater_equal(T,a,b) ((a)>=(b))
#define equal(T,a,b) ((a)==(b))
#define not_equal(T,a,b) ((a)!=(b))
#define left_shift(T,a,b) ((a)<<(b))
#define right_shift(T,a,b) ((a)>>(b))
#define logical_and(T,a,b) ((a)&&(b))
#define logical_or(T,a,b) ((a)||(b))
#define logical_xor(T,a,b) ((bool(a))!=(bool(b)))
#define bitwise_and(T,a,b) ((a)&(b))
#define bitwise_or(T,a,b) ((a)|(b))
#define bitwise_xor(T,a,b) ((a)^(b))
#define mean(T,a,b) ((a)+T(b)*(T(rcount)))
#ifdef JIT_cuda
#define init_maximum(T) ::numeric_min<T>()
#define init_minimum(T) ::numeric_max<T>()
#else
#define init_maximum(T) std::numeric_limits<T>::lowest()
#define init_minimum(T) std::numeric_limits<T>::max()
#endif
#define init_add(T) T(0)
#define init_multiply(T) T(1)
#define init_logical_and(T) true
#define init_logical_or(T) false
#define init_logical_xor(T) false
#define init_bitwise_and(T) T(-1)
#define init_bitwise_or(T) T(0)
#define init_bitwise_xor(T) T(0)
#define init_mean(T) T(0)
} // jittor

View File

@ -91,7 +91,7 @@ void CandidateOp::jit_run() {
int n=0;
// checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaMemcpy(&n, np, 4, cudaMemcpyDefault));
checkCudaErrors(cudaMemcpy(&n, np, 4, cudaMemcpyDeviceToHost));
y->set_shape({n});
exe.temp_allocator->free(np, 4, n_allocation);
exe.temp_allocator->free(maskp, xshape0, mask_allocation);

View File

@ -40,7 +40,7 @@ void CopyOp::run() {
auto y_ptr = outputs().front()->mem_ptr;
#ifdef HAS_CUDA
if (flags.get(NodeFlags::_cuda)) {
checkCudaErrors(cudaMemcpyAsync(y_ptr, x_ptr, size, cudaMemcpyDefault, 0));
checkCudaErrors(cudaMemcpyAsync(y_ptr, x_ptr, size, cudaMemcpyDeviceToDevice, 0));
} else
#endif
{

View File

@ -121,13 +121,18 @@ void FetchOp::run() {
checkCudaErrors(cudaStreamWaitEvent(stream, event, 0));
new (&allocation) Allocation(&cuda_dual_allocator, v->size);
// mostly device to device
#if IS_CUDA
checkCudaErrors(cudaMemcpyAsync(
allocation.ptr, v->mem_ptr, v->size, cudaMemcpyDefault, stream));
#else
checkCudaErrors(cudaMemcpyAsync(
allocation.ptr, v->mem_ptr, v->size, cudaMemcpyDeviceToDevice, stream));
#endif
auto host_ptr = cuda_dual_allocator.get_dual_allocation(
allocation.allocation).host_ptr;
// device to host
checkCudaErrors(cudaMemcpyAsync(
host_ptr, allocation.ptr, v->size, cudaMemcpyDefault, stream));
host_ptr, allocation.ptr, v->size, cudaMemcpyDeviceToHost, stream));
allocation.ptr = host_ptr;
has_cuda_memcpy = true;
} else

View File

@ -33,4 +33,12 @@ OpInfo get_op_info(const string& name) {
return op_info_map.at(op_file_name);
}
vector<OpByType*> op_types;
int registe_op_type(OpByType* op_type) {
op_types.push_back(op_type);
return 0;
}
} // jittor

View File

@ -32,4 +32,14 @@ void op_registe(const OpInfo& op_info);
bool has_op(const string& name);
OpInfo get_op_info(const string& name);
struct OpCompiler;
struct OpByType {
unordered_set<string> types;
virtual string expand_op(const vector<string>& args) = 0;
virtual void post_pass(OpCompiler*) = 0;
};
extern vector<OpByType*> op_types;
int registe_op_type(OpByType*);
} // jittor
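A hedged sketch (illustrative names, not part of the commit) of how a backend plugs into this new interface, mirroring what common_op_type.cc and fp16_op_type.cc do further down:

#include "ops/op_register.h"
#include "utils/str_utils.h"
namespace jittor {
struct DemoIntOpType : OpByType {
    DemoIntOpType() { types = {"int32"}; }
    string expand_op(const vector<string>& args) {
        // args layout follows the @expand_op call sites:
        // {op, $1 = result type, $2 = lhs, $3 = lhs type, $4 = rhs, $5 = rhs type}
        for (int i=1; i<args.size(); i+=2)
            if (!types.count(args[i])) return "";  // not ours, let another plugin handle it
        if (args.at(0) == "add")
            return format("(($2)+($4))", args);
        return "";
    }
    void post_pass(OpCompiler*) {}  // no extra source rewriting needed here
};
static int _demo = registe_op_type(new DemoIntOpType());
} // jittor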

View File

@ -8,7 +8,6 @@
#include <limits>
#include "var.h"
#include "ops/reduce_op.h"
#include "ops/binary_op_defs.h"
#include "ops/op_register.h"
#include "executor.h"
@ -272,7 +271,7 @@ ReduceOp::ReduceOp(Var* x, NanoString op, NanoVector dims, bool keepdims)
if (x->dtype() == ns_bool)
y = create_output(nullptr, ns_int32);
else
y = create_output(nullptr, binary_dtype_infer(ns, x, x));
y = create_output(nullptr, reduce_dtype_infer(ns, x->ns));
}
ReduceOp::ReduceOp(Var* x, NanoString op, uint dims_mask, uint keepdims_mask)
@ -284,7 +283,7 @@ ReduceOp::ReduceOp(Var* x, NanoString op, uint dims_mask, uint keepdims_mask)
ASSERT(ns.is_binary());
reduce_mask = dims_mask;
this->keepdims_mask = keepdims_mask;
y = create_output(nullptr, binary_dtype_infer(ns, x, x));
y = create_output(nullptr, reduce_dtype_infer(ns, x->ns));
}
ReduceOp::ReduceOp(Var* x, NanoString op, int dim, bool keepdims)
@ -360,18 +359,18 @@ void ReduceOp::jit_run() {
@for(i, DIM-2, -1, -1, auto ystride@i = ystride@{i+1} * yshape@{i+1};)
index_t xstride@{DIM-1} = 1;
@for(i, DIM-2, -1, -1, auto xstride@i = xstride@{i+1} * xshape@{i+1};)
Ty count = Ty(x->num) / Ty(y->num);
Ty rcount = Ty(y->num) / Ty(x->num);
Ty count = x->num*1.0 / y->num;
Ty rcount = y->num*1.0 / x->num;
@for(d, 0, DIM,@if(REDUCE>>d&1,, for (index_t xi@d=0; xi@d < xshape@d; xi@d++))) {
auto yid = 0 @for(d, 0, DIM,@if(REDUCE>>d&1,, + xi@d * ystride@d));
yp[yid] = @expand_macro(init_@OP, Ty);
yp[yid] = @expand_op(init_@OP, @Ty);
}
@for(d, 0, DIM,@if(REDUCE>>d&1,, for (index_t xi@d=0; xi@d < xshape@d; xi@d++))) {
@for(d, 0, DIM,@if(REDUCE>>d&1, for (index_t xi@d=0; xi@d < xshape@d; xi@d++),)) {
auto yid = 0 @for(d, 0, DIM,@if(REDUCE>>d&1,, + xi@d * ystride@d));
auto xid = 0 @for(d, 0, DIM, + xi@d * xstride@d);
yp[yid] = @expand_macro(@OP, Ty, yp[yid], xp[xid]);
yp[yid] = @expand_op(@OP, @Ty, yp[yid], @Ty, xp[xid], @Tx);
}
}
(void)count, (void)rcount, (void)yshape0, (void)ystride0;

View File

@ -132,7 +132,7 @@ void ReindexOp::jit_run() {
@for(d, 0, XDIM, index_t xid@d = @expand_macro(INDEX@d);)
auto xid = @for(d, 0, XDIM, + xid@d * xstride@d);
bool check_overflow = 0 @for(d, 0, XDIM, || xid@d<0 || xid@d>=xshape@d) @for(d, 0, OSIZE, || (@expand_macro(OFD@d)));
yp[yid] = check_overflow ? (@OVERFLOW) : xp[xid];
yp[yid] = check_overflow ? Tx(@OVERFLOW) : xp[xid];
}
}
#endif // JIT

View File

@ -8,7 +8,6 @@
#include <limits>
#include "var.h"
#include "ops/reindex_reduce_op.h"
#include "ops/binary_op_defs.h"
#include "ops/op_register.h"
namespace jittor {
@ -112,7 +111,7 @@ void ReindexReduceOp::jit_run() {
@for(d, 0, XDIM, for (index_t i@d=0; i@d < xshape@d; i@d++)) {
auto xid = @for(d, 0, XDIM, + i@d * xstride@d);
xp[xid] = @expand_macro(init_@OP, Tx);
xp[xid] = @expand_op(init_@OP, @Tx);
}
// generate d-for loop
@for(d, 0, YDIM, for (index_t i@d=0; i@d < yshape@d; i@d++)) {
@ -121,7 +120,7 @@ void ReindexReduceOp::jit_run() {
auto xid = @for(d, 0, XDIM, + xid@d * xstride@d);
bool check_overflow = 0 @for(d, 0, XDIM, || xid@d<0 || xid@d>=xshape@d) @for(d, 0, OSIZE, || (@expand_macro(OFD@d)));
if (!check_overflow)
xp[xid] = @expand_macro(@OP, Tx, xp[xid], yp[yid]);
xp[xid] = @expand_op(@OP, @Tx, xp[xid], @Tx, yp[yid], @Tx);
}
}
#endif // JIT

View File

@ -9,7 +9,6 @@
#include "ops/setitem_op.h"
#include "ops/getitem_op.h"
#ifdef JIT
#include "ops/binary_op_defs.h"
#ifdef JIT_cuda
#include <cuda_runtime.h>
#include "helper_cuda.h"
@ -313,7 +312,7 @@ void SetitemOp::jit_run() {
std::memcpy(op, ip, out->size);
#else
if (op != ip)
checkCudaErrors(cudaMemcpyAsync(op, ip, out->size, cudaMemcpyDefault, 0));
checkCudaErrors(cudaMemcpyAsync(op, ip, out->size, cudaMemcpyDeviceToDevice, 0));
#endif
if (flags.get((NodeFlags::Flags(SetitemOp::_data_inplaced))) &&
@ -340,12 +339,12 @@ void SetitemOp::jit_run() {
@if(@is_def(JIT_cpu),
@if(@strcmp(@OP,void)==0,
op[iid] = (Ti)dp[did],
op[iid] = @expand_macro(@OP, Ti, op[iid], dp[did])
op[iid] = @expand_op(@OP, @Ti, op[iid], @Ti, dp[did], @Td)
);
,
@if(@strcmp(@OP,void)==0, op[iid] = (Ti)dp[did],
@if(@strcmp(@OP,add)==0, atomicAdd(&op[iid], (Ti)dp[did]),
op[iid] = @expand_macro(@OP, Ti, op[iid], dp[did])
op[iid] = @expand_op(@OP, @Ti, op[iid], @Ti, dp[did], @Td)
)
);
)

View File

@ -28,6 +28,12 @@ TransposeOp::TransposeOp(Var* x, NanoVector axes_) : x(x), axes(axes_) {
for (int i=0; i<(int)xdim; i++)
axes.push_back(xdim-1-i);
}
if (axes.size() < xdim || (axes.size() == xdim && axes[xdim-1]==xdim-1)) {
static VarPtr(*fuse_transpose)(Var*, NanoVector) = get_op_info("fuse_transpose").get_constructor<VarPtr, Var*, NanoVector>();
auto var = fuse_transpose(x, axes);
forward(var);
return;
}
#ifdef HAS_CUDA
if (use_cuda) {
static VarPtr(*cutt_transpose)(Var*, NanoVector) = nullptr;

View File

@ -8,7 +8,6 @@
#include "misc/cpu_math.h"
#include "var.h"
#include "ops/unary_op.h"
#include "ops/unary_op_defs.h"
#include "ops/op_register.h"
namespace jittor {
@ -33,6 +32,7 @@ static unordered_set<string> unary_ops = {
"uint16",
"uint32",
"uint64",
"float16",
"float32",
"float64",
// please keep float64 the last type
@ -534,22 +534,15 @@ UnaryOp::UnaryOp(Var* x, NanoString op) : x(x) {
ns = op;
ASSERT(ns.is_unary() | ns.is_dtype());
NanoString dtype;
if (ns.is_dtype()) {
if (ns == x->dtype()) {
forward(x);
return;
}
if (ns.is_dtype()) {
dtype = ns;
ns = ns_cast;
} else if (ns.is_bool())
dtype = ns_bool;
else if (ns.is_float())
dtype = dtype_infer(x->ns, x->ns, 2);
else if (ns.is_int())
dtype = dtype_infer(x->ns, x->ns, 1);
else {
dtype = x->ns;
}
} else
dtype = unary_dtype_infer(ns, x->ns);
y = create_output(nullptr, dtype);
}
@ -688,7 +681,7 @@ void UnaryOp::jit_run() {
auto* __restrict__ yp = y->ptr<Ty>();
index_t num = y->num;
for (index_t i=0; i<num; i++)
yp[i] = @expand_macro(@OP, Ty, xp[i]);
yp[i] = @expand_op(@OP, @Ty, xp[i], @Tx);
}
#endif // JIT

View File

@ -1,84 +0,0 @@
// ***************************************************************
// Copyright (c) 2021 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#pragma once
#include "common.h"
namespace jittor {
#define logical_not(T,x) (!(x))
#define bitwise_not(T,x) (~(x))
#define negative(T,x) (-(x))
#ifdef JIT_cuda
// TODO: add float64 version
#define abs(T,x) ::abs(x)
#define log(T,x) ::logf((T)(x))
#define exp(T,x) ::expf((T)(x))
#define sqrt(T,x) ::sqrtf((T)(x))
#define round(T,x) ((T) ::roundf((x)))
#define floor(T,x) ((T) ::floorf((x)))
#define ceil(T,x) ((T) ::ceilf((x)))
#define round_int(T,x) ((T) ::roundf((x)))
#define floor_int(T,x) ((T) ::floorf((x)))
#define ceil_int(T,x) ((T) ::ceilf((x)))
#define sin(T,x) ((T) ::sinf((x)))
#define asin(T,x) ((T) ::asinf((x)))
#define sinh(T,x) ((T) ::sinhf((x)))
#define asinh(T,x) ((T) ::asinhf((x)))
#define cos(T,x) ((T) ::cosf((x)))
#define acos(T,x) ((T) ::acosf((x)))
#define cosh(T,x) ((T) ::coshf((x)))
#define acosh(T,x) ((T) ::acoshf((x)))
#define tan(T,x) ((T) ::tanf((x)))
#define atan(T,x) ((T) ::atanf((x)))
#define tanh(T,x) ((T) ::tanhf((x)))
#define atanh(T,x) ((T) ::atanhf((x)))
#define sigmoid(T,x) ((T) (1.0f/(1.0f+::expf((::min(T(-(x)), T(@if(@strcmp(@T,float32)==0,30,300))))))))
#define erf(T,x) ((T) ::erff((x)))
#define erfinv(T,x) ((T) ::erfinvf((T)(x)))
#else
#define abs(T,x) std::abs(x)
#define log(T,x) std::log((T)(x))
#define exp(T,x) std::exp((T)(x))
#define sqrt(T,x) std::sqrt((T)(x))
#define round(T,x) ((T)std::round((x)))
#define floor(T,x) ((T)std::floor((x)))
#define ceil(T,x) ((T)std::ceil((x)))
#define round_int(T,x) ((T)std::round((x)))
#define floor_int(T,x) ((T)std::floor((x)))
#define ceil_int(T,x) ((T)std::ceil((x)))
#define sin(T,x) ((T) std::sin((x)))
#define asin(T,x) ((T) std::asin((x)))
#define sinh(T,x) ((T) std::sinh((x)))
#define asinh(T,x) ((T) std::asinh((x)))
#define cos(T,x) ((T) std::cos((x)))
#define acos(T,x) ((T) std::acos((x)))
#define cosh(T,x) ((T) std::cosh((x)))
#define acosh(T,x) ((T) std::acosh((x)))
#define tan(T,x) ((T) std::tan((x)))
#define atan(T,x) ((T) std::atan((x)))
#define tanh(T,x) ((T) std::tanh((x)))
#define atanh(T,x) ((T) std::atanh((x)))
#define sigmoid(T,x) ((T) (1.0f/(1.0f+std::exp(std::min(T(-(x)), T(@if(@strcmp(@T,float32)==0,30,300)))))))
#define erf(T,x) ((T) std::erf((x)))
#define erfinv(T,x) (jittor::_erfinv(x))
#endif
#define cast(T,x) ((T)(x))
} // jittor

View File

@ -230,7 +230,7 @@ void WhereOp::jit_run() {
int n=0;
// checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaMemcpy(&n, np, 4, cudaMemcpyDefault));
checkCudaErrors(cudaMemcpy(&n, np, 4, cudaMemcpyDeviceToHost));
@for(i, 0, NDIM, outs[@i]->set_shape({n});)
exe.temp_allocator->free(np, 4, n_allocation);
}

View File

@ -25,6 +25,7 @@
namespace jittor {
using namespace expr;
extern int use_cuda;
struct OpInspector {
// binary mask for
@ -229,9 +230,14 @@ void ConvTuner::forwardTune(FusedOp* fop) {
if (!(bop->y->input() && bop->x->input() && fop->has(bop->x->input()) && fop->has(bop->y->input()))) continue;
if (!(bop->x->input()->type()==OpType::broadcast && bop->y->input()->type()==OpType::broadcast)) return;
// only support float32 currently
// only support float32,float16 currently
if (use_cuda) {
if (bop->z->dtype() != ns_float32 && bop->z->dtype() != ns_float16)
continue;
} else {
if (bop->z->dtype() != ns_float32)
continue;
}
Op* ops[3] = {op, bop->x->input(), bop->y->input()};
int ok = 0;
LOGvvvv << "conv like op" << fop << fop->get_jit_key(get_jk());

View File

@ -23,6 +23,7 @@
#include "fused_op.h"
#include "profiler/memory_checker.h"
#include "misc/deleter.h"
#include "executor.h"
namespace jittor {
@ -30,6 +31,8 @@ Profiler profiler;
DEFINE_FLAG(int, profiler_warmup, 0, "Profiler warmup.");
DEFINE_FLAG(int, profiler_rerun, 0, "Profiler rerun.");
DEFINE_FLAG(int, profiler_record_peek, 0, "Profiler record peek mem bandwidth.");
DEFINE_FLAG(int, profiler_record_shape, 0, "Profiler record shape for op.");
DEFINE_FLAG(int, profiler_hide_relay, 0, "Profiler hide relayed op.");
DEFINE_FLAG_WITH_SETTER(int, profiler_enable, 0, "Enable profiler.");
@ -54,6 +57,8 @@ void Profiler::start(int64 warmup, int64 rerun) {
profiler.records.clear();
profiler.warmup = warmup;
profiler.rerun = rerun;
profiler.relay_extra_cost = 0;
profiler.relay_fop = 0;
}
void Profiler::stop() {
@ -138,6 +143,60 @@ static string get_stack_info(Op* op) {
}
}
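// Estimate peak memory bandwidth by timing repeated memcpys of roughly the
// op's data volume; the elapsed nanoseconds accumulate into peek_time_total.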
static void stat_peek_bandwidth(uint64 in, uint64 out, uint64 loop, uint64& peek_time_total) {
auto size = (in+out) / 2;
// memcpy performance can drop in some unaligned cases
size &= ~((1 << 12)-1);
// size = 7680000*4;
auto temp1 = exe.alloc_temp(size);
auto temp2 = exe.alloc_temp(size);
loop = 1 << loop;
int warmup = std::max(loop/8, (uint64)1);
for (int i=0; i<warmup; i++)
#ifdef HAS_CUDA
if (use_cuda)
cudaMemcpyAsync(temp1.ptr, temp2.ptr, size, cudaMemcpyDeviceToDevice, 0);
else
#endif
std::memcpy(temp1.ptr, temp2.ptr, size);
#ifdef HAS_CUDA
if (use_cuda)
checkCudaErrors(cudaDeviceSynchronize());
#endif
auto start = std::chrono::high_resolution_clock::now();
for (int i=0; i<loop; i++)
#ifdef HAS_CUDA
if (use_cuda)
cudaMemcpyAsync(temp1.ptr, temp2.ptr, size, cudaMemcpyDeviceToDevice, 0);
else
#endif
std::memcpy(temp1.ptr, temp2.ptr, size);
#ifdef HAS_CUDA
if (use_cuda)
checkCudaErrors(cudaDeviceSynchronize());
#endif
auto finish = std::chrono::high_resolution_clock::now();
auto total_ns = (int64_t)std::chrono::duration_cast<std::chrono::nanoseconds>(finish-start).count();
peek_time_total += total_ns;
}
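// RAII timer: when ck is set, the wall-clock time between construction and
// destruction is added to profiler.relay_extra_cost, so it can later be
// subtracted from the relayed fused op's measurement.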
struct RecordExtraCost {
int ck;
std::chrono::high_resolution_clock::time_point start;
RecordExtraCost(int ck) : ck(ck) {
if (!ck) return;
start = std::chrono::high_resolution_clock::now();
}
~RecordExtraCost() {
if (!ck) return;
auto finish = std::chrono::high_resolution_clock::now();
auto total_ns = (int64_t)std::chrono::duration_cast<std::chrono::nanoseconds>(finish-start).count();
profiler.relay_extra_cost += total_ns;
}
};
void Profiler::record_and_run(
jit_op_entry_t jit_entry,
Op* op,
@ -151,6 +210,9 @@ void Profiler::record_and_run(
jit_key : ikey->second.c_str();
auto iter = profiler.records.find(key);
uint64_t in, out, compute;
if (profiler.relay_fop)
profiler.relay_fop->statistics(in, out, compute);
else
op->statistics(in, out, compute);
if (iter == profiler.records.end()) {
profiler.records[key] = Info{
@ -165,7 +227,7 @@ void Profiler::record_and_run(
bool is_fused = op->name() == string("fused");
uint64* shape_time = nullptr;
if (trace_py_var) {
if (trace_py_var || profiler_record_shape) {
// record shape
NanoVector shape;
int64 num = 0;
@ -193,41 +255,62 @@ void Profiler::record_and_run(
iter->second.shapes[shape].second += 1;
shape_time = &iter->second.shapes[shape].first;
}
int loop = (is_fused &&
((FusedOp*)op)->get_loop_option("insert_profile_loop")) ? 10 : 0;
int64_t warmup = profiler.warmup ? std::max(profiler.warmup>>loop, (int64_t)1) : 0;
int64_t rerun = std::max((profiler.rerun+1)>>loop, (int64_t)1);
// prevent relayed op being rerun
auto warmup_bk = profiler.warmup;
auto rerun_bk = profiler.rerun;
profiler.warmup = profiler.rerun = 0;
Deleter del([&]() {
profiler.warmup = warmup_bk;
profiler.rerun = rerun_bk;
});
int64_t warmup = profiler.warmup;
int64_t rerun = profiler.rerun + 1;
rerun = std::max(NanoVector::get_nbits(rerun) - 2, 0);
int loop = 0;
Deleter _d;
if (is_fused) {
auto fop = ((FusedOp*)op);
if (fop->context && fop->context->vrm.relay_groups.size()) {
// relay op
loop = rerun;
profiler.relay_extra_cost = 0;
profiler.relay_fop = fop;
_d.del = [&]() {
profiler.relay_extra_cost = 0;
profiler.relay_fop = 0;
};
} else
loop = fop->get_loop_option("insert_profile_loop") ? 10 : 0;
}
int64 num = 1<<(rerun - loop);
{
profiler_enable = 0;
Deleter del([&]() { profiler_enable = 1;});
RecordExtraCost rec(profiler.relay_fop && profiler.relay_fop != op);
for (int64_t i=0; i<warmup; i++) {
jit_entry(op);
}
#ifdef HAS_CUDA
if (use_cuda)
checkCudaErrors(cudaDeviceSynchronize());
#endif
}
for (int64_t i=0; i<rerun; i++) {
auto start = std::chrono::high_resolution_clock::now();
for (int64_t i=0; i<num; i++) {
jit_entry(op);
}
#ifdef HAS_CUDA
if (use_cuda)
checkCudaErrors(cudaDeviceSynchronize());
#endif
auto finish = std::chrono::high_resolution_clock::now();
auto total_ns = (int64_t)std::chrono::duration_cast<std::chrono::nanoseconds>(finish-start).count();
if (profiler.relay_fop == op) {
total_ns -= profiler.relay_extra_cost;
}
// 24ns function call overhead
total_ns = std::max((int64_t)1, total_ns-24);
iter->second.update(loop, total_ns, in, out, compute);
iter->second.update(rerun, total_ns, in, out, compute);
if (shape_time) shape_time[0] += total_ns;
RecordExtraCost rec(profiler.relay_fop && profiler.relay_fop != op);
if (profiler_record_peek)
stat_peek_bandwidth(in, out, rerun, iter->second.peek_time_total);
LOGvvvv << "Duration" << total_ns >> "ns running" << op;
}
if (is_fused &&
((FusedOp*)op)->get_loop_option("check_cache")) {
auto fname = Op::get_filename_from_jit_key(key, ".so");
@ -239,6 +322,8 @@ void Profiler::record_and_run(
vector<vector<string>> Profiler::report(const string& sort_key) {
vector<vector<string>> rep = {{"Name", "FileName", "Count", "TotalTime", "AvgTime", "MinTime", "MaxTime", "Input", "Output", "InOut", "Compute"}};
if (profiler_record_peek)
rep[0].push_back("Peek");
vector<string> names, fnames;
vector<vector<double>> info;
vector<int> order;
@ -295,6 +380,10 @@ vector<vector<string>> Profiler::report(const string& sort_key) {
(double)(kinfo.in_total+kinfo.out_total)*1e9 / kinfo.time_total, // InOut
(double)kinfo.compute_total*1e9 / kinfo.time_total, // Compute
});
if (profiler_record_peek)
info.back().push_back(
(double)(kinfo.in_total+kinfo.out_total)*1e9 / kinfo.peek_time_total // Peek
);
}
if (sort_key_id>=2)
std::sort(order.begin(), order.end(), [&](int i, int j) {
@ -363,7 +452,7 @@ vector<vector<string>> Profiler::report(const string& sort_key) {
<< std::setw(3)
<< std::setprecision(p) << cum_time / total_time * 100 << "%)";
}
} else if (j<=7) {
} else if (j<=7 || j==9) {
// output throughput
output_float(" KMG", 1024, "B/s", k);
} else {

View File

@ -24,6 +24,8 @@ struct Profiler {
uint64_t in_total, out_total;
// compute throughput in ops
uint64_t compute_total;
// time of an equivalent memcpy, used to estimate peak bandwidth
uint64_t peek_time_total;
// cache test info
unique_ptr<CacheInfo> cache_info;
cstr stack_info;
@ -56,6 +58,9 @@ struct Profiler {
int64_t warmup=0, rerun=0;
unordered_map<string, Info> records;
int64 relay_extra_cost;
FusedOp* relay_fop;
~Profiler();
};

View File

@ -267,7 +267,7 @@ void TraceData::release_node(Node* node) {
return;
auto node_id = iter->second;
id_map.erase(node);
if (trace_py_var < 2) {
if (trace_py_var < 2 || execute_op_info.size() > 100000) {
node_data.erase(node_id);
}
}
@ -312,6 +312,7 @@ void TraceData::record_op(Op* op) {
}
void TraceData::record_execution(Op* op, bool is_fused_op, JK& jk) {
if (execute_op_info.size() > 100000) return;
ExecuteOpInfo& einfo = execute_op_info[execute_op_info_cnt++];
if (is_fused_op) {
FusedOp* fop = (FusedOp*)op;

View File

@ -21,7 +21,9 @@ NanoString npy2ns[] = {
ns_int64, ns_uint64,
ns_float32, ns_float64, ns_float64,
ns_void, ns_void, ns_void,
ns_void
ns_void, // 17
ns_void, ns_void, ns_void, ns_void, ns_void, // 22
ns_float16, // 23
};
NPY_TYPES ns2npy[] = {
@ -34,7 +36,7 @@ NPY_TYPES ns2npy[] = {
NPY_BYTE, NPY_SHORT, NPY_INT, NPY_LONGLONG,
NPY_UBYTE, NPY_USHORT, NPY_UINT, NPY_ULONGLONG,
#endif
NPY_FLOAT, NPY_DOUBLE
NPY_HALF, NPY_FLOAT, NPY_DOUBLE
};
void** PyArray_API;

View File

@ -48,6 +48,8 @@ enum NPY_TYPES {
NPY_FLOAT, NPY_DOUBLE, NPY_LONGDOUBLE,
NPY_CFLOAT, NPY_CDOUBLE, NPY_CLONGDOUBLE,
NPY_OBJECT=17,
NPY_HALF=23,
NPY_END=24,
};
EXTERN_LIB NanoString npy2ns[];
@ -60,11 +62,11 @@ EXTERN_LIB NPY_TYPES ns2npy[];
inline bool is_c_style(PyArray_Proxy* obj) { return obj->flags & 1; }
inline NanoString get_type_str(PyArray_Proxy* obj) {
NanoString type = ns_void;
if (obj->descr->type_num < NPY_OBJECT)
if (obj->descr->type_num < NPY_END)
type = npy2ns[obj->descr->type_num];
CHECK(type != ns_void) << "Numpy type not supported, type_num:"
<< obj->descr->type_num
<< "type_char:" << obj->descr->type;
<< "type_char:" << obj->descr->type << NPY_END << npy2ns[obj->descr->type_num];
return type;
}

View File

@ -141,7 +141,7 @@ ArrayOp::ArrayOp(PyObject* obj) {
} else {
// this is non-continue numpy array
#if defined(__linux__) || defined(_WIN32)
STACK_ALLOC(int64, dims, args.shape.size());
STACK_ALLOC(int64_t, dims, args.shape.size());
#elif defined(__APPLE__)
long dims[args.shape.size()];
#endif

View File

@ -15,7 +15,7 @@
#include "misc/nano_string.h"
#include "misc/fast_shared_ptr.h"
#include "profiler/simple_profiler.h"
#ifdef HAS_CUDA
#ifdef IS_CUDA
#include "misc/cuda_flags.h"
#endif
@ -274,7 +274,7 @@ DEF_IS(ArrayArgs, bool) is_type(PyObject* obj) {
DEF_IS(ArrayArgs, PyObject*) to_py_object(const T& a) {
#if defined(__linux__) || defined(_WIN32)
STACK_ALLOC(int64, dims, a.shape.size());
STACK_ALLOC(int64_t, dims, a.shape.size());
#elif defined(__APPLE__)
long dims[a.shape.size()];
#endif
@ -390,7 +390,7 @@ DEF_IS(VarHolder*, T) from_py_object(PyObject* obj, unique_ptr<VarHolder>& holde
struct DataView;
DEF_IS(DataView, PyObject*) to_py_object(T a) {
#if defined(__linux__) || defined(_WIN32)
STACK_ALLOC(int64, dims, a.shape.size());
STACK_ALLOC(int64_t, dims, a.shape.size());
#elif defined(__APPLE__)
long dims[a.shape.size()];
#endif
@ -652,7 +652,7 @@ DEF_IS(NumpyFunc, T) from_py_object(PyObject* obj) {
[obj](typename T::R* result) {
// import numpy
string npstr="numpy";
#ifdef HAS_CUDA
#ifdef IS_CUDA
if (use_cuda) npstr="cupy";
#endif
@ -669,7 +669,7 @@ DEF_IS(NumpyFunc, T) from_py_object(PyObject* obj) {
PyTuple_SET_ITEM(args.obj, 0, np.release());
PyTuple_SET_ITEM(args.obj, 1, data.release());
#ifdef HAS_CUDA
#ifdef IS_CUDA
if (npstr=="cupy") {
PyObjHolder jt(PyImport_ImportModule("jittor"));
PyObjHolder pFunc(PyObject_GetAttrString(jt.obj,"numpy2cupy"));

View File

@ -110,7 +110,7 @@ static void push_py_object(RingBuffer* rb, PyObject* obj, uint64& __restrict__ o
rb->push(size, offset);
args.ptr = rb->get_ptr(size, offset);
#if defined(__linux__) || defined(_WIN32)
STACK_ALLOC(int64, dims, args.shape.size());
STACK_ALLOC(int64_t, dims, args.shape.size());
#elif defined(__APPLE__)
long dims[args.shape.size()];
#endif

View File

@ -0,0 +1,165 @@
// ***************************************************************
// Copyright (c) 2021 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#include "common.h"
#include "utils/str_utils.h"
#include "ops/op_register.h"
namespace jittor {
extern int use_cuda;
unordered_map<string,string> common_op_type_cuda_map = {
{"logical_not", "(!($2))"},
{"bitwise_not", "(~($2))"},
{"negative", "(-($2))"},
{"abs", "::abs($2)"},
{"log", "::logf(($1)($2))"},
{"exp", "::expf(($1)($2))"},
{"sqrt", "::sqrtf(($1)($2))"},
{"round", "(($1) ::roundf(($2)))"},
{"floor", "(($1) ::floorf(($2)))"},
{"ceil", "(($1) ::ceilf(($2)))"},
{"round_int", "(($1) ::roundf(($2)))"},
{"floor_int", "(($1) ::floorf(($2)))"},
{"ceil_int", "(($1) ::ceilf(($2)))"},
{"sin", "(($1) ::sinf(($2)))"},
{"asin", "(($1) ::asinf(($2)))"},
{"sinh", "(($1) ::sinhf(($2)))"},
{"asinh", "(($1) ::asinhf(($2)))"},
{"cos", "(($1) ::cosf(($2)))"},
{"acos", "(($1) ::acosf(($2)))"},
{"cosh", "(($1) ::coshf(($2)))"},
{"acosh", "(($1) ::acoshf(($2)))"},
{"tan", "(($1) ::tanf(($2)))"},
{"atan", "(($1) ::atanf(($2)))"},
{"tanh", "(($1) ::tanhf(($2)))"},
{"atanh", "(($1) ::atanhf(($2)))"},
{"sigmoid", "(($1) (1.0f/(1.0f+::expf((::min($1(-($2)), $1(@if(@strcmp($1,float32)==0,30,300))))))))"},
{"erf", "(($1) ::erff(($2)))"},
{"erfinv", "(($1) ::erfinvf(($1)($2)))"},
{"cast", "(($1)($2))"},
{"pow", "::pow(($2),($4))"},
{"maximum", "::max($1($2), $1($4))"},
{"minimum", "::min($1($2), $1($4))"},
{"mod", "@if(@strcmp($1,float32)==0,(($2)-::floorf(($2)/($4))*($4)),@if(@strcmp(@Tx,float64)==0,(($2)-::floor(($2)/($4))*($4)),(($2)%($4))))"},
{"init_maximum", "::numeric_min<$1>()"},
{"init_minimum", "::numeric_max<$1>()"},
};
struct CommonOpType : OpByType {
CommonOpType() {
types = {
"bool",
"int8",
"int16",
"int32",
"int64",
"uint8",
"uint16",
"uint32",
"uint64",
"float32",
"float64",
};
}
string expand_op(const vector<string>& args) {
for (int i=1; i<args.size(); i+=2) {
if (!types.count(args[i]))
return "";
}
auto& cuda_map = common_op_type_cuda_map;
static unordered_map<string,string> cpu_map = {
{"logical_not", "(!($2))"},
{"bitwise_not", "(~($2))"},
{"negative", "(-($2))"},
{"abs", "std::abs($2)"},
{"log", "std::log(($1)($2))"},
{"exp", "std::exp(($1)($2))"},
{"sqrt", "std::sqrt(($1)($2))"},
{"round", "(($1)std::round(($2)))"},
{"floor", "(($1)std::floor(($2)))"},
{"ceil", "(($1)std::ceil(($2)))"},
{"round_int", "(($1)std::round(($2)))"},
{"floor_int", "(($1)std::floor(($2)))"},
{"ceil_int", "(($1)std::ceil(($2)))"},
{"sin", "(($1) std::sin(($2)))"},
{"asin", "(($1) std::asin(($2)))"},
{"sinh", "(($1) std::sinh(($2)))"},
{"asinh", "(($1) std::asinh(($2)))"},
{"cos", "(($1) std::cos(($2)))"},
{"acos", "(($1) std::acos(($2)))"},
{"cosh", "(($1) std::cosh(($2)))"},
{"acosh", "(($1) std::acosh(($2)))"},
{"tan", "(($1) std::tan(($2)))"},
{"atan", "(($1) std::atan(($2)))"},
{"tanh", "(($1) std::tanh(($2)))"},
{"atanh", "(($1) std::atanh(($2)))"},
{"sigmoid", "(($1) (1.0f/(1.0f+std::exp(std::min($1(-($2)), $1(@if(@strcmp($1,float32)==0,30,300)))))))"},
{"erf", "(($1) std::erf(($2)))"},
{"erfinv", "(jittor::_erfinv($2))"},
{"cast", "(($1)($2))"},
{"pow", "std::pow(($2),($4))"},
{"maximum", "std::max($1($2), $1($4))"},
{"minimum", "std::min($1($2), $1($4))"},
{"mod", "@if(@strcmp($1,float32)==0,(($2)-std::floor(($2)/($4))*($4)),@if(@strcmp(@Tx,float64)==0,(($2)-std::floor(($2)/($4))*($4)),(($2)%($4))))"},
{"init_maximum", "std::numeric_limits<$1>::lowest()"},
{"init_minimum", "std::numeric_limits<$1>::max()"},
};
static unordered_map<string,string> both_map {
{"add", "(($2)+($4))"},
{"subtract", "(($2)-($4))"},
{"multiply", "(($2)*($4))"},
{"divide", "($1(($1($2))/($1($4))))"},
{"floor_divide", "($1(($1($2))/($1($4))))"},
{"less", "(($2)<($4))"},
{"less_equal", "(($2)<=($4))"},
{"greater", "(($2)>($4))"},
{"greater_equal", "(($2)>=($4))"},
{"equal", "(($2)==($4))"},
{"not_equal", "(($2)!=($4))"},
{"left_shift", "(($2)<<($4))"},
{"right_shift", "(($2)>>($4))"},
{"logical_and", "(($2)&&($4))"},
{"logical_or", "(($2)||($4))"},
{"logical_xor", "((bool($2))!=(bool($4)))"},
{"bitwise_and", "(($2)&($4))"},
{"bitwise_or", "(($2)|($4))"},
{"bitwise_xor", "(($2)^($4))"},
{"mean", "(($2)+$1($4)*($1(rcount)))"},
{"init_add", "$1(0)"},
{"init_multiply", "$1(1)"},
{"init_logical_and", "true"},
{"init_logical_or", "false"},
{"init_logical_xor", "false"},
{"init_bitwise_and", "$1(-1)"},
{"init_bitwise_or", "$1(0)"},
{"init_bitwise_xor", "$1(0)"},
{"init_mean", "$1(0)"},
};
string ret;
if (both_map.count(args.at(0)))
ret = both_map[args.at(0)];
else if (use_cuda)
ret = cuda_map[args.at(0)];
else
ret = cpu_map[args.at(0)];
return format(ret, args);
}
void post_pass(OpCompiler*) {
return;
}
};
static int _ = registe_op_type(new CommonOpType());
}
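A hedged trace (not part of the commit) of what expand_op above yields for a CPU "maximum" on float32 operands, reproduced with the format() helper it calls:

#include "utils/str_utils.h"
namespace jittor {
static string demo_expand_maximum() {
    // argument layout: {op, $1 = result type, $2 = lhs, $3 = lhs type, $4 = rhs, $5 = rhs type}
    vector<string> args = {"maximum", "float32", "xp[i]", "float32", "yp[i]", "float32"};
    return format("std::max($1($2), $1($4))", args);
    // -> "std::max(float32(xp[i]), float32(yp[i]))"
}
} // jittor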

View File

@ -0,0 +1,164 @@
// ***************************************************************
// Copyright (c) 2021 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#pragma once
#include "common.h"
#ifdef JIT_cuda
#include <driver_types.h>
#include <cuda_fp16.h>
namespace jittor {
typedef __half float16;
#if CUDA_ARCH >= 800
inline __device__ float16 max(float16 a, float16 b) { return __hmax(a, b); }
inline __device__ float16 min(float16 a, float16 b) { return __hmin(a, b); }
#else
inline __device__ float16 max(float16 a, float16 b) { return a<b?b:a; }
inline __device__ float16 min(float16 a, float16 b) { return a<b?a:b; }
#endif
inline __device__ float16 pow(float16 a, float16 b) { return ::pow(float32(a), float32(b)); }
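// Compile-time vectorized copy: moves nbyte bytes from b to a using the widest
// available loads (16/8/4/2/1 bytes), recursing on the remainder.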
template<int nbyte, class T>
__device__ inline void vload(T* __restrict__ a, T* __restrict__ b) {
if constexpr (nbyte<=0) return;
if constexpr (nbyte>=16) {
auto __restrict__ aa = (float4* __restrict__)a;
auto __restrict__ bb = (float4* __restrict__)b;
aa[0] = bb[0];
return vload<nbyte-16>(aa+1, bb+1);
}
if constexpr (nbyte>=8) {
auto __restrict__ aa = (float2* __restrict__)a;
auto __restrict__ bb = (float2* __restrict__)b;
aa[0] = bb[0];
return vload<nbyte-8>(aa+1, bb+1);
}
if constexpr (nbyte>=4) {
auto __restrict__ aa = (float* __restrict__)a;
auto __restrict__ bb = (float* __restrict__)b;
aa[0] = bb[0];
return vload<nbyte-4>(aa+1, bb+1);
}
if constexpr (nbyte>=2) {
auto __restrict__ aa = (__half* __restrict__)a;
auto __restrict__ bb = (__half* __restrict__)b;
aa[0] = bb[0];
return vload<nbyte-2>(aa+1, bb+1);
}
if constexpr (nbyte>=1) {
auto __restrict__ aa = (int8_t* __restrict__)a;
auto __restrict__ bb = (int8_t* __restrict__)b;
aa[0] = bb[0];
return vload<nbyte-1>(aa+1, bb+1);
}
}
}
using jittor::max;
using jittor::min;
using jittor::pow;
#else
namespace jittor {
struct float16 {
uint16 x;
inline float16(float32 f) {
unsigned x = *((int*)(void*)(&f));
unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
unsigned sign, exponent, mantissa;
// Get rid of +NaN/-NaN case first.
if (u > 0x7f800000) {
this->x = 0x7fffU;
return;
}
sign = ((x >> 16) & 0x8000);
// Get rid of +Inf/-Inf, +0/-0.
if (u > 0x477fefff) {
this->x = sign | 0x7c00U;
return;
}
if (u < 0x33000001) {
this->x = sign | 0x0000U;
return;
}
exponent = ((u >> 23) & 0xff);
mantissa = (u & 0x7fffff);
if (exponent > 0x70) {
shift = 13;
exponent -= 0x70;
} else {
shift = 0x7e - exponent;
exponent = 0;
mantissa |= 0x800000;
}
lsb = (1 << shift);
lsb_s1 = (lsb >> 1);
lsb_m1 = (lsb - 1);
// Round to nearest even.
remainder = (mantissa & lsb_m1);
mantissa >>= shift;
if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
++mantissa;
if (!(mantissa & 0x3ff)) {
++exponent;
mantissa = 0;
}
}
this->x = (sign | (exponent << 10) | mantissa);
}
inline operator float() {
unsigned sign = ((x >> 15) & 1);
unsigned exponent = ((x >> 10) & 0x1f);
unsigned mantissa = ((x & 0x3ff) << 13);
if (exponent == 0x1f) { /* NaN or Inf */
mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
exponent = 0xff;
} else if (!exponent) { /* Denorm or Zero */
if (mantissa) {
unsigned int msb;
exponent = 0x71;
do {
msb = (mantissa & 0x400000);
mantissa <<= 1; /* normalize */
--exponent;
} while (!msb);
mantissa &= 0x7fffff; /* 1.mantissa is implicit */
}
} else {
exponent += 0x70;
}
int temp = ((sign << 31) | (exponent << 23) | mantissa);
return reinterpret_cast<float&>(temp);
}
};
}
#endif
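A hedged host-side sketch (assumes jittor's include path is set up) of round-tripping a value through the software float16 above; under JIT_cuda the __half path is used instead:

#include "type/fp16_compute.h"
#include <cstdio>
int main() {
    jittor::float16 h(3.14159f);  // float32 -> half: 10-bit mantissa, round to nearest even
    float back = float(h);        // half -> float32
    std::printf("%f\n", back);    // prints 3.140625
    return 0;
}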

View File

@ -0,0 +1,188 @@
// ***************************************************************
// Copyright (c) 2021 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#include "common.h"
#include "utils/str_utils.h"
#include "ops/op_register.h"
#include "op_compiler.h"
namespace jittor {
extern int use_cuda;
extern unordered_map<string,string> common_op_type_cuda_map;
static bool isvar(char x) { return isalnum(x) || x == '_' || x == ':'; }
struct FP16OpType : OpByType {
FP16OpType() {
types = {
"float16",
};
}
string expand_op(const vector<string>& args) {
bool found_fp16 = 0;
for (int i=1; i<args.size(); i+=2) {
if (types.count(args[i]))
found_fp16 = 1;
}
if (!found_fp16) return "";
static unordered_map<string,string> cuda_map = {
{"logical_not", "(!($2))"},
{"bitwise_not", "(~($2))"},
{"negative", "(-($2))"},
{"abs", "::abs($2)"},
{"log", "::hlog(($1)($2))"},
{"exp", "::hexp(($1)($2))"},
{"sqrt", "::hsqrt(($1)($2))"},
{"round", "(($1) ::roundf(($2)))"},
{"floor", "(($1) ::floorf(($2)))"},
{"ceil", "(($1) ::ceilf(($2)))"},
{"round_int", "(($1) ::roundf(($2)))"},
{"floor_int", "(($1) ::floorf(($2)))"},
{"ceil_int", "(($1) ::ceilf(($2)))"},
{"sin", "(($1) ::sinf(($2)))"},
{"asin", "(($1) ::asinf(($2)))"},
{"sinh", "(($1) ::sinhf(($2)))"},
{"asinh", "(($1) ::asinhf(($2)))"},
{"cos", "(($1) ::cosf(($2)))"},
{"acos", "(($1) ::acosf(($2)))"},
{"cosh", "(($1) ::coshf(($2)))"},
{"acosh", "(($1) ::acoshf(($2)))"},
{"tan", "(($1) ::tanf(($2)))"},
{"atan", "(($1) ::atanf(($2)))"},
{"tanh", "(($1) ::tanhf(($2)))"},
{"atanh", "(($1) ::atanhf(($2)))"},
{"sigmoid", "(($1) (1.0f/(1.0f+::expf((::min($1(-($2)), $1(@if(@strcmp($1,float16)==0,30,300))))))))"},
{"erf", "(($1) ::erff(($2)))"},
{"erfinv", "(($1) ::erfinvf(($1)($2)))"},
{"cast", "(($1)($2))"},
{"pow", "::pow(($2),($4))"},
{"maximum", "::max($1($2), $1($4))"},
{"minimum", "::min($1($2), $1($4))"},
{"mod", "$1(($2)-::hfloor(($2)/($4))*($4))"},
{"init_maximum", "-32768.0f"},
{"init_minimum", "32768.0f"},
};
static unordered_map<string,string> cpu_map = {
{"logical_not", "(!($2))"},
{"bitwise_not", "(~($2))"},
{"negative", "(-($2))"},
{"abs", "std::abs($2)"},
{"log", "std::log(($1)($2))"},
{"exp", "std::exp(($1)($2))"},
{"sqrt", "std::sqrt(($1)($2))"},
{"round", "(($1)std::round(($2)))"},
{"floor", "(($1)std::floor(($2)))"},
{"ceil", "(($1)std::ceil(($2)))"},
{"round_int", "(($1)std::round(($2)))"},
{"floor_int", "(($1)std::floor(($2)))"},
{"ceil_int", "(($1)std::ceil(($2)))"},
{"sin", "(($1) std::sin(($2)))"},
{"asin", "(($1) std::asin(($2)))"},
{"sinh", "(($1) std::sinh(($2)))"},
{"asinh", "(($1) std::asinh(($2)))"},
{"cos", "(($1) std::cos(($2)))"},
{"acos", "(($1) std::acos(($2)))"},
{"cosh", "(($1) std::cosh(($2)))"},
{"acosh", "(($1) std::acosh(($2)))"},
{"tan", "(($1) std::tan(($2)))"},
{"atan", "(($1) std::atan(($2)))"},
{"tanh", "(($1) std::tanh(($2)))"},
{"atanh", "(($1) std::atanh(($2)))"},
{"sigmoid", "(($1) (1.0f/(1.0f+std::exp(std::min($1(-($2)), $1(@if(@strcmp($1,float32)==0,30,300)))))))"},
{"erf", "(($1) std::erf(($2)))"},
{"erfinv", "(jittor::_erfinv($2))"},
{"cast", "(($1)($2))"},
{"pow", "std::pow(($2),($4))"},
{"maximum", "std::max($1($2), $1($4))"},
{"minimum", "std::min($1($2), $1($4))"},
{"mod", "$1(($2)-std::floor(($2)/($4))*($4))"},
{"init_maximum", "-32768.0f"},
{"init_minimum", "32768.0f"},
};
static unordered_map<string,string> both_map {
{"add", "(($2)+($4))"},
{"subtract", "(($2)-($4))"},
{"multiply", "(($2)*($4))"},
{"divide", "($1(($1($2))/($1($4))))"},
{"floor_divide", "($1(($1($2))/($1($4))))"},
{"less", "(($2)<($4))"},
{"less_equal", "(($2)<=($4))"},
{"greater", "(($2)>($4))"},
{"greater_equal", "(($2)>=($4))"},
{"equal", "(($2)==($4))"},
{"not_equal", "(($2)!=($4))"},
{"left_shift", "(($2)<<($4))"},
{"right_shift", "(($2)>>($4))"},
{"logical_and", "(($2)&&($4))"},
{"logical_or", "(($2)||($4))"},
{"logical_xor", "((bool($2))!=(bool($4)))"},
{"bitwise_and", "(($2)&($4))"},
{"bitwise_or", "(($2)|($4))"},
{"bitwise_xor", "(($2)^($4))"},
{"mean", "(($2)+($4)*($1(rcount)))"},
{"init_add", "$1(0)"},
{"init_multiply", "$1(1)"},
{"init_logical_and", "true"},
{"init_logical_or", "false"},
{"init_logical_xor", "false"},
{"init_bitwise_and", "$1(-1)"},
{"init_bitwise_or", "$1(0)"},
{"init_bitwise_xor", "$1(0)"},
{"init_mean", "$1(0)"},
};
string ret;
if (both_map.count(args.at(0)))
ret = both_map[args.at(0)];
else if (use_cuda)
ret = cuda_map[args.at(0)];
else
ret = cpu_map[args.at(0)];
if (use_cuda) {
if (args[1] == "float32" && !both_map.count(args.at(0))) {
ret = common_op_type_cuda_map[args.at(0)];
}
if (args[1] == "float16" || args[1] == "float32") {
for (int i=3; i<args.size(); i+=2) {
if (args[i] != args[1]) {
ret = replace(ret, "$"+S(i-1),
args[1]+"($"+S(i-1)+")");
}
}
} else {
for (int i=3; i<args.size(); i+=2) {
if (args[i] != "float16") {
ret = replace(ret, "$"+S(i-1),
"float16($"+S(i-1)+")");
}
}
}
}
return format(ret, args);
}
void post_pass(OpCompiler* oc) {
string& src = oc->src;
if (src.find("float16") == string::npos)
return;
int i = src.rfind("#include");
if (i<0) i=0;
i = src.find('\n', i) + 1;
src = src.substr(0, i) + "#include \"type/fp16_compute.h\"\n" +
src.substr(i);
return;
}
};
static int _ = registe_op_type(new FP16OpType());
}
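A hedged trace (not part of the commit) of the cast insertion above for a mixed float16/float32 add on CUDA, reproduced with the same str_utils helpers FP16OpType calls:

#include "utils/str_utils.h"
namespace jittor {
static string demo_fp16_mixed_add() {
    vector<string> args = {"add", "float16", "xp[i]", "float16", "yp[i]", "float32"};
    string pat = "(($2)+($4))";               // both_map["add"]
    pat = replace(pat, "$4", "float16($4)");  // rhs is float32, so cast it down to half
    return format(pat, args);
    // -> "((xp[i])+(float16(yp[i])))"
}
} // jittor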

View File

@ -18,7 +18,7 @@ namespace jittor {
typedef int8_t int8;
typedef int16_t int16;
typedef int int32;
typedef int64_t int64;
typedef long long int64;
typedef uint8_t uint8;
typedef uint16_t uint16;
typedef uint32_t uint32;
@ -239,4 +239,6 @@ std::ostream& operator<<(std::ostream& os, const Caster<T,To>& input) {
return os << ']';
}
#define JPU(x) ;
} // jittor

View File

@ -167,14 +167,14 @@ void process(string src, vector<string>& input_names, string& cmd) {
// #include "a.h"
// i jk l
auto j=i+1;
while (j<src.size() && (src[j] != ' ' && src[j] != '\n')) j++;
while (j<src.size() && (src[j] != ' ' && src[j] != '\n' && src[j] != '\r')) j++;
if (j>=src.size()) return;
if (j-i != 8 && j-i != 6) continue;
auto k=j+1;
while (k<src.size() && src[k] == ' ') k++;
if (k>=src.size()) return;
auto l=k+1;
while (l<src.size() && (src[l] != ' ' && src[l] != '\n')) l++;
while (l<src.size() && (src[l] != ' ' && src[l] != '\n' && src[l] != '\r')) l++;
if (src[k] == '"' && src[l-1] == '"' && j-i==8 && src.substr(i,j-i) == "#include") {
auto inc = src.substr(k+1, l-k-2);
if (inc != "test.h" && inc != "helper_cuda.h") {

View File

@ -47,4 +47,151 @@ string strip(const string& s) {
return s.substr(i,j-i);
}
string format(const string& s, const vector<string>& v) {
string ss;
for (int i=0; i<s.size(); i++) {
if (s[i] == '$') {
int j = s[i+1] - '0';
ss += v.at(j);
i ++;
continue;
} else
ss += s[i];
}
return ss;
}
string join(const vector<string>& vs, const string& x) {
string s;
for (int i=0; i<vs.size(); i++) {
s += vs[i];
if (i!=vs.size()-1)
s += x;
}
return s;
}
string replace(const string& a, const string& b, const string& c) {
auto vs = split(a, b);
return join(vs, c);
}
static inline bool isvar(char x) { return isalnum(x) || x == '_' || x == ':'; }
vector<string> token_split(const string& s) {
vector<string> ss;
if (!s.size()) return ss;
ss.push_back(string()+s[0]);
for (int i=1; i<s.size(); i++) {
if (isvar(s[i]) != isvar(s[i-1]))
ss.push_back("");
ss.back() += s[i];
}
return ss;
}
static void parse_reg(const string& src,
vector<string>& patterns,
vector<int>& arg_id) {
patterns.clear();
arg_id.clear();
patterns.push_back("");
for (int j=0; j<src.size(); j++) {
if (src[j] == '$') {
j++;
arg_id.push_back(src[j]-'0');
patterns.push_back("");
continue;
}
patterns.back() += src[j];
}
}
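// Replace the first occurrence of pattern src (with $N capture groups) by dst,
// matching across token boundaries starting at tokens[i].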
void token_replace(vector<string>& tokens, int i, const string& src, const string& dst) {
ASSERT(src.at(0) != '$' && src.at(src.size()-1) != '$' &&
src.at(src.size()-2) != '$') << "illegal src:" << src;
vector<string> patterns;
vector<int> arg_id;
vector<string> patterns2;
vector<int> arg_id2;
unordered_map<int, string> args;
parse_reg(src, patterns, arg_id);
parse_reg(dst, patterns2, arg_id2);
int start_i, start_pos, end_i, end_pos;
int c_i = i, c_pos = 0;
int match_i, match_pos;
string c_arg;
auto match = [&](int c_i, int c_pos, const string& pat) -> bool {
for (int i=0; i<pat.size(); i++) {
if (tokens[c_i][c_pos] != pat[i])
return false;
c_pos ++;
if (c_pos >= tokens[c_i].size()) {
c_pos = 0;
c_i ++;
if (c_i >= tokens.size())
return false;
}
}
match_i = c_i;
match_pos = c_pos;
return true;
};
for (int j=0; j<patterns.size(); j++) {
int ok = 0;
while (c_i < tokens.size()) {
while (c_pos < tokens[c_i].size()) {
if (match(c_i, c_pos, patterns[j])) {
ok = 1;
break;
}
c_arg += tokens[c_i][c_pos];
c_pos ++;
}
if (ok) break;
c_i ++;
c_pos = 0;
}
CHECK(ok) << "Pattern not matched:" << patterns[j] << j;
if (j == 0) {
start_i = c_i;
start_pos = c_pos;
}
if (j) {
args[arg_id[j-1]] = c_arg;
}
c_arg = "";
c_i = match_i;
c_pos = match_pos;
if (j == patterns.size()-1) {
end_i = c_i;
end_pos = c_pos;
}
}
string new_src;
for (int j=0; j<patterns2.size(); j++) {
if (j) new_src += args[arg_id2.at(j-1)];
new_src += patterns2[j];
}
if (start_i == end_i) {
tokens[start_i] = tokens[start_i].substr(0, start_pos) +
new_src + tokens[start_i].substr(end_pos);
} else {
tokens[start_i] = tokens[start_i].substr(0, start_pos)
+ new_src;
tokens[end_i] = tokens[end_i].substr(end_pos);
for (int j=start_i+1; j<end_i; j++)
tokens[j] = "";
}
}
string token_replace(const string& s, const string& src, const string& dst) {
vector<string> ss{s};
token_replace(ss, 0, src, dst);
return join(ss, "");
}
} // jittor
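A hedged usage sketch of the string overload added above ($N in src captures a span that dst re-emits):

#include "utils/str_utils.h"
namespace jittor {
static string demo_token_replace() {
    return token_replace("cudaMemcpy(dst, src, n);",
                         "cudaMemcpy($1)",
                         "cudaMemcpyAsync($1, 0)");
    // -> "cudaMemcpyAsync(dst, src, n, 0);"
}
} // jittor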

View File

@ -27,4 +27,16 @@ vector<string> split(const string& s, const string& sep, int max_split=0);
string strip(const string& s);
string format(const string& s, const vector<string>& v);
string replace(const string& a, const string& b, const string& c);
string join(const vector<string>& vs, const string& x);
vector<string> token_split(const string& s);
void token_replace(vector<string>& tokens, int i, const string& src, const string& dst);
string token_replace(const string& s, const string& src, const string& dst);
} // jittor

View File

@ -14,7 +14,7 @@
namespace jittor {
int64_t Var::number_of_lived_vars = 0;
int64 Var::number_of_lived_vars = 0;
DEFINE_FLAG(fast_shared_ptr<loop_options_t>, compile_options, {},
"Override the default loop transfrom options");
@ -42,7 +42,7 @@ string Var::to_string() {
return s;
}
int64_t Var::numel() {
int64 Var::numel() {
if (!shape.size()) return size=num=-1;
bool negtive = 0;
num=1;

View File

@ -18,13 +18,13 @@ struct Var : Node {
NanoVector shape;
cstr name;
fast_shared_ptr<loop_options_t> loop_options;
static int64_t number_of_lived_vars;
static int64 number_of_lived_vars;
// this var will be generated after alloc.
void* mem_ptr = nullptr;
Allocator* allocator = nullptr;
size_t allocation;
int64_t size, num;
int64 size, num;
inline bool is_float() const { CHECK_EXIST; return ns.is_float(); }
inline int dsize() const { CHECK_EXIST; return ns.dsize(); }
inline NanoString dtype() const { CHECK_EXIST; return ns; }
@ -40,7 +40,7 @@ struct Var : Node {
Var(NanoVector shape, NanoString dtype);
string to_string();
int64_t numel();
int64 numel();
void set_shape(NanoVector shape);
bool alloc(Allocator* allocator);
inline void share_with(Var* x, size_t offset = 0) { CHECK_EXIST; allocator = (Allocator*)x; allocation = offset; }

View File

@ -15,6 +15,7 @@ if __name__ == "__main__":
skip_l = int(os.environ.get("test_skip_l", "0"))
skip_r = int(os.environ.get("test_skip_r", "1000000"))
skip = os.environ.get("test_skip", "").split(",")
test_only = None
if "test_only" in os.environ:
test_only = set(os.environ.get("test_only").split(","))
@ -34,6 +35,9 @@ if __name__ == "__main__":
continue
if test_only and test_name not in test_only:
continue
if any(s and s in test_name for s in skip):
    continue
print("Add Test", _, test_name)
suite.addTest(tests)

View File

@ -0,0 +1,374 @@
from copy import deepcopy
from pathlib import Path
import jittor as jt
import jittor.nn as nn
import numpy as np
import os
split_size = 1000000
conv_opt = int(os.environ.get("conv_opt", "0"))
if conv_opt:
Conv1d_sp = nn.Conv1d_sp
else:
Conv1d_sp = nn.Conv1d
def MLP(channels: list, do_bn=True):
""" Multi-layer perceptron """
n = len(channels)
layers = []
for i in range(1, n):
layers.append(Conv1d_sp(channels[i - 1], channels[i], kernel_size=1, bias=True))
if i < (n - 1):
if do_bn:
layers.append(nn.BatchNorm(channels[i]))
# layers.append(nn.InstanceNorm1d(channels[i]))
# layers.append(nn.LayerNorm(channels[i]))
layers.append(nn.ReLU())
return nn.Sequential(*layers)
def normalize_keypoints(kpts, image_shape):
size = image_shape.flip(1) # shape=(b,2) ;h w -> w, h
center = size / 2
scaling = size.float32().max(1, keepdims=True) * 0.7
return (kpts - center[:, None, :]) / scaling[:, None, :]
class KeypointEncoder(nn.Module):
""" Joint encoding of visual appearance and location using MLPs"""
def __init__(self, feature_dim, layers, keypoint_position_dim=2):
super().__init__()
# self.keypoint_position_dim = keypoint_position_dim
self.encoder = MLP([keypoint_position_dim + 1] + layers + [feature_dim])
nn.init.constant_(self.encoder[-1].bias, 0.0)
def execute(self, kpts, scores):
inputs = jt.concat([kpts.t(), scores.unsqueeze(1)], dim=1)
return self.encoder(inputs)
cnt = 0
def attention(query, key, value):
global cnt
cnt += 1
b, d, h, n = query.shape
# print("attention", b,d,h,n, cnt)
dim_factor = (1.0 / d)**0.5
query = query.transpose(0, 2, 3, 1).reshape(b * h, -1, d) * dim_factor
key = key.transpose(0, 2, 1, 3).reshape(b * h, d, -1)
value = value.transpose(0, 2, 3, 1).reshape(b * h, -1, d)
# print("attention", query.shape, key.shape, value.shape)
data = []
for i in range(0, query.shape[0], split_size):
end = min(i + split_size, query.shape[0])
tmp1 = nn.bmm(query[i:end], key[i:end])
tmp2 = nn.softmax(tmp1, dim=-1)
tmp3 = nn.bmm(tmp2, value[i:end])
tmp3.sync()
data.append(tmp3)
tmp3 = jt.concat(data)
# for i in range(0, query.shape[0], split_size):
# end = min(i + split_size, query.shape[0])
# tmp1 = nn.bmm(query[:,i:end], key[:,i:end])
# tmp2 = nn.softmax(tmp1, dim=-1)
# tmp3 = nn.bmm(tmp2, value[:,i:end])
# tmp3.sync()
# data.append(tmp3)
# tmp3 = jt.concat(data, dim=1)
# tmp1 = nn.bmm(query, key)
# print(tmp1.shape)
# tmp2 = nn.softmax(tmp1, dim=-1)
# print(tmp2.shape)
# tmp3 = nn.bmm(tmp2, value)
# print(tmp3.shape)
return tmp3.reshape(b, h, -1, d).transpose(0, 3, 1, 2)
return nn.bmm(nn.softmax(nn.bmm(query, key), dim=-1), value).reshape(b, h, -1, d).transpose(0, 3, 1, 2)
class MultiHeadedAttention(nn.Module):
""" Multi-head attention to increase model expressivitiy """
def __init__(self, num_heads: int, d_model: int):
super().__init__()
assert d_model % num_heads == 0
self.dim = d_model // num_heads
self.num_heads = num_heads
self.merge = Conv1d_sp(d_model, d_model, kernel_size=1)
self.proj = nn.ModuleList([deepcopy(self.merge) for _ in range(3)])
def execute(self, query, key, value):
batch_dim = query.size(0)
query, key, value = [l(x).reshape(batch_dim, self.dim, self.num_heads, -1) for l, x in zip(self.proj, (query, key, value))]
x = attention(query, key, value)
# x = attention_chunk(query, key, value)
return self.merge(x.reshape(batch_dim, self.dim * self.num_heads, -1))
class AttentionalPropagation(nn.Module):
def __init__(self, feature_dim: int, num_heads: int):
super().__init__()
self.attn = MultiHeadedAttention(num_heads, feature_dim)
self.mlp = MLP([feature_dim * 2, feature_dim * 2, feature_dim])
nn.init.constant_(self.mlp[-1].bias, 0.0)
def execute(self, x, source):
message = self.attn(x, source, source)
return self.mlp(jt.concat([x, message], dim=1))
class AttentionalGNN(nn.Module):
def __init__(self, feature_dim: int, layer_names: list):
super().__init__()
self.layers = nn.ModuleList([AttentionalPropagation(feature_dim, 4) for _ in range(len(layer_names))])
self.is_cross = [x == 'cross' for x in layer_names]
def execute(self, desc0, desc1):
for layer, is_cross in zip(self.layers, self.is_cross):
layer.attn.prob = []
if is_cross:
src0, src1 = desc1, desc0
else: # if name == 'self':
src0, src1 = desc0, desc1
# delta0, delta1 = layer(desc0, src0), layer(desc1, src1)
delta0 = layer(desc0, src0)
# print(delta0.numel()*4)
# breakpoint()
jt.sync_all()
delta1 = layer(desc1, src1)
jt.sync_all()
desc0, desc1 = (desc0 + delta0), (desc1 + delta1)
jt.sync_all()
return desc0, desc1
def log_sinkhorn_iterations(Z, log_mu, log_nu, iters: int):
""" Perform Sinkhorn Normalization in Log-space for stability"""
u, v = jt.zeros_like(log_mu), jt.zeros_like(log_nu)
for _ in range(iters):
u = log_mu - (Z + v.unsqueeze(1)).exp().sum(dim=2).log()
v = log_nu - (Z + u.unsqueeze(2)).exp().sum(dim=1).log()
return Z + u.unsqueeze(2) + v.unsqueeze(1)
def log_optimal_transport(scores, alpha, iters: int):
""" Perform Differentiable Optimal Transport in Log-space for stability"""
b, m, n = scores.shape
ms, ns = jt.float(m, requires_grad=False), jt.float(n, requires_grad=False)
bins0 = alpha.broadcast([b, m, 1])
bins1 = alpha.broadcast([b, 1, n])
alpha = alpha.broadcast([b, 1, 1])
couplings = jt.concat([jt.concat([scores, bins0], -1), jt.concat([bins1, alpha], -1)], 1)
norm = -(ms + ns).log()
log_mu = jt.concat([norm.broadcast([m]), ns.log() + norm])
log_nu = jt.concat([norm.broadcast([n]), ms.log() + norm])
log_mu, log_nu = log_mu[None].broadcast([b, m + 1]), log_nu[None].broadcast([b, n + 1])
Z = log_sinkhorn_iterations(couplings, log_mu, log_nu, iters)
Z = Z - norm # multiply probabilities by M+N
return Z
def arange_like(x, dim: int):
return jt.ones(x.shape[dim], dtype=x.dtype)[None].cumsum()[0] - 1 # traceable in 1.1
default_config = {
'descriptor_dim': 256, # SuperPoint
'weights': 'indoor',
'keypoint_encoder': [32, 64, 128, 256], # SuperPoint
'GNN_layers': ['self', 'cross'] * 9,
'sinkhorn_iterations': 100,
'match_threshold': 0.2,
}
def get_weighted_loss_batch(scores, all_matches):
matches0, matches1 = all_matches.chunk(chunks=2, dim=2)
batchIdx = jt.arange(all_matches.shape[0]).unsqueeze(1).repeat(1, all_matches.shape[1])
batchIdx, matches0, matches1 = batchIdx.view(-1), matches0.view(-1), matches1.view(-1)
valid_index0, valid_index1 = matches0 >= 0, matches1 >= 0
valid_match = jt.logical_and(valid_index0, valid_index1)
valid_unmatch = jt.logical_xor(valid_index0, valid_index1)
num_match = valid_match.sum().maximum(1e-9)
num_unmatch = valid_unmatch.sum().maximum(1e-9)
score_ = scores[batchIdx, matches0, matches1]
score_match_ = (score_*valid_match).float32().sum() / num_match
score_umatch_ = (score_*valid_unmatch).float32().sum() / num_unmatch
return -(num_unmatch * score_match_ + num_match * score_umatch_) / (num_match + num_unmatch)
# print(score_umatch_, score_match_)
# return -(score_match + score_umatch) / (num_match + num_unmatch)
score_match = scores[(batchIdx[valid_match], matches0[valid_match], matches1[valid_match])].float32().mean() if num_match > 0 else 0
score_umatch = scores[(batchIdx[valid_unmatch], matches0[valid_unmatch], matches1[valid_unmatch])].float32().mean() if num_unmatch > 0 else 0
# print(score_match, score_umatch)
return -(num_unmatch * score_match + num_match * score_umatch) / (num_match + num_unmatch)
def add_dustbin(scores, alpha):
b, m, n = scores.shape
bins0 = jt.broadcast(alpha, (b, m, 1))
bins1 = jt.broadcast(alpha, (b, 1, n))
alpha = jt.broadcast(alpha, (b, 1, 1))
couplings = jt.concat([jt.concat([scores, bins0], -1), jt.concat([bins1, alpha], -1)], 1)
return couplings
class SuperGlue(nn.Module):
def __init__(self, config):
super().__init__()
config = {**default_config, **config}
self.descriptor_dim = config['descriptor_dim']
self.keypoint_encoder = config['keypoint_encoder']
self.GNN_layers = config['GNN_layers']
self.sinkhorn_iterations = config['sinkhorn_iterations']
self.match_threshold = config['match_threshold']
self.keypoint_position_dim = config['keypoint_position_dim']
self.use_dual_softmax = config['use_dual_softmax']
self.scale = jt.float(self.descriptor_dim**-0.5).stop_grad()
# self.scale.requires_grad = False
# self.des_extend = MLP([128, 256])
self.kenc = KeypointEncoder(self.descriptor_dim, self.keypoint_encoder, keypoint_position_dim=self.keypoint_position_dim)
self.gnn = AttentionalGNN(self.descriptor_dim, self.GNN_layers)
self.final_proj = Conv1d_sp(self.descriptor_dim, self.descriptor_dim, kernel_size=1, bias=True)
self.bin_score = jt.float(1.0)
def execute(self, data):
"""Run SuperGlue on a pair of keypoints and descriptors"""
kpts0, kpts1 = data['keypoints0'], data['keypoints1']
desc0, desc1 = data['descriptors0'], data['descriptors1']
all_matches = data['all_matches']
# match_num = data['match_num']
if kpts0.shape[1] == 0 or kpts1.shape[1] == 0 or all_matches.shape[1] == 0: # no keypoints or no matches/unmatches
shape0, shape1 = kpts0.shape[:-1], kpts1.shape[:-1]
return {
'matches0': jt.ones(shape0, dtype=jt.int),
'matches1': jt.ones(shape1, dtype=jt.int),
'matching_scores0': jt.zeros(shape0, dtype=jt.float),
'matching_scores1': jt.zeros(shape1, dtype=jt.float),
'skip_train': True
}
# Keypoint normalization.
kpts0 = normalize_keypoints(kpts0, data['shape0'])
kpts1 = normalize_keypoints(kpts1, data['shape1'])
# Keypoint MLP encoder.
# desc0 = self.des_extend(desc0) + self.kenc(kpts0, data['scores0'])
# desc1 = self.des_extend(desc1) + self.kenc(kpts1, data['scores1'])
desc0 = desc0 + self.kenc(kpts0, data['scores0'])
desc1 = desc1 + self.kenc(kpts1, data['scores1'])
# Multi-layer Transformer network.
desc0, desc1 = self.gnn(desc0, desc1)
# Final MLP projection.
desc0, desc1 = self.final_proj(desc0), self.final_proj(desc1)
desc0_t = desc0.t()
losses = []
for i in range(0, desc1.shape[0], split_size):
end = min(desc1.shape[0], i + split_size)
# Compute matching descriptor distance.
scores = nn.bmm(desc0_t[i:end], desc1[i:end]) * self.scale # 457.76 MB
scores.sync()
# Run the optimal transport.
if self.use_dual_softmax:
scores = add_dustbin(scores, self.bin_score) # 458.68 MB
scores.sync()
dual_softmax0, dual_softmax1 = nn.log_softmax(scores, 1), nn.log_softmax(scores, 2)
scores = dual_softmax0 + dual_softmax1 # 458.22 MB
scores.sync()
else:
scores = log_optimal_transport(scores, self.bin_score, iters=self.config['sinkhorn_iterations'])
# loss = torch.stack([get_match_score(scores[b], all_matches[b]) for b in range(all_matches.shape[0])])
loss = get_weighted_loss_batch(scores, all_matches[i:end])
loss.sync()
losses.append(loss)
loss = jt.concat(losses)
'''
# Compute matching descriptor distance.
scores = nn.bmm(desc0.t(), desc1) * self.scale # 457.76 MB
scores.sync()
# Run the optimal transport.
if self.use_dual_softmax:
scores = add_dustbin(scores, self.bin_score) # 458.68 MB
scores.sync()
dual_softmax0, dual_softmax1 = nn.log_softmax(scores, 1), nn.log_softmax(scores, 2)
scores = dual_softmax0 + dual_softmax1 # 458.22 MB
scores.sync()
else:
scores = log_optimal_transport(scores, self.bin_score, iters=self.config['sinkhorn_iterations'])
# loss = torch.stack([get_match_score(scores[b], all_matches[b]) for b in range(all_matches.shape[0])])
loss = get_weighted_loss_batch(scores, all_matches)
# print(scores.shape, all_matches.shape, loss.shape)
'''
# matches0, matches1 = all_matches.chunk(chunks=2, dim=2)
# batchIdx = jt.arange(0, b).unsqueeze(1).repeat(1, num)
# batchIdx, matches0, matches1 = batchIdx.view(-1), matches0.view(-1), matches1.view(-1)
# validmatch = (matches0 >= 0) | (matches1 >= 0)
# batchIdx, matches0, matches1 = batchIdx[validmatch], matches0[validmatch], matches1[validmatch]
# matches0[matches0 == -1] = n
# matches1[matches1 == -1] = m
# loss_mean = -scores[(batchIdx, matches0, matches1)].mean()
# loss_mean = nn.l1_loss(loss_mean, jt.float(0.0))
if not data['return_match']:
return {'loss': loss}
with jt.no_grad():
b, n, m = scores.shape
# Get the matches with score above "match_threshold".
indices0, max0 = scores[:, :-1, :-1].argmax(2)
indices1, max1 = scores[:, :-1, :-1].argmax(1)
mutual0 = jt.arange(0, n)[None] == indices1.gather(1, indices0)
mutual1 = jt.arange(0, m)[None] == indices0.gather(1, indices1)
# zero = scores.new_tensor(0)
# mscores0 = torch.where(mutual0, max0.values.exp(), zero)
mscores0 = max0.exp()
mscores0[mutual0.logical_not()] = 0
# mscores1 = torch.where(mutual1, mscores0.gather(1, indices1), zero)
mscores1 = mscores0.gather(1, indices1)
mscores1[mutual1.logical_not()] = 0
valid0 = mutual0 & (mscores0 > self.match_threshold)
valid1 = mutual1 & valid0.gather(1, indices1)
# indices0 = torch.where(valid0, indices0, indices0.new_tensor(-1))
# indices1 = torch.where(valid1, indices1, indices1.new_tensor(-1))
indices0[valid0.logical_not()] = -1
indices1[valid1.logical_not()] = -1
return {
'matches0': indices0, # use -1 for invalid match
'matches1': indices1, # use -1 for invalid match
'matching_scores0': mscores0,
'matching_scores1': mscores1,
'loss': loss,
}
# does a larger or a smaller score indicate higher confidence? log cannot take negative values

View File

@ -4,8 +4,8 @@ suffix = ""
import jittor as jt
import time
from pathlib import Path
home_path = str(Path.home())
import jittor_utils as jit_utils
home_path = jit_utils.home()
perf_path = os.path.join(home_path, ".cache", "jittor_perf")
def main():

View File

@ -0,0 +1,31 @@
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Maintainers: Dun Liang <randonlang@gmail.com>.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import unittest
import jittor as jt
from .test_core import expect_error
import numpy as np
@unittest.skipIf(not jt.compiler.has_acl, "No ACL found")
class TestACL(unittest.TestCase):
@jt.flag_scope(use_acl=1)
def test_array(self):
print("use_acl", jt.flags.use_acl)
a = jt.array([1,2,3])
np.testing.assert_allclose(a.numpy(), [1,2,3])
@jt.flag_scope(use_acl=1)
def test_add(self):
a = jt.array([1,2,3])
b = a+a
np.testing.assert_allclose(b.numpy(), [2,4,6])
def test_meminfo(self):
jt.display_memory_info()
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,344 @@
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Maintainers: Dun Liang <randonlang@gmail.com>.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import unittest
import jittor as jt
import numpy as np
import os
n = 400000000
# n = 4000000
n = 7680000
def get_mem_band():
a = jt.rand((n)).float32()
for i in range(100):
a.copy().sync()
jt.sync_all(True)
import time
t = time.time()
for i in range(1000):
a.copy().sync()
jt.sync_all(True)
dt = time.time() - t
band = a.numel() * 4 * 2000 / dt / 1024**3
print("Mem band: ", band)
return band
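# Worked example of the formula above: with n = 7,680,000 float32 elements one
# copy reads and writes n*4 bytes each, i.e. roughly 61.4 MB of traffic per
# iteration; 1000 timed iterations therefore move numel*4*2*1000 bytes (the
# 2000 factor), and dividing by the elapsed time and 1024**3 gives GiB/s.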
def check_simple_add_band():
# copy: 816
# S=1 128,1024, ILP=1 634
# S=0 128,1024, ILP=1 734
# S=0 128,512, ILP=1 716
# S=0 64,1024, ILP=1 706
# S=0 256,1024, ILP=1 706
def test(S=0, B=128, T=1024, ILP=1):
a = jt.rand((n)).float32()
jt.sync_all(True)
jt.flags.log_silent = 1
with jt.profile_scope(100, 1000) as rep:
b = jt.code(a.shape, a.dtype, [a],
cuda_header="#include \"type/fp16_compute.h\"",
cuda_src=f"""
__global__ void kernel(in0_type * __restrict__ a, in0_type* __restrict__ b, int num) {{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tnum = blockDim.x * gridDim.x;
#define ILP {ILP}
for (int i=tid*ILP; i<num; i+=tnum*ILP) {{
// b[i] = a[i];
vload<ILP*sizeof(in0_type)>(b+i, a+i);
{"__syncthreads();" if S else ""}
}}
}}
kernel<<<{B},{T}>>>(in0_p, out0_p, in0->num);
""")
b.sync()
bw = float(rep[-1][9]) / 1024**3
s = f"S={S}, B={B}, T={T}, ILP={ILP} BW={bw}"
print(s)
return s, bw
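# Note: `rep` is the report table returned by jt.profile_scope; the code above
# assumes the last row's column 9 holds the measured memory throughput in
# bytes per second, so dividing by 1024**3 reports GiB/s.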
def test2(S=0, B=128, T=1024, ILP=1):
a = jt.rand((n)).float32()
jt.sync_all(True)
# jt.flags.log_silent = 0
with jt.profile_scope(10, 1000) as rep:
b = jt.code(a.shape, a.dtype, [a],
cuda_header="#include \"type/fp16_compute.h\"",
cuda_src=f"""
__global__ void kernel(float2 * __restrict__ a, float2* __restrict__ b, int num) {{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tnum = blockDim.x * gridDim.x;
#define ILP 1
for (int i=tid*ILP; i<num; i+=tnum*ILP) {{
b[i] = a[i];
// b[i+1] = a[i+1];
// vload<ILP*sizeof(in0_type)>(b+i, a+i);
{"__syncthreads();" if S else ""}
}}
}}
kernel<<<{B},{T}>>>((float2*)in0_p, (float2*)out0_p, in0->num/2);
""")
b.sync()
bw = float(rep[-1][9]) / 1024**3
s = f"T2: S={S}, B={B}, T={T}, ILP={ILP} BW={bw}"
print(s)
return s, bw
def test3(S=0, B=128, T=1024, ILP=1, C=0):
a = jt.rand((n)).float32()
b = jt.rand(B)
jt.sync_all(True)
jt.flags.log_silent = 1
with jt.profile_scope(100, 1000) as rep:
b = jt.code(a.shape, a.dtype, [a, b],
cuda_header="#include \"type/fp16_compute.h\"",
cuda_src=f"""
__global__ void kernel(in0_type * __restrict__ a, in0_type* __restrict__ b, int num) {{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tnum = blockDim.x * gridDim.x;
#define ILP {ILP}
for (int i=tid*ILP; i<num; i+=tnum*ILP) {{
// b[i] = a[i];
vload<ILP*sizeof(in0_type)>(b+i, a+i);
{"__syncthreads();" if S else ""}
}}
{"__syncthreads();" if C else ""}
}}
kernel<<<in1->shape[0],{T}>>>(in0_p, out0_p, in0->num);
""")
b.compile_options = {"FLAGS: -Xptxas -dlcm=ca ": C}
# b.compile_options = {"FLAGS: Xptxas dlcm=ca ": 1}
b.sync()
bw = float(rep[-1][9]) / 1024**3
s = f"T3: S={S}, B={B}, T={T}, ILP={ILP} C={C} BW={bw}"
print(s)
return s, bw
def test4(S=0, B=128, T=1024, ILP=1, C=0, name="b.png"):
a = jt.rand((n)).float32()
b = jt.rand(B*4).uint32()
jt.sync_all(True)
# jt.flags.log_silent = 1
with jt.profile_scope(100, 10000) as rep:
_ = jt.code(a.shape, a.dtype, [a, b],
cuda_header="#include \"type/fp16_compute.h\"",
cuda_src=f"""
__device__ uint get_smid(void) {{
uint ret;
asm("mov.u32 %0, %smid;" : "=r"(ret) );
return ret;
}}
__device__ uint get_time(void) {{
uint ret;
asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(ret));
return ret;
}}
__global__ void kernel(in0_type * __restrict__ a, in0_type* __restrict__ b, int num, in1_type* __restrict__ c) {{
uint t = get_time();
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tnum = blockDim.x * gridDim.x;
#define ILP {ILP}
for (int i=tid*ILP; i<num; i+=tnum*ILP) {{
// b[i] = a[i];
vload<ILP*sizeof(in0_type)>(b+i, a+i);
{"__syncthreads();" if S else ""}
}}
{"__syncthreads();" if C else ""}
if (threadIdx.x == 0)
((uint4* __restrict__)c)[blockIdx.x] =
uint4{{get_smid(), t, get_time(), 0}};
}}
kernel<<<in1->shape[0]/4,{T}>>>(in0_p, out0_p, in0->num, in1_p);
""")
_.compile_options = {"FLAGS: -Xptxas -dlcm=ca ": C}
# b.compile_options = {"FLAGS: Xptxas dlcm=ca ": 1}
_.sync()
bw = float(rep[-1][9]) / 1024**3
b = b.data.reshape(-1, 4)[:,:3]
mint = b[:,1].min()
b[:,1:] -= mint
smmax = int(b[:,0].max())
smmin = int(b[:,0].min())
maxt = b.max()
# print(b)
s = f"T4: S={S}, B={B}, T={T}, ILP={ILP} C={C} BW={bw:.3f} sm={smmin},{smmax} maxt={maxt}"
print(s)
import pylab as pl
pl.figure(figsize=(16,16))
texts = []
pret = np.zeros(200, dtype="uint32")
for i in range(B):
smid, s, t = b[i]
pl.plot([s,t], [smid, smid], 'ro-')
texts.append((s, smid, i))
texts.append((t, smid, i))
texts = sorted(texts)
for (s, smid, bid) in texts:
cpos = max(pret[smid], s)
pl.text(cpos, smid, str(bid))
pret[smid] = cpos + maxt // 30
# print("???")
# adjust_text(texts, arrowprops=dict(arrowstyle='->', color='blue'))
# print("???")
pl.savefig(name)
pl.close()
return s, bw
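# test4 additionally records, per thread block, its SM id and the start/end of
# %globaltimer_lo, then draws one horizontal segment per block so the mapping
# of blocks onto SMs over time can be inspected. The sweep over B in
# range(70, 83) below apparently probes the point where the block count exceeds
# the number of SMs; the "# 82", "# 82*3 = 246" notes further down suggest a
# GPU with 82 SMs.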
# test(S=0, B=128, T=1024, ILP=1)
# test(S=1, B=128, T=1024, ILP=1)
# test(S=0, B=64, T=1024, ILP=1)
# test(S=0, B=256, T=1024, ILP=1)
# test(S=1, B=128, T=512, ILP=1)
# test(S=1, B=128, T=256, ILP=1)
# test(S=0, B=128, T=1024, ILP=2)
# test(S=0, B=128, T=1024, ILP=4)
# test(S=0, B=128, T=512, ILP=2)
# test(S=0, B=128, T=512, ILP=4)
# test(S=1, B=128, T=1024, ILP=2)
# test(S=1, B=128, T=1024, ILP=4)
# test(S=1, B=128, T=1024, ILP=8)
# test(S=1, B=128, T=1024, ILP=16)
# test(S=1, B=128, T=512, ILP=2)
# test(S=1, B=128, T=512, ILP=4)
# test(S=1, B=256, T=1024, ILP=2)
# test(S=1, B=512, T=1024, ILP=2)
# test(S=1, B=256, T=1024, ILP=4)
# test(S=1, B=256, T=1024, ILP=8)
# test(S=1, B=256, T=1024, ILP=16)
# test(S=1, B=256, T=512, ILP=2)
# test(S=1, B=256, T=512, ILP=4)
# test(S=1, B=128, T=256, ILP=2)
# test(S=1, B=128, T=256, ILP=4)
# test(S=0, B=128, T=256, ILP=2)
# test(S=0, B=128, T=256, ILP=4)
# for b in [1, 2, 4, 8, 16, 32, 64, 128,256]:
# test(S=1, B=b, T=512, ILP=2)
import matplotlib as mpl
mpl.use('Agg')
import pylab as pl
import numpy as np
# test4(S=1, B=82, T=1024, ILP=2, C=0, name="b.png")
# test4(S=1, B=83, T=1024, ILP=2, C=0, name="c.png")
# test4(S=1, B=82*3, T=512, ILP=2, C=0, name="d1.png")
# test4(S=1, B=82*3+1, T=512, ILP=2, C=0, name="d2.png")
# test4(S=1, B=82*6+1, T=512, ILP=2, C=0, name="d3.png")
# test4(S=0, B=82*6+1, T=512, ILP=2, C=0, name="d4.png")
for b in range(70, 83):
test4(S=1, B=b, T=1024, ILP=2, C=0, name=f"b-{b}.png")
# data = []
# for b in range(32, 2000, 8):
# _, bw = test3(S=0, B=b, T=32, ILP=2)
# data.append([b, bw])
# data = np.array(data)
# pl.plot(data[:,0], data[:,1])
# for t in [32, 64, 128, 256, 512, 1024]:
# data = []
# for b in range(32, 2000, 8):
# _, bw = test3(S=1, B=b*(1024//t), T=t, ILP=2)
# data.append([b, bw])
# data = np.array(data)
# pl.plot(data[:,0], data[:,1])
# for t in [1024]:
# for c in [0,1]:
# data = []
# # for b in range(32, 1000, 8):
# for b in range(32, 33, 8):
# _, bw = test3(S=c, B=b*(1024//t), T=t, ILP=2, C=0)
# data.append([b, bw])
# data = np.array(data)
# pl.plot(data[:,0], data[:,1])
# for ilp in [2]:
# for s in [1]:
# for t in [1024,512,256,128]:
# data = []
# for b in range(32, 1100, 8):
# _, bw = test3(S=s, B=b*(1024//t), T=t, ILP=ilp)
# data.append([b, bw])
# data = np.array(data)
# pl.plot(data[:,0], data[:,1])
# pl.savefig("a.png")
# pl.close()
# for b in range(80, 90, 1):
# _, bw = test3(S=1, B=b, T=1024, ILP=2)
# # 82
# for b in range(240, 260, 1):
# _, bw = test3(S=1, B=b, T=512, ILP=2)
# # 82*3 = 246
# for b in range(240, 500, 1):
# _, bw = test3(S=1, B=b, T=256, ILP=2)
# # 492 = 82*6
# for b in range(240, 1000, 1):
# _, bw = test3(S=1, B=b, T=128, ILP=2)
# # 984 = 82*12
# for b in [128,256]:
# test(S=1, B=b, T=1024, ILP=2)
# for b in [128,256]:
# test(S=0, B=b, T=512, ILP=2)
# for b in [128,256]:
# test(S=0, B=b, T=1024, ILP=2)
# for b in [128,256]:
# test(S=1, B=b, T=512, ILP=1)
# for b in [128,256]:
# test(S=1, B=b, T=1024, ILP=1)
# for b in [128,256]:
# test(S=0, B=b, T=512, ILP=1)
# for b in [128,256]:
# test(S=0, B=b, T=1024, ILP=1)
# test(S=1, B=128, T=512, ILP=4)
# test(S=1, B=64, T=512, ILP=2)
# test(S=1, B=80, T=512, ILP=2)
# test(S=1, B=100, T=512, ILP=2)
# test(S=1, B=110, T=512, ILP=2)
# test(S=1, B=115, T=512, ILP=2)
# test(S=1, B=120, T=512, ILP=2)
# test(S=1, B=130, T=512, ILP=2)
# test(S=1, B=140, T=512, ILP=2)
# test2(S=1, B=128, T=512, ILP=2)
# test(S=1, B=128, T=256, ILP=4)
# test(S=1, B=128, T=128, ILP=8)
# test(S=1, B=128, T=64, ILP=16)
@unittest.skipIf(not jt.compiler.has_cuda, "No CUDA found")
class TestBenchmarkCUDA(unittest.TestCase):
def setUp(self):
jt.flags.use_cuda = 1
def tearDown(self):
jt.flags.use_cuda = 0
def test_main(self):
return
get_mem_band()
check_simple_add_band()
if __name__ == "__main__":
unittest.main()

View File

@ -19,12 +19,12 @@ def all_eq(x, y):
y = convert(y)
if str(x.dtype).startswith("float"):
return str(y.dtype).startswith("float") and x.shape == y.shape and (x==y).all()
return x.dtype == y.dtype and x.shape == y.shape and (x==y).all()
return x.dtype == y.dtype and x.shape == y.shape and np.testing.assert_allclose(x, y)
def check(op, *args):
x = eval(f"np.{op}(*args)")
y = eval(f"jt.{op}(*args).data")
assert all_eq(x, y), f"{x}\n{y}"
all_eq(x, y)
class TestBinaryOp(unittest.TestCase):
def test_binary_op(self):
@ -47,6 +47,9 @@ class TestBinaryOp(unittest.TestCase):
def test_i(self):
def check(op, a, b):
if isinstance(a, list):
a = np.array(a)
b = np.array(b)
if jt.flags.use_cuda and op == "@":
return
if op=="@":
@ -65,13 +68,13 @@ class TestBinaryOp(unittest.TestCase):
a = np.float32(a)
ja = np.float32(ja)
assert all_eq(ja, a), (ja,a)
all_eq(ja, a)
check("+", 5, 2)
check("-", 5, 2)
check("*", 5, 2)
check("/", 5, 2)
check("//", 5, 2)
check("@", [[5]], [[2]])
# check("@", [[5]], [[2]])
check("%", 5, 2)
check("**", 5, 2)
check("<<", 5, 2)
@ -80,6 +83,15 @@ class TestBinaryOp(unittest.TestCase):
check("^", 5, 2)
check("|", 5, 2)
check("+", [5.0,6.0], [2.0,3.0])
check("-", [5.0,6.0], [2.0,3.0])
check("*", [5.0,6.0], [2.0,3.0])
check("/", [5.0,6.0], [2.0,3.0])
check("//", [5.0,6.0], [2.0,3.0])
check("@", [[5,6],[7,8]], [[2,3],[4,5]])
check("%", [5.0,6.0], [2.0,3.0])
check("**", [5.0,6.0], [2.0,3.0])
def test_r(self):
def check(op, a, b):
a = np.array(a)
@ -97,7 +109,7 @@ class TestBinaryOp(unittest.TestCase):
a = eval(f"a {op} b")
a = np.array(a)
assert all_eq(jc, a), f"\n{jc}\n{a}"
all_eq(jc, a)
check("+", 5, 2)
check("-", 5, 2)
check("*", 5, 2)
@ -118,6 +130,7 @@ class TestBinaryOp(unittest.TestCase):
a = np.random.rand(10)
b = np.random.rand(10)
c = np.random.rand(10)
tol = 1e-2 if jt.flags.amp_reg & 2 else 1e-4
for op in ops:
func = lambda x: eval(f"((x[0]{op}x[1])*x[2]).sum()")
x, grads = ngrad(func, [a,b,c], 1e-8)
@ -127,7 +140,7 @@ class TestBinaryOp(unittest.TestCase):
jx = eval(f"(ja{op}jb)*jc")
jgrads = jt.grad(jx, [ja,jb,jc])
for jd, nd in zip(jgrads, grads):
assert (np.abs(jd.data-nd)<1e-4).all(), f"\n{jd.data}\n{nd}"
np.testing.assert_allclose(jd.data, nd, atol=tol, rtol=tol)
def test_mod_float(self):
a = jt.random((10,))
@ -137,7 +150,8 @@ class TestBinaryOp(unittest.TestCase):
a = jt.random((10,), 'float64')
b = jt.random((10,), 'float64')
c = a % b
assert np.allclose(c.data, a.data % b.data)
assert np.allclose(c.data, a.data % b.data, a.data, b.data)
if jt.flags.amp_reg & 2: return
a = jt.random((10,)) * 1000
b = (jt.random((10,)) * 10).int() + 1
c = a % b
@ -169,5 +183,19 @@ class TestBinaryOp(unittest.TestCase):
class TestBinaryOpCuda(TestBinaryOp, test_cuda(2)):
pass
class TestBinaryOpCpuFp16(TestBinaryOp):
def setUp(self):
jt.flags.amp_reg = 2 | 4 | 8 | 16
def tearDown(self):
jt.flags.amp_reg = 0
class TestBinaryOpCudaFp16(TestBinaryOp):
def setUp(self):
jt.flags.amp_reg = 2 | 4 | 8 | 16
jt.flags.use_cuda = 1
def tearDown(self):
jt.flags.amp_reg = 0
jt.flags.use_cuda = 0
if __name__ == "__main__":
unittest.main()

View File

@ -106,5 +106,18 @@ class TestCore(unittest.TestCase):
a.y = 2
assert a.y == 2
def test_modules(self):
a = jt.Module()
a.x = jt.Module()
a.y = jt.Module()
a.a = jt.array([1,2,3])
a.b = jt.array([1,2,3])
assert list(a._modules.keys()) == ["x", "y"]
assert a._modules['x'] is a.x
assert a._modules['y'] is a.y
assert list(a._parameters.keys()) == ['a', 'b']
assert a._parameters['a'] is a.a
assert a._parameters['b'] is a.b
if __name__ == "__main__":
unittest.main()

View File

@ -101,6 +101,12 @@ class TestCuda(unittest.TestCase):
assert a.shape == [3,4,5] and a.dtype == 'float'
assert (-na.flatten() == range(3*4*5)).all(), na
def test_cuda_fused_op(self):
a = jt.array([1,2,3])
a.sync()
with jt.flag_scope(use_cuda=1):
((a+a)*2).data
@unittest.skipIf(jt.compiler.has_cuda, "Only test without CUDA")
class TestNoCuda(unittest.TestCase):

View File

@ -123,8 +123,8 @@ class TestCudnnConvOp(unittest.TestCase):
logs = find_log_with_re(raw_log, "(Jit op key (not )?found: cudnn_conv.*)")
assert len(logs)==3 and "oihw" in logs[0][0], logs
assert np.allclose(y.data, cy.data)
np.testing.assert_allclose(dx.data, cdx.data, atol=1e-2)
np.testing.assert_allclose(dw.data, cdw.data, atol=1e-2)
np.testing.assert_allclose(dx.data, cdx.data, atol=1e-2, rtol=1e-3)
np.testing.assert_allclose(dw.data, cdw.data, atol=1e-2, rtol=1e-3)
if os.name == 'nt': return
check([10,3,100,100], [5,3,3,3], stride=2, padding=0, dilation=1)
check([10,4,40,50], [5,4,5,5], stride=1, padding=1, dilation=1)

View File

@ -0,0 +1,347 @@
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Maintainers: Dun Liang <randonlang@gmail.com>.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import unittest
import jittor as jt
import numpy as np
import os
def transpose0231(x):
s0, s1, s2, s3 = x.shape
asize = 16
bsize = 16
ILP = 2
return jt.code([s0, s2, s3, s1], x.dtype, [x],
cuda_header="#include <type/fp16_compute.h>\n#include <cassert>",
cuda_src=f"""
__global__ void kernel(in0_type* __restrict__ x, in0_type* __restrict__ y, int s0, int s1, int s2, int s3) {{
__shared__ in0_type t[{asize*ILP}*{bsize*ILP+1}];
int t3 = threadIdx.x % {bsize};
int t1 = threadIdx.x / {bsize};
int b3 = blockIdx.x;
int b2 = blockIdx.y;
int b0 = blockIdx.z;
int x3 = 1;
int x2 = s3;
int x1 = s2*x2;
int x0 = s1*x1;
int y3 = 1;
int y2 = s1;
int y1 = s3*y2;
int y0 = s2*y1;
in0_type tmp[{ILP}];
for (int i=0; i<(s1-1)/{asize*ILP}+1; i++)
{{
int _b3 = b3 * {bsize*ILP} + t3*{ILP};
if (_b3 < s3) {{
#pragma unroll
for (int j=0; j<{ILP}; j++) {{
vload<sizeof(in0_type)*{ILP}>(
tmp,
&x[b0*x0+(t1*{ILP}+j+i*{asize*ILP})*x1+b2*x2+_b3*x3]
);
#pragma unroll
for (int k=0; k<{ILP}; k++)
t[(t1*{ILP}+j)*{bsize*ILP+1}+t3*{ILP}+k] = tmp[k];
}}
}}
__syncthreads();
int t3_ = threadIdx.x % {asize};
int t1_ = threadIdx.x / {asize};
_b3 = b3 * {bsize*ILP} + t1_*{ILP};
if (_b3 < s3) {{
#pragma unroll
for (int j=0; j<{ILP}; j++) {{
#pragma unroll
for (int k=0; k<{ILP}; k++) {{
tmp[k] =
t[(t3*{ILP}+k)*{bsize*ILP+1}+t1_*{ILP}+j];
}}
vload<sizeof(in0_type)*{ILP}>(
&y[b0*y0+b2*y1+(_b3+j)*y2+((t3*{ILP})+i*{asize*ILP})*y3],
tmp
);
}}
}}
__syncthreads();
}}
}}
int s0, s1, s2, s3;
in0->shape.unpack(s0, s1, s2, s3);
kernel<<<{{(s3-1)/{bsize*ILP}+1, s2, s0 }}, {bsize*asize}>>>
(in0_p, out0_p, s0, s1, s2, s3);
""")
def transpose0231_2(x):
s0, s1, s2, s3 = x.shape
asize = 16
bsize = 8
ILP = 2
return jt.code([s0, s2, s3, s1], x.dtype, [x],
cuda_header="#include <type/fp16_compute.h>\n#include <cassert>",
cuda_src=f"""
__global__ __launch_bounds__({asize*bsize}) void kernel(in0_type* __restrict__ x, in0_type* __restrict__ y, int s0, int s1, int s2, int s3) {{
__shared__ in0_type t[{asize*ILP}*{bsize*ILP+1}];
int t3 = threadIdx.x % {bsize};
int t1 = threadIdx.x / {bsize};
int b3 = blockIdx.x;
int b1 = blockIdx.y;
int b2 = 0;
int b0 = blockIdx.z;
int x3 = 1;
int x2 = s3;
int x1 = s2*x2;
int x0 = s1*x1;
int y3 = 1;
int y2 = s1;
int y1 = s3*y2;
int y0 = s2*y1;
in0_type tmp[{ILP}];
{{
int _b3 = b3 * {bsize*ILP} + t3*{ILP};
if (_b3 < s3) {{
#pragma unroll
for (int j=0; j<{ILP}; j++) {{
if (t1*{ILP}+j+b1*{asize*ILP} >= s1)
continue;
vload<sizeof(in0_type)*{ILP}>(
tmp,
&x[b0*x0+(t1*{ILP}+j+b1*{asize*ILP})*x1+b2*x2+_b3*x3]
);
#pragma unroll
for (int k=0; k<{ILP}; k++)
t[(t1*{ILP}+j)*{bsize*ILP+1}+t3*{ILP}+k] = tmp[k];
}}
}}
__syncthreads();
int t3_ = threadIdx.x % {asize};
int t1_ = threadIdx.x / {asize};
_b3 = b3 * {bsize*ILP} + t1_*{ILP};
int yy3 = (t3_*{ILP})+b1*{asize*ILP};
if (_b3 < s3 && yy3 < s1) {{
#pragma unroll
for (int j=0; j<{ILP}; j++) {{
#pragma unroll
for (int k=0; k<{ILP}; k++) {{
tmp[k] =
t[(t3_*{ILP}+k)*{bsize*ILP+1}+t1_*{ILP}+j];
}}
vload<sizeof(in0_type)*{ILP}>(
&y[b0*y0+b2*y1+(_b3+j)*y2+yy3*y3],
tmp
);
// printf("%d %d %d %d %d\\n", b0*y0+b2*y1+(_b3+j)*y2+yy3*y3,
// b0, b2, (_b3+j), yy3);
}}
}}
__syncthreads();
}}
}}
int s0, s1, s2, s3;
in0->shape.unpack(s0, s1, s2, s3);
kernel<<<{{(s3-1)/{bsize*ILP}+1, (s1-1)/{asize*ILP}+1, s0 }}, {bsize*asize}>>>
(in0_p, out0_p, s0, s1, s2, s3);
""")
def check_share():
return
a = jt.rand((30, 32, 4, 2000)).float32()
jt.code(a.shape, a.dtype, [a],
cuda_header="#include <type/fp16_compute.h>\n#include <cassert>",
cuda_src="""
__global__ void kernel(in0_type* __restrict__ a, in0_type* __restrict__ b) {
__shared__ float x[32*33];
for (int i=0; i<3; i++) {
((float2*)&x[i])[0] = ((float2*)&a[i])[0];
((float2*)&b[i])[0] = ((float2*)&x[i+1])[0];
}
}
kernel<<<1024,16*16>>>(in0_p, out0_p);
""").sync()
jt.sync_all(True)
# print(a[0]+1)
print("pass test")
class TestFP16(unittest.TestCase):
def test_array(self):
a = np.array([1,2,3], dtype="float16")
b = jt.array(a)
np.testing.assert_allclose(a, b.data)
def test_add(self):
a = np.array([1,2,3], dtype="float16")
b = jt.array(a)
c = b+b
np.testing.assert_allclose(c.data, a+a)
d = c.sum()
np.testing.assert_allclose(d.data, [12])
c = c+1
print(c)
def test_matmul(self):
a = jt.random((100,100)).float16()
b = jt.random((100,100)).float16()
c = jt.matmul(a, b)
c.sync()
def test_matmul_grad(self):
a = jt.random((100,100)).float16()
b = jt.random((100,100)).float16()
c = jt.matmul(a, b)
c.sync()
da, db = jt.grad(c, [a,b])
jt.sync_all()
assert da.dtype == "float16"
assert db.dtype == "float16"
def test_array_random_auto_cast(self):
a = jt.array([1.0,2.0])
assert a.dtype == "float32"
with jt.flag_scope(amp_reg=2+16):
a = jt.array([1.0,2.0])
assert a.dtype == "float16", a.dtype
a = jt.random([10])
assert a.dtype == "float32"
with jt.flag_scope(amp_reg=2+16):
a = jt.random([10])
assert a.dtype == "float16", a.dtype
def test_conv(self):
a = jt.random((3,4,5,5)).float16()
b = jt.random((4,4,3,3)).float16()
c = jt.nn.conv(a, b)
c.sync()
def test_max(self):
a = jt.random((100,)).float16()
b = jt.random((100,)).float16()
c = a.maximum(b)
c.sync()
def test_reduce_dtype_infer(self):
with jt.flag_scope(amp_reg=1):
a = jt.random((3,4,5,5)).float16()
b = a.sum()
b.sync()
assert b.dtype == "float32"
with jt.flag_scope(amp_reg=2):
a = jt.random((3,4,5,5)).float16()
b = a.sum()
b.sync()
assert b.dtype == "float32"
with jt.flag_scope(amp_reg=0):
a = jt.random((3,4,5,5)).float16()
b = a.sum()
b.sync()
assert b.dtype == "float32"
with jt.flag_scope(amp_reg=2+4):
a = jt.random((3,4,5,5)).float16()
b = a.sum()
b.sync()
assert b.dtype == "float16", b.dtype
def test_white_dtype_infer(self):
with jt.flag_scope(amp_reg=1):
a = jt.random((3,4,5,5)).float16()
b = a**a
b.sync()
assert b.dtype == "float32"
with jt.flag_scope(amp_reg=2):
a = jt.random((3,4,5,5)).float16()
b = a**a
b.sync()
assert b.dtype == "float32"
with jt.flag_scope(amp_reg=0):
a = jt.random((3,4,5,5)).float16()
b = a**a
b.sync()
assert b.dtype == "float32"
with jt.flag_scope(amp_reg=2+8):
a = jt.random((3,4,5,5)).float16()
b = a**a
b.sync()
assert b.dtype == "float16", b.dtype
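# As far as these tests exercise it, the amp_reg bits compose as follows:
# bit 2 prefers float16 (bit 1 would prefer float32), bit 4 lets reductions
# such as sum() keep float16 instead of widening to float32, bit 8 does the
# same for white-listed ops such as pow, and bit 16 (see
# test_array_random_auto_cast) applies the preference to newly created
# jt.array / jt.random tensors as well; 2 | 4 | 8 | 16 therefore keeps as much
# of the computation as possible in float16.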
def test_module_half(self):
a = jt.nn.Linear(10,10)
assert a.weight.dtype == "float32"
a.half()
assert a.weight.dtype == "float16"
@unittest.skipIf(not jt.compiler.has_cuda, "No CUDA found")
class TestFP16CUDA(TestFP16):
def setUp(self):
jt.flags.use_cuda = 1
def tearDown(self):
jt.flags.use_cuda = 0
def test_softmax(self):
a = jt.rand((120, 2000, 2000)).float16()
# a = jt.rand((1, 2000, 2000)).float32()
jt.sync_all()
with jt.profile_scope(10, 100):
a.log_softmax(-1).sync()
def test_transpose(self):
check_share()
# return
a = jt.rand((30, 32, 4, 2000)).float32()
# a = jt.rand((1, 1024, 1, 2000)).float32()
diff = transpose0231(a).data != a.transpose((0,2,3,1)).data
print(np.where(diff))
# return
jt.sync_all()
# with jt.profile_scope(100, 11000):
with jt.profile_scope(100, 11000):
# a.log_softmax(-1).sync()
transpose0231(a).sync()
a.transpose((0,2,3,1)).sync()
# a.transpose((0,2,1,3)).sync()
a.fuse_transpose((0,2,1,3)).sync()
(a+1).sync()
jt.sync_all(True)
diff = transpose0231(a).data != a.transpose((0,2,3,1)).data
print(np.where(diff))
np.testing.assert_allclose(transpose0231(a).data, a.transpose((0,2,3,1)).data)
def test_transpose2(self):
# check_share()
# return
# a = jt.rand((30, 32, 4, 2000)).float32()
# a = jt.rand((1, 10000, 1, 2000)).float32()
a = jt.rand((1, 10000, 1, 2048)).float32()
print("transpose")
transpose0231_2(a).sync()
print("add")
(a+1).sync()
return
# a = jt.arange(32*16).reshape((1, 32, 1, 16))
diff = transpose0231_2(a).data != a.transpose((0,2,3,1)).data
print(np.where(diff))
# return
jt.sync_all()
# with jt.profile_scope(100, 11000):
with jt.profile_scope(100, 1100):
# a.log_softmax(-1).sync()
transpose0231_2(a).sync()
a.transpose((0,2,3,1)).sync()
# a.transpose((0,2,1,3)).sync()
a.fuse_transpose((0,2,1,3)).sync()
(a+1).sync()
jt.sync_all(True)
diff = transpose0231_2(a).data != a.transpose((0,2,3,1)).data
print(np.where(diff))
np.testing.assert_allclose(transpose0231_2(a).data, a.transpose((0,2,3,1)).data)
if __name__ == "__main__":
unittest.main()

View File

@ -10,12 +10,12 @@
import unittest
import os, sys
import jittor as jt
from pathlib import Path
import jittor_utils as jit_utils
class TestLock(unittest.TestCase):
def test(self):
if os.environ.get('lock_full_test', '0') == '1':
cache_path = os.path.join(str(Path.home()), ".cache", "jittor", "lock")
cache_path = os.path.join(jit_utils.home(), ".cache", "jittor", "lock")
assert os.system(f"rm -rf {cache_path}") == 0
cmd = f"cache_name=lock {sys.executable} -m jittor.test.test_example"
else:

View File

@ -75,6 +75,8 @@ class TestPad(unittest.TestCase):
print('pass flip test ...')
def test_cross(self):
def check_equal(a, b, tol):
np.testing.assert_allclose(a.detach().numpy(), b.numpy(), atol=1e-5)
arr1 = np.random.randn(16,3,224,224,3)
arr2 = np.random.randn(16,3,224,224,3)
check_equal(torch.Tensor(arr1).cross(torch.Tensor(arr2), dim=1), jt.array(arr1).cross(jt.array(arr2), dim=1), 1e-1)
@ -257,5 +259,52 @@ class TestOther(unittest.TestCase):
a = jt.arctan2(jt.array([1,1.0,0]), jt.array([1,0.0,-1]))
np.testing.assert_allclose(a.data, [0.7853982,1.5707964,3.1415927])
y = jt.random((100,))
x = jt.random((100,))
z = jt.arctan2(y, x)
z2 = np.arctan2(y.data, x.data)
np.testing.assert_allclose(z.data, z2)
def test_code_softmax(self):
if not jt.has_cuda: return
def softmax(x, dim = None, log=False):
if dim is None:
x = (x - x.max()).exp()
ret = x / x.sum()
else:
x = (x-x.max(dim, keepdims=True)).exp()
ret = x / x.sum(dim, keepdims=True)
if log: return ret.log()
return ret
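# Subtracting the per-row max before exp() does not change the result
# (softmax(x) == softmax(x - c) for any constant c) but keeps exp() from
# overflowing, which matters in particular for float16 inputs.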
from jittor.other.code_softmax import softmax_v1
with jt.flag_scope(use_cuda = 1):
shape = (120, 2000, 2000)
shape = (3,3)
for log in [0,1]:
for shape in [(3,3),
(12, 200, 2000),
(12, 200, 2048),
(12, 200, 2049)]:
print(shape)
a = jt.rand(shape)
c = jt.rand(shape)
b = softmax(a, -1, log=log)
bb = softmax_v1(a, log=log)
err = (bb - b).abs().max()
assert err.item() < 1e-5, (err, bb, b)
d1 = jt.grad(b*c, a)
d2 = jt.grad(bb*c, a)
err = (d1 - d2).abs().max()
if log:
assert err.item() < 1e-2, (err.item())
else:
assert err.item() < 1e-5, (err.item())
if __name__ == "__main__":
unittest.main()

View File

@ -8,10 +8,10 @@ import unittest, os
import jittor as jt
from jittor import LOG
import sys
from pathlib import Path
import jittor_utils as jit_utils
dirname = os.path.join(jt.flags.jittor_path, "notebook")
notebook_dir = os.path.join(str(Path.home()), ".cache","jittor","notebook")
notebook_dir = os.path.join(jit_utils.home(), ".cache","jittor","notebook")
tests = []
for mdname in os.listdir(dirname):
if not mdname.endswith(".src.md"): continue

View File

@ -111,17 +111,6 @@ class TestOpCompiler(unittest.TestCase):
check("@{a^b == 7}", "2")
check("@{(a^b) == 7}", "1")
check("@{b<<a == 5*4}", "1")
check('''#include "ops/binary_op_defs.h"
#define OP1(a, b) a+b
OP1
@expand_macro(OP1,1,2)
@expand_macro(maximum, T, 1, 2)
@expand_macro(@OP,T,1,2)''',
''' #define OP1(a, b) a+b
OP1
1+2
std::max(T(1), T(2))
((1)+T(2)*(T(rcount)))''')
expect_error(lambda: jit_precompile(vars, "@{a"))
expect_error(lambda: jit_precompile(vars, "@for(a"))
expect_error(lambda: jit_precompile(vars, "@for(i,l,r)"))

View File

@ -0,0 +1,24 @@
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Maintainers: Dun Liang <randonlang@gmail.com>.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import unittest
import jittor as jt
import numpy as np
import os
class TestProfiler(unittest.TestCase):
def test_profiler(self):
a = jt.rand(1000,1000)
b = jt.rand(1000,1000)
jt.sync_all()
with jt.profile_scope(10, 100, profiler_record_peek=1) as rep:
jt.matmul(a, b).sync()
x = float(rep[-1][4])
y = float(rep[-2][4])
assert abs(x-y)/x < 1e-3
if __name__ == "__main__":
unittest.main()

View File

@ -36,19 +36,7 @@ class MnistNet(Module):
return x
@unittest.skipIf(skip_this_test, "skip_this_test")
class TestResnet(unittest.TestCase):
@classmethod
def setUpClass(self):
# hyper-parameters
self.batch_size = int(os.environ.get("TEST_BATCH_SIZE", "100"))
self.weight_decay = 0.0001
self.momentum = 0.9
self.learning_rate = 0.1
# mnist dataset
self.train_loader = MNIST(train=True, transform=trans.Resize(224)) \
.set_attrs(batch_size=self.batch_size, shuffle=True)
self.train_loader.num_workers = 4
class TestResnetFp32(unittest.TestCase):
# setup random seed
def setup_seed(self, seed):
np.random.seed(seed)
@ -59,6 +47,19 @@ class TestResnet(unittest.TestCase):
@jt.flag_scope(use_cuda=1, use_stat_allocator=1)
def test_resnet(self):
self.setup_seed(1)
# hyper-parameters
self.batch_size = int(os.environ.get("TEST_BATCH_SIZE", "100"))
self.weight_decay = 0.0001
self.momentum = 0.9
self.learning_rate = 0.1
if jt.flags.amp_reg:
self.learning_rate = 0.01
# mnist dataset
self.train_loader = MNIST(train=True, transform=trans.Resize(224)) \
.set_attrs(batch_size=self.batch_size, shuffle=True)
self.train_loader.num_workers = 4
loss_list=[]
acc_list=[]
mnist_net = MnistNet()
@ -70,6 +71,7 @@ class TestResnet(unittest.TestCase):
for data, target in self.train_loader:
batch_id = self.train_loader.batch_id
epoch_id = self.train_loader.epoch_id
data = data.float_auto()
# train step
# with jt.log_capture_scope(
@ -120,6 +122,8 @@ class TestResnet(unittest.TestCase):
# Train Epoch: 0 [40/100 (40%)] Loss: 2.286762 Acc: 0.130000
# Train Epoch: 0 [50/100 (50%)] Loss: 2.055014 Acc: 0.290000
if jt.flags.amp_reg:
continue
if jt.in_mpi:
assert jt.core.number_of_lived_vars() < 8100, jt.core.number_of_lived_vars()
else:
@ -131,5 +135,14 @@ class TestResnet(unittest.TestCase):
assert np.mean(loss_list[-50:])<0.5
assert np.mean(acc_list[-50:])>0.8
@unittest.skipIf(skip_this_test, "skip_this_test")
class TestResnetFp16(TestResnetFp32):
def setUp(self):
jt.flags.auto_mixed_precision_level = 5
def tearDown(self):
jt.flags.auto_mixed_precision_level = 0
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,121 @@
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Maintainers: Dun Liang <randonlang@gmail.com>.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import unittest
import jittor as jt
import numpy as np
import os
from jittor.test.misc import superglue
from jittor.test.misc.superglue import SuperGlue
import time
@jt.flag_scope(use_cuda=1)
def main():
global superglue
superglue.split_size = int(os.environ.get("split_size", "12"))
# superglue.split_size = 1000000
batch = 30
num = 2000
dim = 128
# jt.display_memory_info()
# os.system("nvidia-smi")
# breakpoint()
with jt.no_grad():
config = {
'superglue': {
'sinkhorn_iterations': 25,
'match_threshold': 0.01,
'keypoint_position_dim': 2,
'descriptor_dim': dim,
'use_dual_softmax': True,
'GNN_layers': ['self', 'cross'] * 9,
}
}
superglue = SuperGlue(config.get('superglue', {}))
superglue.eval()
data = {
'keypoints0': jt.rand((batch, num, 2), dtype=jt.float),
'keypoints1': jt.rand((batch, num, 2), dtype=jt.float),
'shape0': jt.rand((batch, 2), dtype=jt.float),
'shape1': jt.rand((batch, 2), dtype=jt.float),
'descriptors0': jt.rand((batch, dim, num), dtype=jt.float),
'descriptors1': jt.rand((batch, dim, num), dtype=jt.float),
'scores0': jt.rand((batch, num), dtype=jt.float),
'scores1': jt.rand((batch, num), dtype=jt.float),
'all_matches': jt.randint(0, num, (batch, num, 2), dtype=jt.int),
'return_match': False,
# 'match_num': match_num
}
use_fp16 = int(os.environ.get("use_fp16", "0"))
if use_fp16:
jt.flags.amp_reg = 2
for k,v in data.items():
if isinstance(v, jt.Var) and v.dtype == "float32":
v.assign(v.float16())
for v in superglue.parameters():
if v.dtype == "float32":
v.assign(v.float16())
jt.sync_all(True)
import pickle
jt.sync_all(True)
for x in range(5):
print(x)
jt.gc()
x = superglue(data)['loss']
x.sync()
jt.display_memory_info()
# os.system("nvidia-smi")
# breakpoint()
# print(data)
# print(x)
# with open("/tmp/record.pkl", "wb") as f:
# pickle.dump([data, x], f, pickle.HIGHEST_PROTOCOL)
# with jt.flag_scope(trace_py_var=3, profile_memory_enable=1):
# x = superglue(data)['loss']
# x.sync()
# jt.get_max_memory_treemap()
# exit(0)
jt.sync_all(True)
time0 = time.time()
jt.flags.profiler_enable = int(os.environ.get("profiler", "0"))
for x in range(20):
print(x)
# jt.display_memory_info()
x = superglue(data)['loss']
x.sync()
# print(x)
jt.sync_all(True)
time1 = time.time()
print("avg time:", (time1 - time0) / 20)
return (time1 - time0) / 20
class TestSuperglue(unittest.TestCase):
def test(self):
if not jt.has_cuda: return
t1 = main()
os.environ["use_fp16"] = "1"
t2 = main()
os.environ["use_fp16"] = "0"
assert t1*0.55 > t2
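# i.e. the fp16 run must take less than 55% of the fp32 time, a speedup of
# roughly 1.8x or better.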
if __name__ == "__main__":
unittest.main()

View File

@ -17,7 +17,8 @@ def check(op, *args):
x = convert(x)
y = convert(y)
# str match nan and inf
assert x.dtype == y.dtype and x.shape == y.shape
assert x.dtype == y.dtype and x.shape == y.shape, \
(x.dtype, y.dtype, x.shape, y.shape)
for a,b in zip(x.flatten(), y.flatten()):
assert str(a)[:5] == str(b)[:5], (a,b)
@ -32,9 +33,10 @@ class TestUnaryOp(unittest.TestCase):
check("logical_not", a)
check("bitwise_not", a)
b = np.array([1.1, 2.2, 3.3, 4.4, -1, 0])
check("log", a.astype("float32"))
check("exp", a.astype("float32"))
check("sqrt", a.astype("float32"))
type = "float16" if (jt.flags.amp_reg & 2) else "float32"
check("log", a.astype(type))
check("exp", a.astype(type))
check("sqrt", a.astype(type))
def test_grad(self):
ops = ["abs", "negative", "log", "exp", "sqrt",
@ -60,7 +62,8 @@ class TestUnaryOp(unittest.TestCase):
ja = jt.array(b)
jb = eval(f"jt.{op}(ja)")
jda = jt.grad(jb, ja)
assert (np.allclose(jda.data, da)), (jda.data,da,op)
tol = 1e-2 if jt.flags.amp_reg & 2 else 1e-6
assert (np.allclose(jda.data, da, atol=tol, rtol=tol)), (jda.data,da,op)
def test_sigmoid(self):
a = np.arange(-150,150, 10).astype("float32")
@ -92,11 +95,26 @@ class TestUnaryOp(unittest.TestCase):
np.testing.assert_allclose(y.data, y2.data)
d = jt.grad(x2, y2)
_, (dn,) = ngrad(lambda y: special.erfinv(y).sum(), [y], 1e-8)
np.testing.assert_allclose(d.data, dn, atol=1e-6, rtol=1e-6)
tol = 1e-3 if jt.flags.amp_reg & 2 else 1e-6
np.testing.assert_allclose(d.data, dn, atol=tol, rtol=tol)
class TestUnaryOpCuda(TestUnaryOp, test_cuda(2)):
pass
class TestUnaryOpCpuFp16(TestUnaryOp):
def setUp(self):
jt.flags.amp_reg = 2 | 4 | 8 | 16
def tearDown(self):
jt.flags.amp_reg = 0
class TestUnaryOpCudaFp16(TestUnaryOp, test_cuda(2)):
def setUp(self):
jt.flags.amp_reg = 2 | 4 | 8 | 16
jt.flags.use_cuda = 1
def tearDown(self):
jt.flags.amp_reg = 0
jt.flags.use_cuda = 0
if __name__ == "__main__":
unittest.main()

View File

@ -15,8 +15,8 @@ def find_jittor_path():
return path[:-len(suffix)] + ".."
def find_cache_path():
from pathlib import Path
path = str(Path.home())
import jittor_utils as jit_utils
path = jit_utils.home()
dirs = [".cache", "jittor"]
for d in dirs:
path = os.path.join(path, d)

Binary file not shown.

Some files were not shown because too many files have changed in this diff.