Merge branch 'master' into fft

This commit is contained in:
cxjyxx_me 2022-03-26 23:30:35 -04:00
commit b642b8f1d1
106 changed files with 3949 additions and 710 deletions


@ -9,7 +9,7 @@
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
__version__ = '1.3.1.46'
__version__ = '1.3.1.55'
from jittor_utils import lock
with lock.lock_scope():
ori_int = int
@ -91,7 +91,7 @@ def safeunpickle(path):
import torch
except:
raise RuntimeError("pytorch need to be installed when load pth format.")
model_dict = torch.load(path, map_location=torch.device('cpu'))
model_dict = torch.load(path, map_location='cpu')
try:
for k, v in model_dict.items():
try:
@ -231,19 +231,19 @@ class profile_scope(_call_no_record_scope):
def __enter__(self):
assert not flags.profiler_enable
profiler.start(self.warmup, self.rerun)
self.report = []
try:
self.fs.__enter__()
profiler.start(self.warmup, self.rerun)
return self.report
except:
profiler.stop()
raise
def __exit__(self, *exc):
self.fs.__exit__(*exc)
profiler.stop()
self.report.extend(profiler.report())
self.fs.__exit__(*exc)
class __single_process_scope:
def __init__(self, rank=0):
@ -304,15 +304,52 @@ Var.cast = Var.cast
def array(data, dtype=None):
if isinstance(data, core.Var):
if dtype is None:
return data.clone()
return cast(data, dtype)
if dtype is not None:
ret = data.clone()
else:
ret = cast(data, dtype)
elif dtype is not None:
if isinstance(dtype, NanoString):
dtype = str(dtype)
elif callable(dtype):
dtype = dtype.__name__
return ops.array(np.array(data, dtype))
return ops.array(data)
ret = ops.array(np.array(data, dtype))
else:
ret = ops.array(data)
# TODO: move those code to core
amp_reg = jt.flags.amp_reg
if amp_reg and ret.numel() != 1 and ret.dtype.is_float():
if amp_reg & 16:
if amp_reg & 1:
if ret.dtype != "float32":
return ret.float32()
elif amp_reg & 2:
if ret.dtype != "float16":
return ret.float16()
return ret
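A minimal sketch of how the amp_reg branch above is expected to behave, assuming a default build (flag bits per the flag documentation: 1 = prefer fp32, 2 = prefer fp16, 16 = apply to array-like ops as well):
import jittor as jt
jt.flags.amp_reg = 16 | 2        # array-like ops prefer fp16
x = jt.array([1.0, 2.0, 3.0])    # multi-element float input
print(x.dtype)                   # expected: float16
jt.flags.amp_reg = 0             # restore the default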
def random(shape, dtype="float32", type="uniform"):
# TODO: move those code to core
if dtype == "float16":
# TODO: make curand support fp16
ret = ops.random(shape, "float32", type).float16()
else:
ret = ops.random(shape, dtype, type)
amp_reg = jt.flags.amp_reg
if amp_reg:
if amp_reg & 16:
if amp_reg & 1:
if ret.dtype != "float32":
return ret.float32()
elif amp_reg & 2:
if ret.dtype != "float16":
return ret.float16()
return ret
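A short usage sketch of the fp16 branch above: since curand is assumed not to support fp16 yet, values are sampled in fp32 and cast afterwards.
import jittor as jt
r = jt.random([3, 3], dtype="float16")   # sampled as float32 internally, then cast
print(r.dtype)                           # expected: float16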
def float_auto(x):
if jt.flags.amp_reg & 2:
return x.float16()
return x.float32()
Var.float_auto = float_auto
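A minimal sketch of float_auto under the amp_reg flag, assuming no other amp flags are set:
import jittor as jt
x = jt.random([4, 4])          # float32 by default
jt.flags.amp_reg = 0
print(x.float_auto().dtype)    # float32: bit 1 not set
jt.flags.amp_reg = 2           # bit 1: prefer fp16
print(x.float_auto().dtype)    # expected: float16
jt.flags.amp_reg = 0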
def array64(data, dtype=None):
with jt.flag_scope(auto_convert_64_to_32=0):
@ -920,6 +957,14 @@ class Module:
self.dfs([], "", callback, callback_leave)
return ms
@property
def _modules(self):
return { k:v for k,v in self.__dict__.items() if isinstance(v, Module) }
@property
def _parameters(self):
return { k:v for k,v in self.__dict__.items() if isinstance(v, Var) }
def requires_grad_(self, requires_grad=True):
self._requires_grad = requires_grad
self._place_hooker()
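A small sketch of the new _modules and _parameters properties; the Net class here is just an illustration:
import jittor as jt
from jittor import nn

class Net(nn.Module):
    def __init__(self):
        self.fc = nn.Linear(3, 2)      # a sub-module attribute
        self.bias = jt.zeros(2)        # a plain Var attribute
    def execute(self, x):
        return self.fc(x) + self.bias

net = Net()
print(list(net._modules.keys()))       # ['fc']
print(list(net._parameters.keys()))    # ['bias']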
@ -1187,6 +1232,33 @@ Arguments of hook are defined as::
def __getattr__(self, key):
return object.__getattribute__(self, key)
def float64(self):
'''convert all parameters to float64'''
for p in self.parameters():
if p.dtype.is_float():
p.assign(p.float64())
return self
def float16(self):
'''convert all parameters to float16'''
for p in self.parameters():
if p.dtype.is_float():
p.assign(p.float16())
return self
def half(self):
'''convert all parameters to float16'''
return self.float16()
def float_auto(self):
'''convert all parameters to float16 or float32 automatically
by jt.flags.auto_mixed_precision_level and jt.flags.amp_reg'''
for p in self.parameters():
if p.dtype.is_float():
p.assign(p.float_auto())
return self
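A usage sketch of the precision helpers above (half is an alias of float16, and float_auto follows jt.flags.amp_reg); the in-place cast via assign is the assumed behavior:
import jittor as jt
from jittor import nn

model = nn.Linear(8, 4)
model.half()                 # same as model.float16(): casts float parameters in place
print(model.weight.dtype)    # expected: float16
jt.flags.amp_reg = 2         # prefer fp16
model.float_auto()           # parameters stay float16 under this flag
jt.flags.amp_reg = 0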
class Function(Module):
''' Function Module for customized backward operations
@ -1417,18 +1489,15 @@ Var.size = size
def to_int(v):
dtype = str(v.dtype)
assert dtype.startswith("int")
assert v.dtype.is_int()
return v.item()
def to_float(v):
dtype = str(v.dtype)
assert dtype.startswith("float")
assert v.dtype.is_float()
return v.item()
def to_bool(v):
dtype = str(v.dtype)
assert dtype.startswith("int") or dtype=="bool"
assert v.dtype.is_int() or v.dtype.is_bool()
return ori_bool(v.item())
Var.__int__ = to_int
@ -1450,6 +1519,8 @@ float = float32
Var.float = Var.float32
double = float64
Var.double = Var.float64
half = float16
Var.half = Var.float16
def is_var(v):
return isinstance(v, Var)


@ -5,7 +5,7 @@ from . import attention as attention, contrib as contrib, dataset as dataset, in
from .compile_extern import cublas as cublas, cudnn as cudnn, curand as curand, cufft as cufft, mkl_ops as mkl_ops, mpi_ops as mpi_ops, world_size as world_size
from .compiler import compile_custom_op as compile_custom_op, compile_custom_ops as compile_custom_ops
from .contrib import concat as concat
from .nn import matmul as matmul
from .nn import bmm as bmm, bmm_transpose as bmm_transpose, matmul as matmul
from collections import OrderedDict as OrderedDict
from collections.abc import Mapping as Mapping
from typing import Any
@ -64,6 +64,8 @@ def clean() -> None: ...
cast = unary
def array(data, dtype: Any | None = ...): ...
def random(shape, dtype: str = ..., type: str = ...): ...
def float_auto(x): ...
def array64(data, dtype: Any | None = ...): ...
def grad(loss, targets): ...
def liveness_info(): ...
@ -85,7 +87,6 @@ origin_transpose = transpose
def transpose(x, *dim): ...
permute = transpose
def flatten(input, start_dim: int = ..., end_dim: int = ...): ...
def start_grad(x): ...
def detach(x): ...
def unsqueeze(x, dim): ...
def squeeze(x, dim): ...
@ -149,6 +150,11 @@ class Module:
is_train: bool
def is_training(self) -> bool: ...
def mpi_param_broadcast(self, root: int = ...) -> None: ...
def __setattr__(self, key, value) -> None: ...
def __getattr__(self, key): ...
def float16(self) -> None: ...
def half(self) -> None: ...
def float_auto(self) -> None: ...
class Function(Module):
input_mask: Any
@ -187,6 +193,7 @@ def to_float(v): ...
def to_bool(v): ...
def format(v, spec): ...
def get_len(var): ...
half = float16
def is_var(v): ...
from typing import List, Tuple, Callable, overload
@ -374,7 +381,7 @@ def index(shape: Tuple[int], dim: int, dtype: str="int32")-> Var:
# output: [[0,1],[0,1]]'''
...
@overload
def index(shape: Tuple[int], dtype: str="int32")-> List[Var]:
def index(shape: Tuple[int], dtype: str="int32"):
'''Document:
*
Index Operator generate index of shape.
@ -428,7 +435,7 @@ def index(a: Var, dim: int, dtype: str="int32")-> Var:
# output: [[0,1],[0,1]]'''
...
@overload
def index(a: Var, dtype: str="int32")-> List[Var]:
def index(a: Var, dtype: str="int32"):
'''Document:
*
Index Operator generate index of shape.
@ -461,7 +468,7 @@ def index_var(a: Var, dim: int, dtype: str="int32")-> Var:
jt.index_var(a, 1) similar with jt.index(a.shape, 1)'''
...
@overload
def index_var(a: Var, dtype: str="int32")-> List[Var]:
def index_var(a: Var, dtype: str="int32"):
'''Document:
* shape dependency version of index op
jt.index_var(a, 1) similar with jt.index(a.shape, 1)'''
@ -824,7 +831,7 @@ def bitwise_xor(x: Var, y: Var)-> Var:
...
def tape(x: Var)-> Var:
...
def where(cond: Var, dtype: str="int32")-> List[Var]:
def where(cond: Var, dtype: str="int32"):
'''Document:
*
Where Operator generate index of true condition.
@ -838,9 +845,9 @@ def where(cond: Var, dtype: str="int32")-> List[Var]:
Example::
jt.where([[0,0,1],[1,0,0]])
# return ( [0,2], [1,0] )'''
# return [jt.Var([0 1], dtype=int32), jt.Var([2 0], dtype=int32)]'''
...
def argsort(x: Var, dim: int=-1, descending: bool=False, dtype: str="int32")-> List[Var]:
def argsort(x: Var, dim: int=-1, descending: bool=False, dtype: str="int32"):
'''Document:
*
Argsort Operator Perform an indirect sort by given key or compare function.
@ -883,7 +890,7 @@ def argsort(x: Var, dim: int=-1, descending: bool=False, dtype: str="int32")-> L
...
def fetch(inputs: List[Var], func: Callable)-> Var:
...
def arg_reduce(x: Var, op: str, dim: int, keepdims: bool)-> List[Var]:
def arg_reduce(x: Var, op: str, dim: int, keepdims: bool):
'''Document:
*
Returns the indices of the maximum / minimum of the input across a dimension.
@ -908,7 +915,7 @@ def arg_reduce(x: Var, op: str, dim: int, keepdims: bool)-> List[Var]:
>>> jt.arg_reduce(x, 'max', dim=1, keepdims=False)
[jt.Var([2 1], dtype=int32), jt.Var([5 7], dtype=int32)]
>>> jt.arg_reduce(x, 'min', dim=1, keepdims=False)
[jt.Var([1 2], dtype=int32), jt.Var([5 7], dtype=int32)]'''
[jt.Var([1 2], dtype=int32), jt.Var([2 1], dtype=int32)]'''
...
def random(shape: Tuple[int], dtype: str="float32", type: str="uniform")-> Var:
...
@ -2278,6 +2285,8 @@ def uint32(x: Var)-> Var:
...
def uint64(x: Var)-> Var:
...
def float16(x: Var)-> Var:
...
def float32(x: Var)-> Var:
...
def float64(x: Var)-> Var:
@ -2870,6 +2879,8 @@ def erf(x: Var)-> Var:
>>> jt.erf(a)
jt.Var([ 0.51559156 0.45739546 -0.85728306 -0.9258883 ], dtype=float32)'''
...
def erfinv(x: Var)-> Var:
...
def transpose(x: Var, axes: Tuple[int]=())-> Var:
...
def fuse_transpose(x: Var, axes: Tuple[int]=())-> Var:
@ -3005,7 +3016,7 @@ def numpy_code(shape: Tuple[int], dtype: str, inputs: List[Var], forward: Callab
)'''
...
@overload
def numpy_code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var], forward: Callable, backward: List[Callable])-> List[Var]:
def numpy_code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var], forward: Callable, backward: List[Callable]):
'''Document:
*
Numpy Code Operator for easily customized op.
@ -3151,7 +3162,7 @@ def numpy_code(shape: Tuple[int], dtype: str, inputs: List[Var], forward: Callab
)'''
...
@overload
def numpy_code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var], forward: Callable)-> List[Var]:
def numpy_code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var], forward: Callable):
'''Document:
*
Numpy Code Operator for easily customized op.
@ -3345,6 +3356,23 @@ def code(shape: Tuple[int], dtype: str, inputs: List[Var]={}, cpu_src: str="", c
assert (b.data == [5,3,1]).all()
assert (c.data == [-4,-2]).all()
Example-5::
# This example shows how to customize code op
# compilation flags, such as adding an include search
# path, adding definitions, or any other command line options
a = jt.random([10])
b = jt.code(a.shape, a.dtype, [a],
cpu_src="""
@out0(0) = HAHAHA;
""")
# HAHAHA is defined in flags below
# /any/include/path can be changed to any path you want to include
b.compile_options = {"FLAGS: -DHAHAHA=233 -I/any/include/path ": 1}
print(b[0])
# will output 233
CUDA Example-1::
@ -3435,7 +3463,7 @@ def code(shape: Tuple[int], dtype: str, inputs: List[Var]={}, cpu_src: str="", c
print(jt.grad(c, [a, b]))'''
...
@overload
def code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var]={}, cpu_src: str="", cpu_grad_src: List[str]={}, cpu_header: str="", cuda_src: str="", cuda_grad_src: List[str]={}, cuda_header: str="")-> List[Var]:
def code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var]={}, cpu_src: str="", cpu_grad_src: List[str]={}, cpu_header: str="", cuda_src: str="", cuda_grad_src: List[str]={}, cuda_header: str=""):
'''Document:
*
Code Operator for easily customized op.
@ -3556,6 +3584,23 @@ def code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var]={}, cpu_
assert (b.data == [5,3,1]).all()
assert (c.data == [-4,-2]).all()
Example-5::
# This example shows how to customize code op
# compilation flags, such as adding an include search
# path, adding definitions, or any other command line options
a = jt.random([10])
b = jt.code(a.shape, a.dtype, [a],
cpu_src="""
@out0(0) = HAHAHA;
""")
# HAHAHA is defined in flags below
# /any/include/path can be changed to any path you want to include
b.compile_options = {"FLAGS: -DHAHAHA=233 -I/any/include/path ": 1}
print(b[0])
# will output 233
CUDA Example-1::
@ -3646,7 +3691,7 @@ def code(shapes: List[Tuple[int]], dtypes: List[str], inputs: List[Var]={}, cpu_
print(jt.grad(c, [a, b]))'''
...
@overload
def code(inputs: List[Var], outputs: List[Var], cpu_src: str="", cpu_grad_src: List[str]={}, cpu_header: str="", cuda_src: str="", cuda_grad_src: List[str]={}, cuda_header: str="")-> List[Var]:
def code(inputs: List[Var], outputs: List[Var], cpu_src: str="", cpu_grad_src: List[str]={}, cpu_header: str="", cuda_src: str="", cuda_grad_src: List[str]={}, cuda_header: str=""):
'''Document:
*
Code Operator for easily customized op.
@ -3767,6 +3812,23 @@ def code(inputs: List[Var], outputs: List[Var], cpu_src: str="", cpu_grad_src: L
assert (b.data == [5,3,1]).all()
assert (c.data == [-4,-2]).all()
Example-5::
# This example shows how to customize code op
# compilation flags, such as adding an include search
# path, adding definitions, or any other command line options
a = jt.random([10])
b = jt.code(a.shape, a.dtype, [a],
cpu_src="""
@out0(0) = HAHAHA;
""")
# HAHAHA is defined in flags below
# /any/include/path can be changed to any path you want to include
b.compile_options = {"FLAGS: -DHAHAHA=233 -I/any/include/path ": 1}
print(b[0])
# will output 233
CUDA Example-1::
@ -4239,7 +4301,7 @@ class Var:
# output: [[0,1],[0,1]]'''
...
@overload
def index(self, dtype: str="int32")-> List[Var]:
def index(self, dtype: str="int32"):
'''Document:
*
Index Operator generate index of shape.
@ -4272,7 +4334,7 @@ class Var:
jt.index_var(a, 1) similar with jt.index(a.shape, 1)'''
...
@overload
def index_var(self, dtype: str="int32")-> List[Var]:
def index_var(self, dtype: str="int32"):
'''Document:
* shape dependency version of index op
jt.index_var(a, 1) similar with jt.index(a.shape, 1)'''
@ -4633,7 +4695,7 @@ class Var:
* [in] y: the second input, jt.Var (integal or boolean).'''
...
def tape(self)-> Var: ...
def where(self, dtype: str="int32")-> List[Var]:
def where(self, dtype: str="int32"):
'''Document:
*
Where Operator generate index of true condition.
@ -4647,9 +4709,9 @@ class Var:
Example::
jt.where([[0,0,1],[1,0,0]])
# return ( [0,2], [1,0] )'''
# return [jt.Var([0 1], dtype=int32), jt.Var([2 0], dtype=int32)]'''
...
def argsort(self, dim: int=-1, descending: bool=False, dtype: str="int32")-> List[Var]:
def argsort(self, dim: int=-1, descending: bool=False, dtype: str="int32"):
'''Document:
*
Argsort Operator Perform an indirect sort by given key or compare function.
@ -4691,7 +4753,7 @@ class Var:
# return [[0 1 0],[1 0 1]], [[11 11 12],[12 13 13]]'''
...
def fetch(self, func: Callable)-> Var: ...
def arg_reduce(self, op: str, dim: int, keepdims: bool)-> List[Var]:
def arg_reduce(self, op: str, dim: int, keepdims: bool):
'''Document:
*
Returns the indices of the maximum / minimum of the input across a dimension.
@ -6059,6 +6121,7 @@ class Var:
def uint16(self)-> Var: ...
def uint32(self)-> Var: ...
def uint64(self)-> Var: ...
def float16(self)-> Var: ...
def float32(self)-> Var: ...
def float64(self)-> Var: ...
def abs(self)-> Var:
@ -6649,6 +6712,7 @@ class Var:
>>> jt.erf(a)
jt.Var([ 0.51559156 0.45739546 -0.85728306 -0.9258883 ], dtype=float32)'''
...
def erfinv(self)-> Var: ...
def transpose(self, axes: Tuple[int]=())-> Var: ...
def fuse_transpose(self, axes: Tuple[int]=())-> Var: ...
def safe_clip(self, left: float, right: float)-> Var:
@ -6705,7 +6769,7 @@ class Var:
# x[y[0], 1] <= x[y[1], 1] and x[y[1], 1] <= x[y[2], 1] and ... and x[y[m-2], 1] <= x[y[m-1], 1]'''
...
@overload
def code(self, outputs: List[Var], cpu_src: str="", cpu_grad_src: List[str]={}, cpu_header: str="", cuda_src: str="", cuda_grad_src: List[str]={}, cuda_header: str="")-> List[Var]:
def code(self, outputs: List[Var], cpu_src: str="", cpu_grad_src: List[str]={}, cpu_header: str="", cuda_src: str="", cuda_grad_src: List[str]={}, cuda_header: str=""):
'''Document:
*
Code Operator for easily customized op.
@ -6826,6 +6890,23 @@ class Var:
assert (b.data == [5,3,1]).all()
assert (c.data == [-4,-2]).all()
Example-5::
# This example shows how to customize code op
# compilation flags, such as adding an include search
# path, adding definitions, or any other command line options
a = jt.random([10])
b = jt.code(a.shape, a.dtype, [a],
cpu_src="""
@out0(0) = HAHAHA;
""")
# HAHAHA is defined in flags below
# /any/include/path can be changed to any path you want to include
b.compile_options = {"FLAGS: -DHAHAHA=233 -I/any/include/path ": 1}
print(b[0])
# will output 233
CUDA Example-1::
@ -7177,6 +7258,11 @@ class Var:
*
* return True if operator fusion is stopped.'''
...
def start_grad(self)-> Var:
'''Document:
*
* enable the gradient calculation for the Var.'''
...
def item(self)-> float | int | bool:
'''Document:
*
@ -7266,7 +7352,13 @@ class Var:
[1 8 1 1 2 2]], dtype=int32)'''
...
def permute(self, x: Var, axes: Tuple[int]=())-> Var: ...
def detach_inplace(self)-> Var:
'''Document:
*
* enable the gradient calculation for the Var.'''
...
def astype(self, x: Var, op: str)-> Var: ...
def half(self, x: Var)-> Var: ...
def expand_as(self, x: Var, y: Var, dims: Tuple[int]=())-> Var:
'''Document:
*
@ -7310,8 +7402,12 @@ class Flags:
'''A set of flags to configure jittor running behaviors'''
addr2line_path: str
'''Path of addr2line. Default: ""'''
amp_reg: int
'''Auto mixed-precision control registers, bit 0: prefer 32; bit 1: prefer 16; bit 2: keep reduce type; bit 3: keep white list type; bit 4: array-like ops prefer it too. Default: 0'''
auto_convert_64_to_32: int
'''auto convert 64bit numpy array into 32bit jittor array. Default: 1'''
auto_mixed_precision_level: int
'''Auto mixed-precision optimization level, 0: do not use fp16; 1-3: preserve levels, do not use fp16 for now; 4: prefer fp16, but some ops use fp32, e.g. sum, exp; 5: similar to 4, and array ops automatically convert to fp16; 6: all ops prefer fp16. Default: 0'''
cache_path: str
'''Cache path of jittor. Default: ""'''
cc_flags: str
@ -7324,10 +7420,12 @@ class Flags:
'''Unify graph sanity check. Default: 0'''
compile_options: Any
'''Override the default loop transform options. Default: {}'''
disable_lock: bool
'''Disable file lock. Default: 0'''
enable_tuner: int
'''Enable tuner. Default: 1'''
exclude_pass: str
'''Don't run certian pass. Default: ""'''
'''Don't run certain pass. Default: ""'''
extra_gdb_cmd: str
'''Extra commands passed to GDB, separated by ";". Default: ""'''
gdb_attach: int
@ -7352,6 +7450,8 @@ class Flags:
'''Default enabled; if disabled, use immediate eager execution rather than lazy execution. This flag makes error messages and traceback information better, but it raises memory consumption and lowers performance. Default: 1'''
log_file: str
'''log to file, mpi env will add $OMPI_COMM_WORLD_RANK suffix. Default: ""'''
log_op_hash: str
'''Output compiler pass result of certain hash of op. Default: ""'''
log_silent: int
'''The log will be completely silent. Default: 0'''
log_sync: int
@ -7376,6 +7476,10 @@ class Flags:
'''Enable profiler. Default: 0'''
profiler_hide_relay: int
'''Profiler hide relayed op. Default: 0'''
profiler_record_peek: int
'''Profiler record peek mem bandwidth. Default: 0'''
profiler_record_shape: int
'''Profiler record shape for op. Default: 0'''
profiler_rerun: int
'''Profiler rerun. Default: 0'''
profiler_warmup: int
@ -7402,8 +7506,12 @@ class Flags:
'''If not overflow, try to use 32 bit type as index type. Default: 0'''
update_queue_auto_flush_delay: int
'''When the size of an update queue is greater than this value, the update queue triggers an auto flush. Default: 2'''
use_acl: int
'''Use the ACL (Ascend) backend or not; alias of use_cuda. 1 for trying to use it, 2 for forcing it. Default: 0'''
use_cuda: int
'''Use cuda or not. 1 for trying to use cuda, 2 for forcing to use cuda. Default: 0'''
use_device: int
'''Select the accelerator device; alias of use_cuda (and use_acl). 1 for trying to use the device, 2 for forcing it. Default: 0'''
use_nfef_allocator: int
'''Enable never free exact fit allocator. Default: 0'''
use_parallel_op_compiler: int
@ -7414,5 +7522,7 @@ class Flags:
'''Enable stat allocator. Default: 0'''
use_temp_allocator: int
'''Enable temp allocator. Default: 1'''
use_tensorcore: int
'''use tensor core. Default: 0'''
flags: Flags
'''Jittor running time flags instance'''
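A hedged sketch combining the flags documented above (amp_reg, use_tensorcore, and the use_device/use_acl aliases of use_cuda); the float16 result is the expected outcome, not a guarantee on every backend:
import jittor as jt

jt.flags.use_cuda = jt.has_cuda       # use_device / use_acl alias the same value
jt.flags.use_tensorcore = 1           # allow tensor-core paths in cuBLAS/cuDNN
jt.flags.amp_reg = 2                  # bit 1: prefer float16
a = jt.random([16, 16]).float_auto()
b = jt.random([16, 16]).float_auto()
print(jt.matmul(a, b).dtype)          # expected: float16
jt.flags.amp_reg = 0
jt.flags.use_tensorcore = 0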


@ -9,6 +9,7 @@ import platform
from .compiler import *
from jittor_utils import run_cmd, get_version, get_int_version
from jittor_utils.misc import download_url_to_local
import jittor_utils as jit_utils
def search_file(dirs, name, prefer_version=()):
if os.name == 'nt':
@ -110,8 +111,7 @@ def setup_mkl():
LOG.v("setup mkl...")
# mkl_path = os.path.join(cache_path, "mkl")
# mkl_path decouple with cc_path
from pathlib import Path
mkl_path = os.path.join(str(Path.home()), ".cache", "jittor", "mkl")
mkl_path = os.path.join(jit_utils.home(), ".cache", "jittor", "mkl")
make_cache_dir(mkl_path)
install_mkl(mkl_path)
@ -141,12 +141,12 @@ def setup_mkl():
elif platform.system() == 'Darwin':
mkl_lib_paths = [
"/usr/local/lib/libmkldnn.dylib", # x86_64
"/opt/homebrew/lib/libmkldnn.dylib", # arm64
"/usr/local/lib/libdnnl.dylib", # x86_64
"/opt/homebrew/lib/libdnnl.dylib", # arm64
]
if not any([os.path.exists(lib) for lib in mkl_lib_paths]):
raise RuntimeError("onednn not found, please install it with the command 'brew install onednn'")
extra_flags = f" -lmkldnn "
extra_flags = f" -ldnnl "
mkl_op_dir = os.path.join(jittor_path, "extern", "mkl", "ops")
mkl_op_files = [os.path.join(mkl_op_dir, name) for name in os.listdir(mkl_op_dir)]
@ -178,8 +178,7 @@ def install_cub(root_folder):
def setup_cub():
global cub_home
cub_home = ""
from pathlib import Path
cub_path = os.path.join(str(Path.home()), ".cache", "jittor", "cub")
cub_path = os.path.join(jit_utils.home(), ".cache", "jittor", "cub")
cuda_version = int(get_version(nvcc_path)[1:-1].split('.')[0])
extra_flags = ""
if cuda_version < 11:
@ -221,6 +220,12 @@ def setup_cuda_extern():
LOG.w(f"CUDA found but cub is not loaded:\n{line}")
libs = ["cublas", "cudnn", "curand", "cufft"]
# in cuda 11.4, module memory consumptions:
# default context: 259 MB
# cublas: 340 MB
# cudnn: 340 MB
if int(os.environ.get("conv_opt", "0")):
libs = ["cublas", "curand"]
for lib_name in libs:
try:
setup_cuda_lib(lib_name, extra_flags=link_cuda_extern)
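As a sketch, the conv_opt switch above is an ordinary environment variable read at import time, so it has to be set before jittor is imported:
import os
os.environ["conv_opt"] = "1"   # only set up cublas/curand, skip cudnn/cufft
import jittor as jt            # must come after setting the variable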
@ -320,7 +325,11 @@ def install_cutt(root_folder):
if md5 != true_md5:
os.remove(fullname)
shutil.rmtree(dirname)
if not os.path.isfile(os.path.join(cache_path, "libcutt"+so)):
CUTT_PATH = os.environ.get("CUTT_PATH", "")
if not os.path.isfile(os.path.join(cache_path, "libcutt"+so)) or CUTT_PATH:
if CUTT_PATH:
dirname = CUTT_PATH
else:
LOG.i("Downloading cutt...")
download_url_to_local(url, filename, root_folder, true_md5)
@ -335,7 +344,8 @@ def install_cutt(root_folder):
zf.close()
LOG.i("installing cutt...")
arch_flag = ""
# -Xptxas -dlcm=ca actually does not work
arch_flag = " -Xptxas -dlcm=ca "
if len(flags.cuda_archs):
arch_flag = f" -arch=compute_{min(flags.cuda_archs)} "
arch_flag += ''.join(map(lambda x:f' -code=sm_{x} ', flags.cuda_archs))
@ -365,8 +375,7 @@ def setup_cutt():
if cutt_lib_path is None or cutt_include_path is None:
LOG.v("setup cutt...")
# cutt_path decouple with cc_path
from pathlib import Path
cutt_path = os.path.join(str(Path.home()), ".cache", "jittor", "cutt")
cutt_path = os.path.join(jit_utils.home(), ".cache", "jittor", "cutt")
make_cache_dir(cutt_path)
install_cutt(cutt_path)
@ -442,8 +451,7 @@ def setup_nccl():
if nccl_lib_path is None or nccl_include_path is None:
LOG.v("setup nccl...")
# nccl_path decouple with cc_path
from pathlib import Path
nccl_path = os.path.join(str(Path.home()), ".cache", "jittor", "nccl")
nccl_path = os.path.join(jit_utils.home(), ".cache", "jittor", "nccl")
make_cache_dir(nccl_path)
nccl_home = install_nccl(nccl_path)


@ -19,7 +19,7 @@ from ctypes import cdll
from ctypes.util import find_library
import jittor_utils as jit_utils
from jittor_utils import LOG, run_cmd, cache_path, find_exe, cc_path, cc_type, cache_path
from jittor_utils import LOG, run_cmd, find_exe, cc_path, cc_type, cache_path
from . import pyjt_compiler
from jittor_utils import lock
from jittor_utils import install_cuda
@ -228,13 +228,20 @@ def gen_jit_flags():
continue
visit[name] = 1
jit_declares.append(f"DECLARE_FLAG({type}, {name});")
alias = []
if name == "use_cuda":
alias = ["use_device", "use_acl"]
elif name == "auto_mixed_precision_level":
alias = ["amp_level"]
get_names = ",".join(["__get__"+a for a in [name]+alias])
set_names = ",".join(["__set__"+a for a in [name]+alias])
flags_defs.append(f"""
/* {name}(type:{type}, default:{default}): {doc} */
// @pyjt(__get__{name})
// @pyjt({get_names})
{type} _get_{name}() {{ return {name}; }}
// @pyjt(__set__{name})
// @pyjt({set_names})
void _set_{name}({type} v) {{ set_{name}(v); }}
{f'''// @pyjt(__set__{name})
{f'''// @pyjt({set_names})
void _set_{name}(bool v) {{ set_{name}(v); }}
''' if type=="int" else ""}
""")
@ -843,7 +850,7 @@ def check_cuda():
# this nvcc is install by package manager
cuda_lib = "/usr/lib/x86_64-linux-gnu"
cuda_include2 = os.path.join(jittor_path, "extern","cuda","inc")
cc_flags += f" -DHAS_CUDA -I\"{cuda_include}\" -I\"{cuda_include2}\" "
cc_flags += f" -DHAS_CUDA -DIS_CUDA -I\"{cuda_include}\" -I\"{cuda_include2}\" "
if os.name == 'nt':
cuda_lib = os.path.abspath(os.path.join(cuda_dir, "..", "lib", "x64"))
# cc_flags += f" \"{cuda_lib}\\cudart.lib\" "
@ -1212,6 +1219,14 @@ if has_cuda:
return nvcc_flags
nvcc_flags = convert_nvcc_flags(nvcc_flags)
# from .acl_compiler import check_acl
from .extern.acl import acl_compiler
jit_utils.add_backend(acl_compiler)
for mod in jit_utils.backends:
if mod.check():
break
# build core
gen_jit_flags()
gen_jit_tests()
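A small sketch of the intended effect of the backend registration and flag-alias generation above (gen_jit_flags emits the use_device/use_acl/amp_level aliases): the aliased names read and write the same underlying flag.
import jittor as jt
assert jt.flags.use_device == jt.flags.use_cuda   # use_device / use_acl alias use_cuda
jt.flags.amp_level = 3                            # alias of auto_mixed_precision_level
assert jt.flags.auto_mixed_precision_level == 3
jt.flags.amp_level = 0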
@ -1236,6 +1251,8 @@ files4 = [ f[len(jittor_path)+1:] for f in files4 ]
# files4 = run_cmd('find -L src | grep '+grep_args, jittor_path).splitlines()
at_beginning = [
"src/ops/op_utils.cc",
"src/ops/op_register.cc",
"src/init.cc",
"src/event_queue.cc",
"src/mem/allocator/sfrl_allocator.cc",
"src/mem/allocator.cc",


@ -21,8 +21,9 @@ import signal
from jittor_utils import LOG
import jittor as jt
import time
import jittor_utils as jit_utils
dataset_root = os.path.join(pathlib.Path.home(), ".cache", "jittor", "dataset")
dataset_root = os.path.join(jit_utils.home(), ".cache", "jittor", "dataset")
mp_log_v = os.environ.get("mp_log_v", 0)
mpi = jt.mpi
img_open_hook = HookTimer(Image, "open")


@ -0,0 +1,107 @@
import jittor as jt
from jittor import nn
import numpy as np
# import pylab as pl
# length of the latent vector
latent_dim = 100
# number of classes
n_classes = 10
# image size
img_size = 32
# number of image channels
channels = 1
# shape of the image tensor
img_shape = (channels, img_size, img_size)
class Generator(nn.Module):
def __init__(self):
super(Generator, self).__init__()
self.label_emb = nn.Embedding(n_classes, n_classes)
def block(in_feat, out_feat, normalize=True):
layers = [nn.Linear(in_feat, out_feat)]
if normalize:
layers.append(nn.BatchNorm1d(out_feat, 0.8))
layers.append(nn.LeakyReLU(0.2))
return layers
self.model = nn.Sequential(
*block((latent_dim + n_classes), 128, normalize=False),
*block(128, 256),
*block(256, 512),
*block(512, 1024),
nn.Linear(1024, int(np.prod(img_shape))),
nn.Tanh())
def execute(self, noise, labels):
gen_input = jt.contrib.concat((self.label_emb(labels), noise), dim=1)
img = self.model(gen_input)
img = img.view((img.shape[0], *img_shape))
return img
class Discriminator(nn.Module):
def __init__(self):
super(Discriminator, self).__init__()
self.label_embedding = nn.Embedding(n_classes, n_classes)
self.model = nn.Sequential(
nn.Linear((n_classes + int(np.prod(img_shape))), 512),
nn.LeakyReLU(0.2),
nn.Linear(512, 512),
nn.Dropout(0.4),
nn.LeakyReLU(0.2),
nn.Linear(512, 512),
nn.Dropout(0.4),
nn.LeakyReLU(0.2),
nn.Linear(512, 1))
def execute(self, img, labels):
d_in = jt.contrib.concat((img.view((img.shape[0], (- 1))), self.label_embedding(labels)), dim=1)
validity = self.model(d_in)
return validity
# define the models
generator = Generator()
discriminator = Discriminator()
generator.eval()
discriminator.eval()
# load the pretrained parameters
generator.load('https://cg.cs.tsinghua.edu.cn/jittor/assets/build/generator_last.pkl')
discriminator.load('https://cg.cs.tsinghua.edu.cn/jittor/assets/build/discriminator_last.pkl')
def gen_img(number):
print(number, type(number))
n_row = len(number)
z = jt.array(np.random.normal(0, 1, (n_row, latent_dim))).float32().stop_grad()
labels = jt.array(np.array([int(number[num]) for num in range(n_row)])).float32().stop_grad()
gen_imgs = generator(z,labels)
gen_imgs = gen_imgs.transpose((1,2,0,3)).reshape(gen_imgs.shape[2], -1)
gen_imgs = gen_imgs[:,:,None].broadcast(gen_imgs.shape+(3,)) # .uint8()
gen_imgs = (gen_imgs - gen_imgs.min()) / (gen_imgs.max() - gen_imgs.min()) * 255
gen_imgs = gen_imgs.uint8()
# print(gen_imgs.shape, gen_imgs.max(), gen_imgs.min())
return gen_imgs.numpy()
# gen_imgs = gen_imgs.data.transpose((1,2,0,3))[0].reshape((gen_imgs.shape[2], -1))
# print(gen_imgs.shape)
return gen_imgs[:,:,None]
from PIL import Image
import pywebio as pw
# define a string of digits
number = "201962517"
# gen_img(number)
Image.fromarray(gen_img(number))
# pl.imshow()
# pl.show()
# print("done")
def web_server():
pw.pin.put_input("number", label="Enter the digits to generate (powered by Jittor)")
pw.output.put_buttons(['Gen image'],
lambda _: pw.output.put_image(Image.fromarray(gen_img(pw.pin.pin.number))))
pw.start_server(web_server, port=8123)


@ -0,0 +1,54 @@
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Maintainers: Dun Liang <randonlang@gmail.com>.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
has_acl = 0
cc_flags = ""
tikcc_path = env_or_try_find('tikcc_path', 'tikcc')
dlopen_flags = os.RTLD_NOW | os.RTLD_GLOBAL
def install():
import jittor.compiler as compiler
global has_acl, cc_flags
acl_compiler_home = os.path.dirname(__file__)
cc_files = sorted(glob.glob(acl_compiler_home+"/**/*.cc", recursive=True))
cc_flags += f" -DHAS_CUDA -DIS_ACL -I/usr/local/Ascend/runtime/include -I/usr/local/Ascend/driver/include -L/usr/local/Ascend/compiler/lib64 -L/usr/local/Ascend/runtime/lib64 -I{acl_compiler_home} -ltikc_runtime -lascendcl "
ctypes.CDLL("libascendcl.so", dlopen_flags)
jittor_utils.LOG.i("ACL detected")
mod = jittor_utils.compile_module('''
#include "common.h"
namespace jittor {
// @pyjt(process)
string process_acl(const string& src, const string& name, const map<string,string>& kargs);
}''', compiler.cc_flags + " " + " ".join(cc_files) + cc_flags)
jittor_utils.process_jittor_source("acl", mod.process)
has_acl = 1
def check():
import jittor.compiler as compiler
global has_acl, cc_flags
if tikcc_path:
try:
install()
except Exception as e:
jittor_utils.LOG.w(f"load ACL failed, exception: {e}")
has_acl = 0
compiler.has_acl = has_acl
compiler.tikcc_path = tikcc_path
if not has_acl: return False
compiler.cc_flags += cc_flags
compiler.nvcc_path = tikcc_path
compiler.nvcc_flags = compiler.cc_flags.replace("-std=c++14","")
return True
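A hedged sketch of enabling this backend from Python; the tikcc path below is an assumed install location and must point to a real Ascend toolchain for check() to succeed:
import os
os.environ["tikcc_path"] = "/usr/local/Ascend/bin/tikcc"   # assumed, adjust to your install
import jittor as jt
import jittor.compiler as compiler
if compiler.has_acl:           # set by check() above
    jt.flags.use_acl = 1       # alias of use_cuda, selects the ACL device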


@ -0,0 +1,228 @@
// ***************************************************************
// Copyright (c) 2021 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#include "common.h"
using std::string;
using std::unordered_map;
typedef int aclError;
static inline unordered_map<aclError,string> gen_map(string s) {
unordered_map<aclError,string> smap;
for (int i=0; i<s.size(); i++) {
if (s[i] == ';') {
int j=s.rfind(" ", i);
int code = std::stoi(s.substr(j+1, i-j-1));
int k = s.rfind(" ", j-1);
int l = s.rfind(" ACL_", k-1);
smap[code] = s.substr(l+1, k-l-1);
}
}
return smap;
}
string acl_error_to_string(aclError error) {
static unordered_map<aclError,string> acl_error_map = gen_map(R"(
// from acl_base.h
static const int ACL_ERROR_INVALID_PARAM = 100000;
static const int ACL_ERROR_UNINITIALIZE = 100001;
static const int ACL_ERROR_REPEAT_INITIALIZE = 100002;
static const int ACL_ERROR_INVALID_FILE = 100003;
static const int ACL_ERROR_WRITE_FILE = 100004;
static const int ACL_ERROR_INVALID_FILE_SIZE = 100005;
static const int ACL_ERROR_PARSE_FILE = 100006;
static const int ACL_ERROR_FILE_MISSING_ATTR = 100007;
static const int ACL_ERROR_FILE_ATTR_INVALID = 100008;
static const int ACL_ERROR_INVALID_DUMP_CONFIG = 100009;
static const int ACL_ERROR_INVALID_PROFILING_CONFIG = 100010;
static const int ACL_ERROR_INVALID_MODEL_ID = 100011;
static const int ACL_ERROR_DESERIALIZE_MODEL = 100012;
static const int ACL_ERROR_PARSE_MODEL = 100013;
static const int ACL_ERROR_READ_MODEL_FAILURE = 100014;
static const int ACL_ERROR_MODEL_SIZE_INVALID = 100015;
static const int ACL_ERROR_MODEL_MISSING_ATTR = 100016;
static const int ACL_ERROR_MODEL_INPUT_NOT_MATCH = 100017;
static const int ACL_ERROR_MODEL_OUTPUT_NOT_MATCH = 100018;
static const int ACL_ERROR_MODEL_NOT_DYNAMIC = 100019;
static const int ACL_ERROR_OP_TYPE_NOT_MATCH = 100020;
static const int ACL_ERROR_OP_INPUT_NOT_MATCH = 100021;
static const int ACL_ERROR_OP_OUTPUT_NOT_MATCH = 100022;
static const int ACL_ERROR_OP_ATTR_NOT_MATCH = 100023;
static const int ACL_ERROR_OP_NOT_FOUND = 100024;
static const int ACL_ERROR_OP_LOAD_FAILED = 100025;
static const int ACL_ERROR_UNSUPPORTED_DATA_TYPE = 100026;
static const int ACL_ERROR_FORMAT_NOT_MATCH = 100027;
static const int ACL_ERROR_BIN_SELECTOR_NOT_REGISTERED = 100028;
static const int ACL_ERROR_KERNEL_NOT_FOUND = 100029;
static const int ACL_ERROR_BIN_SELECTOR_ALREADY_REGISTERED = 100030;
static const int ACL_ERROR_KERNEL_ALREADY_REGISTERED = 100031;
static const int ACL_ERROR_INVALID_QUEUE_ID = 100032;
static const int ACL_ERROR_REPEAT_SUBSCRIBE = 100033;
static const int ACL_ERROR_STREAM_NOT_SUBSCRIBE = 100034;
static const int ACL_ERROR_THREAD_NOT_SUBSCRIBE = 100035;
static const int ACL_ERROR_WAIT_CALLBACK_TIMEOUT = 100036;
static const int ACL_ERROR_REPEAT_FINALIZE = 100037;
static const int ACL_ERROR_NOT_STATIC_AIPP = 100038;
static const int ACL_ERROR_COMPILING_STUB_MODE = 100039;
static const int ACL_ERROR_GROUP_NOT_SET = 100040;
static const int ACL_ERROR_GROUP_NOT_CREATE = 100041;
static const int ACL_ERROR_PROF_ALREADY_RUN = 100042;
static const int ACL_ERROR_PROF_NOT_RUN = 100043;
static const int ACL_ERROR_DUMP_ALREADY_RUN = 100044;
static const int ACL_ERROR_DUMP_NOT_RUN = 100045;
static const int ACL_ERROR_PROF_REPEAT_SUBSCRIBE = 148046;
static const int ACL_ERROR_PROF_API_CONFLICT = 148047;
static const int ACL_ERROR_INVALID_MAX_OPQUEUE_NUM_CONFIG = 148048;
static const int ACL_ERROR_INVALID_OPP_PATH = 148049;
static const int ACL_ERROR_OP_UNSUPPORTED_DYNAMIC = 148050;
static const int ACL_ERROR_RELATIVE_RESOURCE_NOT_CLEARED = 148051;
static const int ACL_ERROR_BAD_ALLOC = 200000;
static const int ACL_ERROR_API_NOT_SUPPORT = 200001;
static const int ACL_ERROR_INVALID_DEVICE = 200002;
static const int ACL_ERROR_MEMORY_ADDRESS_UNALIGNED = 200003;
static const int ACL_ERROR_RESOURCE_NOT_MATCH = 200004;
static const int ACL_ERROR_INVALID_RESOURCE_HANDLE = 200005;
static const int ACL_ERROR_FEATURE_UNSUPPORTED = 200006;
static const int ACL_ERROR_PROF_MODULES_UNSUPPORTED = 200007;
static const int ACL_ERROR_STORAGE_OVER_LIMIT = 300000;
static const int ACL_ERROR_INTERNAL_ERROR = 500000;
static const int ACL_ERROR_FAILURE = 500001;
static const int ACL_ERROR_GE_FAILURE = 500002;
static const int ACL_ERROR_RT_FAILURE = 500003;
static const int ACL_ERROR_DRV_FAILURE = 500004;
static const int ACL_ERROR_PROFILING_FAILURE = 500005;
// from ge_error_codes.h
static const uint32_t ACL_ERROR_GE_PARAM_INVALID = 145000U;
static const uint32_t ACL_ERROR_GE_EXEC_NOT_INIT = 145001U;
static const uint32_t ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID = 145002U;
static const uint32_t ACL_ERROR_GE_EXEC_MODEL_ID_INVALID = 145003U;
static const uint32_t ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID = 145006U;
static const uint32_t ACL_ERROR_GE_EXEC_MODEL_ADDR_INVALID = 145007U;
static const uint32_t ACL_ERROR_GE_EXEC_MODEL_QUEUE_ID_INVALID = 145008U;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_MODEL_REPEATED = 145009U;
static const uint32_t ACL_ERROR_GE_DYNAMIC_INPUT_ADDR_INVALID = 145011U;
static const uint32_t ACL_ERROR_GE_DYNAMIC_INPUT_LENGTH_INVALID = 145012U;
static const uint32_t ACL_ERROR_GE_DYNAMIC_BATCH_SIZE_INVALID = 145013U;
static const uint32_t ACL_ERROR_GE_AIPP_BATCH_EMPTY = 145014U;
static const uint32_t ACL_ERROR_GE_AIPP_NOT_EXIST = 145015U;
static const uint32_t ACL_ERROR_GE_AIPP_MODE_INVALID = 145016U;
static const uint32_t ACL_ERROR_GE_OP_TASK_TYPE_INVALID = 145017U;
static const uint32_t ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID = 145018U;
static const uint32_t ACL_ERROR_GE_PLGMGR_PATH_INVALID = 145019U;
static const uint32_t ACL_ERROR_GE_FORMAT_INVALID = 145020U;
static const uint32_t ACL_ERROR_GE_SHAPE_INVALID = 145021U;
static const uint32_t ACL_ERROR_GE_DATATYPE_INVALID = 145022U;
static const uint32_t ACL_ERROR_GE_MEMORY_ALLOCATION = 245000U;
static const uint32_t ACL_ERROR_GE_MEMORY_OPERATE_FAILED = 245001U;
static const uint32_t ACL_ERROR_GE_INTERNAL_ERROR = 545000U;
static const uint32_t ACL_ERROR_GE_LOAD_MODEL = 545001U;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_MODEL_PARTITION_FAILED = 545002U;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_WEIGHT_PARTITION_FAILED = 545003U;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_TASK_PARTITION_FAILED = 545004U;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_KERNEL_PARTITION_FAILED = 545005U;
static const uint32_t ACL_ERROR_GE_EXEC_RELEASE_MODEL_DATA = 545006U;
static const uint32_t ACL_ERROR_GE_COMMAND_HANDLE = 545007U;
static const uint32_t ACL_ERROR_GE_GET_TENSOR_INFO = 545008U;
static const uint32_t ACL_ERROR_GE_UNLOAD_MODEL = 545009U;
static const int32_t ACL_ERROR_RT_PARAM_INVALID = 107000; // param invalid
static const int32_t ACL_ERROR_RT_INVALID_DEVICEID = 107001; // invalid device id
static const int32_t ACL_ERROR_RT_CONTEXT_NULL = 107002; // current context null
static const int32_t ACL_ERROR_RT_STREAM_CONTEXT = 107003; // stream not in current context
static const int32_t ACL_ERROR_RT_MODEL_CONTEXT = 107004; // model not in current context
static const int32_t ACL_ERROR_RT_STREAM_MODEL = 107005; // stream not in model
static const int32_t ACL_ERROR_RT_EVENT_TIMESTAMP_INVALID = 107006; // event timestamp invalid
static const int32_t ACL_ERROR_RT_EVENT_TIMESTAMP_REVERSAL = 107007; // event timestamp reversal
static const int32_t ACL_ERROR_RT_ADDR_UNALIGNED = 107008; // memory address unaligned
static const int32_t ACL_ERROR_RT_FILE_OPEN = 107009; // open file failed
static const int32_t ACL_ERROR_RT_FILE_WRITE = 107010; // write file failed
static const int32_t ACL_ERROR_RT_STREAM_SUBSCRIBE = 107011; // error subscribe stream
static const int32_t ACL_ERROR_RT_THREAD_SUBSCRIBE = 107012; // error subscribe thread
static const int32_t ACL_ERROR_RT_GROUP_NOT_SET = 107013; // group not set
static const int32_t ACL_ERROR_RT_GROUP_NOT_CREATE = 107014; // group not create
static const int32_t ACL_ERROR_RT_STREAM_NO_CB_REG = 107015; // callback not register to stream
static const int32_t ACL_ERROR_RT_INVALID_MEMORY_TYPE = 107016; // invalid memory type
static const int32_t ACL_ERROR_RT_INVALID_HANDLE = 107017; // invalid handle
static const int32_t ACL_ERROR_RT_INVALID_MALLOC_TYPE = 107018; // invalid malloc type
static const int32_t ACL_ERROR_RT_WAIT_TIMEOUT = 107019; // wait timeout
static const int32_t ACL_ERROR_RT_FEATURE_NOT_SUPPORT = 207000; // feature not support
static const int32_t ACL_ERROR_RT_MEMORY_ALLOCATION = 207001; // memory allocation error
static const int32_t ACL_ERROR_RT_MEMORY_FREE = 207002; // memory free error
static const int32_t ACL_ERROR_RT_AICORE_OVER_FLOW = 207003; // aicore over flow
static const int32_t ACL_ERROR_RT_NO_DEVICE = 207004; // no device
static const int32_t ACL_ERROR_RT_RESOURCE_ALLOC_FAIL = 207005; // resource alloc fail
static const int32_t ACL_ERROR_RT_NO_PERMISSION = 207006; // no permission
static const int32_t ACL_ERROR_RT_NO_EVENT_RESOURCE = 207007; // no event resource
static const int32_t ACL_ERROR_RT_NO_STREAM_RESOURCE = 207008; // no stream resource
static const int32_t ACL_ERROR_RT_NO_NOTIFY_RESOURCE = 207009; // no notify resource
static const int32_t ACL_ERROR_RT_NO_MODEL_RESOURCE = 207010; // no model resource
static const int32_t ACL_ERROR_RT_NO_CDQ_RESOURCE = 207011; // no cdq resource
static const int32_t ACL_ERROR_RT_OVER_LIMIT = 207012; // over limit
static const int32_t ACL_ERROR_RT_QUEUE_EMPTY = 207013; // queue is empty
static const int32_t ACL_ERROR_RT_QUEUE_FULL = 207014; // queue is full
static const int32_t ACL_ERROR_RT_REPEATED_INIT = 207015; // repeated init
static const int32_t ACL_ERROR_RT_AIVEC_OVER_FLOW = 207016; // aivec over flow
static const int32_t ACL_ERROR_RT_INTERNAL_ERROR = 507000; // runtime internal error
static const int32_t ACL_ERROR_RT_TS_ERROR = 507001; // ts internal error
static const int32_t ACL_ERROR_RT_STREAM_TASK_FULL = 507002; // task full in stream
static const int32_t ACL_ERROR_RT_STREAM_TASK_EMPTY = 507003; // task empty in stream
static const int32_t ACL_ERROR_RT_STREAM_NOT_COMPLETE = 507004; // stream not complete
static const int32_t ACL_ERROR_RT_END_OF_SEQUENCE = 507005; // end of sequence
static const int32_t ACL_ERROR_RT_EVENT_NOT_COMPLETE = 507006; // event not complete
static const int32_t ACL_ERROR_RT_CONTEXT_RELEASE_ERROR = 507007; // context release error
static const int32_t ACL_ERROR_RT_SOC_VERSION = 507008; // soc version error
static const int32_t ACL_ERROR_RT_TASK_TYPE_NOT_SUPPORT = 507009; // task type not support
static const int32_t ACL_ERROR_RT_LOST_HEARTBEAT = 507010; // ts lost heartbeat
static const int32_t ACL_ERROR_RT_MODEL_EXECUTE = 507011; // model execute failed
static const int32_t ACL_ERROR_RT_REPORT_TIMEOUT = 507012; // report timeout
static const int32_t ACL_ERROR_RT_SYS_DMA = 507013; // sys dma error
static const int32_t ACL_ERROR_RT_AICORE_TIMEOUT = 507014; // aicore timeout
static const int32_t ACL_ERROR_RT_AICORE_EXCEPTION = 507015; // aicore exception
static const int32_t ACL_ERROR_RT_AICORE_TRAP_EXCEPTION = 507016; // aicore trap exception
static const int32_t ACL_ERROR_RT_AICPU_TIMEOUT = 507017; // aicpu timeout
static const int32_t ACL_ERROR_RT_AICPU_EXCEPTION = 507018; // aicpu exception
static const int32_t ACL_ERROR_RT_AICPU_DATADUMP_RSP_ERR = 507019; // aicpu datadump response error
static const int32_t ACL_ERROR_RT_AICPU_MODEL_RSP_ERR = 507020; // aicpu model operate response error
static const int32_t ACL_ERROR_RT_PROFILING_ERROR = 507021; // profiling error
static const int32_t ACL_ERROR_RT_IPC_ERROR = 507022; // ipc error
static const int32_t ACL_ERROR_RT_MODEL_ABORT_NORMAL = 507023; // model abort normal
static const int32_t ACL_ERROR_RT_KERNEL_UNREGISTERING = 507024; // kernel unregistering
static const int32_t ACL_ERROR_RT_RINGBUFFER_NOT_INIT = 507025; // ringbuffer not init
static const int32_t ACL_ERROR_RT_RINGBUFFER_NO_DATA = 507026; // ringbuffer no data
static const int32_t ACL_ERROR_RT_KERNEL_LOOKUP = 507027; // kernel lookup error
static const int32_t ACL_ERROR_RT_KERNEL_DUPLICATE = 507028; // kernel register duplicate
static const int32_t ACL_ERROR_RT_DEBUG_REGISTER_FAIL = 507029; // debug register failed
static const int32_t ACL_ERROR_RT_DEBUG_UNREGISTER_FAIL = 507030; // debug unregister failed
static const int32_t ACL_ERROR_RT_LABEL_CONTEXT = 507031; // label not in current context
static const int32_t ACL_ERROR_RT_PROGRAM_USE_OUT = 507032; // program register num use out
static const int32_t ACL_ERROR_RT_DEV_SETUP_ERROR = 507033; // device setup error
static const int32_t ACL_ERROR_RT_VECTOR_CORE_TIMEOUT = 507034; // vector core timeout
static const int32_t ACL_ERROR_RT_VECTOR_CORE_EXCEPTION = 507035; // vector core exception
static const int32_t ACL_ERROR_RT_VECTOR_CORE_TRAP_EXCEPTION = 507036; // vector core trap exception
static const int32_t ACL_ERROR_RT_CDQ_BATCH_ABNORMAL = 507037; // cdq alloc batch abnormal
static const int32_t ACL_ERROR_RT_DIE_MODE_CHANGE_ERROR = 507038; // can not change die mode
static const int32_t ACL_ERROR_RT_DIE_SET_ERROR = 507039; // single die mode can not set die
static const int32_t ACL_ERROR_RT_INVALID_DIEID = 507040; // invalid die id
static const int32_t ACL_ERROR_RT_DIE_MODE_NOT_SET = 507041; // die mode not set
static const int32_t ACL_ERROR_RT_DRV_INTERNAL_ERROR = 507899; // drv internal error
static const int32_t ACL_ERROR_RT_AICPU_INTERNAL_ERROR = 507900; // aicpu internal error
static const int32_t ACL_ERROR_RT_SOCKET_CLOSE = 507901; // hdc disconnect
)");
if (acl_error_map.count(error))
return acl_error_map[error];
return "unknown " + std::to_string((int)error);
}

python/jittor/extern/acl/acl_jittor.cc

@ -0,0 +1,186 @@
// ***************************************************************
// Copyright (c) 2021 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#include "acl_jittor.h"
#include "utils/str_utils.h"
#include <chrono>
#include <thread>
namespace jittor {
uint64_t acl_jittor_tid;
int acl_jittor_thread_running=0;
aclrtContext acl_jittor_context;
#define CHECK_ACL(x) ASSERTop(x,==,0)
static void* acl_jittor_process_callback(void*) {
acl_jittor_thread_running = 1;
int deviceId = 0;
CHECK_ACL(aclrtSetCurrentContext(acl_jittor_context));
while (acl_jittor_thread_running) {
// LOGir << "acl_jittor_process_callback";
auto ret = aclrtProcessReport(1000);
if (ret) {
if (acl_jittor_thread_running && ret != ACL_ERROR_RT_REPORT_TIMEOUT)
LOGir << "aclrtProcessReport:" << ret << acl_error_to_string(ret);
break;
}
}
acl_jittor_thread_running = 0;
return (void*)0;
}
// void aaa(void*) {
// LOGir << "haha";
// }
struct acl_jittor_initer {
acl_jittor_initer() {
CHECK_ACL(aclInit(nullptr));
uint device_count = 0;
// get the number of available devices
CHECK_ACL(aclrtGetDeviceCount(&device_count));
LOGi << "Found ACL device number:" << device_count;
CHECK_ACL(aclrtSetDevice(0));
CHECK_ACL(aclrtCreateContext(&acl_jittor_context, 0));
CHECK_ACL(aclrtSetCurrentContext(acl_jittor_context));
pthread_create(&acl_jittor_tid, nullptr, acl_jittor_process_callback, 0);
// subscribe for default stream
CHECK_ACL(aclrtSubscribeReport(acl_jittor_tid,0));
// simple callback test
// aclrtStream stream;
// CHECK_ACL(aclrtCreateStream(&stream));
// CHECK_ACL(aclrtSubscribeReport(acl_jittor_tid,stream));
// CHECK_ACL(aclrtLaunchCallback((aclrtCallback)&aaa, 0, ACL_CALLBACK_NO_BLOCK, stream));
// CHECK_ACL(aclrtLaunchCallback((aclrtCallback)&aaa, 0, ACL_CALLBACK_NO_BLOCK, 0));
}
~acl_jittor_initer() {
acl_jittor_thread_running = 0;
CHECK_ACL(aclrtUnSubscribeReport(acl_jittor_tid,0));
CHECK_ACL(aclrtDestroyContext(acl_jittor_context));
CHECK_ACL(aclFinalize());
}
} _acl_jittor_initer;
string process_acl(const string& src, const string& name, const map<string,string>& kargs) {
auto tokens = token_split(src);
int edit = 0;
for (int i=0; i<tokens.size(); i++) {
auto& token = tokens[i];
if (token == "cuda_runtime") token = "acl_jittor", edit ++; else
if (token == "CUDA") token = "ACL", edit ++; else
if (startswith(token, "cuda")) {
if (token.size()>=5 && token[4] >= 'A' && token[4] <= 'Z') {
if (token == "cudaGetDeviceCount") {
token_replace(tokens, i, "($1);", "((uint*)$1);");
} else if (token == "cudaLaunchHostFunc") {
// ACL_CALLBACK_BLOCK for 310
token_replace(tokens, i, "LaunchHostFunc($1,$2,$3)",
"LaunchCallback($2,$3,ACL_CALLBACK_NO_BLOCK,$1)");
} else if (token == "cudaMemcpy")
token_replace(tokens, i, "cudaMemcpy($1,$2,$3,",
"aclrtMemcpy($1,$3,$2,$3,");
else if (token == "cudaMemcpyAsync")
token_replace(tokens, i, "cudaMemcpyAsync($1,$2,$3,",
"aclrtMemcpyAsync($1,$3,$2,$3,");
else if (token == "cudaMemcpyDeviceToHost") token = "ACL_MEMCPY_DEVICE_TO_HOST";
else if (token == "cudaMemcpyHostToDevice") token = "ACL_MEMCPY_HOST_TO_DEVICE";
else if (token == "cudaMemcpyDeviceToDevice") token = "ACL_MEMCPY_DEVICE_TO_DEVICE";
else if (token == "cudaMallocManaged" || token == "cudaMalloc") {
// unified address not supported
token = "aclrtMalloc";
token_replace(tokens, i, "($1,$2)",
"($1,$2,ACL_MEM_MALLOC_HUGE_FIRST)");
} else if (token == "cudaMemGetInfo")
token_replace(tokens, i, "cudaMemGetInfo($1,$2)",
"aclrtGetMemInfo(ACL_DDR_MEM,$1,$2)");
else if (token == "cudaGetLastError")
token_replace(tokens, i, "cudaGetLastError()", "0");
else if (token == "cudaStreamCreateWithFlags")
token_replace(tokens, i-1,
"(cudaStreamCreateWithFlags($1,$2));",
"(aclrtCreateStream($1)); checkAclErrors(aclrtSubscribeReport(acl_jittor_tid,*$1));");
else if (token == "cudaEventCreate")
token_replace(tokens, i,
"cudaEventCreate($1,$2)",
"aclrtCreateEvent($1)");
else if (token == "cudaDeviceSynchronize")
token = "aclrtSynchronizeDevice";
else if (token == "cudaStreamDestroy")
token_replace(tokens, i, "cudaStreamDestroy($1)",
"(aclrtUnSubscribeReport(acl_jittor_tid,$1), aclrtDestroyStream($1))");
else if (token == "cudaEventDestroy")
token = "aclrtDestroyEvent";
else if (token == "cudaEventRecord")
token = "aclrtRecordEvent";
else if (token == "cudaStreamWaitEvent")
token_replace(tokens, i,
"cudaStreamWaitEvent($1,$2,$3)",
"aclrtStreamWaitEvent($1,$2)");
if (token.size() && token[0] == 'c')
token = "aclrt" + token.substr(4);
if (endswith(token, "_t"))
token = token.substr(0, token.size()-2);
edit ++;
}
} else
if (token == "_cudaGetErrorEnum") {
token_replace(tokens, i, "_cudaGetErrorEnum($1)", "(acl_error_to_string($1))");
edit ++;
} else
if (token == "checkCudaErrors")
token = "checkAclErrors";
else if (token == "JPU") {
edit ++;
string new_code;
if (tokens[i+2] == "op_compiler")
token_replace(tokens, i,
"JPU(op_compiler($1,$2,$3))",
"acl_jittor_op_compiler($1,$2,$3)");
else if (tokens[i+2] == "header")
new_code = "#include \"acl_jittor.h\"";
if (new_code.size())
token_replace(tokens, i, "JPU($1)", new_code);
} else if (token == "use_cuda_managed_allocator" && tokens[i+1][0]==',') {
tokens[i+2] = "0"; // disable unified address
}
}
if (!edit) return src;
return join(tokens, "");
}
void acl_jittor_op_compiler(string& filename, string& src, bool is_acl) {
if (!is_acl) return;
filename = replace(filename, ".cc", ".tikcc");
// LOGir << filename;
string new_src = process_acl(src, "", {});
new_src = replace(new_src, R"(#include "misc/cuda_atomic.h")", "");
new_src = replace(new_src, R"(#include "misc/cuda_limits.h")", "");
new_src = replace(new_src, "__global__", "__ai_device_entry__");
new_src = token_replace(new_src, "__launch_bounds__($1)", "");
new_src = token_replace(new_src, "int thread_num = $1;", "int thread_num = 1;");
new_src = token_replace(new_src, "tn0=std::max(tn0, $1);", "");
new_src = token_replace(new_src, "<<<$1,$2>>>", "<<<1,0>>>");
new_src = token_replace(new_src, "int thread_id = $1;", "int thread_id = 1;");
// for inc error
new_src = token_replace(new_src, "for ($1+=$2)", "for ($1++)");
// bit op error
new_src = token_replace(new_src, "int tnum$1;", "");
new_src = token_replace(new_src, "int tid$1=$2;", "int tid$1=0;");
src = new_src;
// auto tokens = token_split(new_src);
}
}

python/jittor/extern/acl/acl_jittor.h

@ -0,0 +1,19 @@
// ***************************************************************
// Copyright (c) 2021 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#pragma once
#include "common.h"
#include <acl/acl.h>
std::string acl_error_to_string(aclError error);
namespace jittor {
EXTERN_LIB uint64_t acl_jittor_tid;
void acl_jittor_op_compiler(string& filename, string& src, bool is_acl);
}


@ -23,8 +23,8 @@ EXTERN_LIB cublasHandle_t cublas_handle;
static inline cudaDataType get_dtype(NanoString dtype) {
if (dtype == ns_float32) return CUDA_R_32F;
// if (dtype == ns_float64) return CUDA_R_64F;
// if (dtype == ns_float16) return CUDA_R_16F;
if (dtype == ns_float64) return CUDA_R_64F;
if (dtype == ns_float16) return CUDA_R_16F;
LOGf << "not support type" << dtype;
return CUDA_R_32F;
}


@ -89,7 +89,7 @@ void CublasBatchedMatmulOp::jit_prepare(JK& jk) {
jk << _CS("[T:") << a->dtype();
jk << _CS("][Trans_a:") << (trans_a ? 'T' : 'N');
jk << _CS("][Trans_b:") << (trans_b ? 'T' : 'N');
jk << _CS("][op:") << (a->dtype().dsize() == 4 ? 'S' : 'D');
jk << _CS("][op:") << (a->dtype().dsize() == 2? 'H' : (a->dtype().dsize() == 4 ? 'S' : 'D'));
jk << ']';
}
@ -124,6 +124,22 @@ void CublasBatchedMatmulOp::jit_run() {
if (use_tensorcore) {
computeType = CUBLAS_COMPUTE_32F_FAST_16F;
}
if (a->dtype() == ns_float16
|| b->dtype() == ns_float16 || c->dtype() == ns_float16) {
computeType = CUBLAS_COMPUTE_16F;
}
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT;
cudaDataType_t computeType = CUDA_R_32F;
if (use_tensorcore) {
algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP;
}
if (a->dtype() == ns_float16
|| b->dtype() == ns_float16 || c->dtype() == ns_float16) {
computeType = CUDA_R_16F;
algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP;
}
#endif
checkCudaErrors(cublasGemmStridedBatchedEx(handle_,
CUBLAS_OP_@Trans_b, CUBLAS_OP_@Trans_a,
k, n, m, &alpha,
@ -131,15 +147,13 @@ void CublasBatchedMatmulOp::jit_run() {
a->ptr<T>(),get_dtype(a->dtype()), '@Trans_a' == 'N' ? m : n, n * m, &beta,
c->ptr<T>(),get_dtype(c->dtype()), k, k * n,
batch_size,computeType,algo));
#else
checkCudaErrors(cublas@op@@gemmStridedBatched(handle_,
CUBLAS_OP_@Trans_b, CUBLAS_OP_@Trans_a,
k, n, m, &alpha,
b->ptr<T>(), '@Trans_b' == 'N' ? k : m, k * m,
a->ptr<T>(), '@Trans_a' == 'N' ? m : n, n * m, &beta,
c->ptr<T>(), k, k * n,
batch_size));
#endif
// checkCudaErrors(cublas@op@@gemmStridedBatched(handle_,
// CUBLAS_OP_@Trans_b, CUBLAS_OP_@Trans_a,
// k, n, m, &alpha,
// b->ptr<T>(), '@Trans_b' == 'N' ? k : m, k * m,
// a->ptr<T>(), '@Trans_a' == 'N' ? m : n, n * m, &beta,
// c->ptr<T>(), k, k * n,
// batch_size));
}
#endif
#endif // JIT
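A sketch exercising the fp16 compute-type path added above from Python; this assumes a CUDA build where bmm dispatches to the batched cuBLAS op:
import jittor as jt
if jt.has_cuda:
    jt.flags.use_cuda = 1
    a = jt.random([2, 16, 16]).float16()
    b = jt.random([2, 16, 16]).float16()
    c = jt.bmm(a, b)             # batched matmul, computeType fp16 on this path
    print(c.dtype, c.shape)      # expected: float16 [2,16,16,]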


@ -50,7 +50,7 @@ void CublasMatmulOp::jit_prepare(JK& jk) {
jk << _CS("[T:") << a->dtype();
jk << _CS("][Trans_a:") << (trans_a ? 'T' : 'N');
jk << _CS("][Trans_b:") << (trans_b ? 'T' : 'N');
jk << _CS("][op:") << (a->dtype().dsize() == 4 ? 'S' : 'D');
jk << _CS("][op:") << (a->dtype().dsize() == 2? 'H' : (a->dtype().dsize() == 4 ? 'S' : 'D'));
jk << ']';
}
@ -81,6 +81,22 @@ void CublasMatmulOp::jit_run() {
if (use_tensorcore) {
computeType = CUBLAS_COMPUTE_32F_FAST_16F;
}
if (a->dtype() == ns_float16
|| b->dtype() == ns_float16 || c->dtype() == ns_float16) {
computeType = CUBLAS_COMPUTE_16F;
}
#else
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT;
cudaDataType_t computeType = CUDA_R_32F;
if (use_tensorcore) {
algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP;
}
if (a->dtype() == ns_float16
|| b->dtype() == ns_float16 || c->dtype() == ns_float16) {
computeType = CUDA_R_16F;
algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP;
}
#endif
checkCudaErrors(cublasGemmEx(handle_,
CUBLAS_OP_@Trans_b, CUBLAS_OP_@Trans_a,
k, n, m, &alpha,
@ -88,15 +104,13 @@ void CublasMatmulOp::jit_run() {
a->ptr<T>(),get_dtype(a->dtype()), '@Trans_a' == 'N' ? m : n, &beta,
c->ptr<T>(),get_dtype(c->dtype()), k,
computeType, algo));
#else
checkCudaErrors(cublas@op@@gemm(handle_,
CUBLAS_OP_@Trans_b, CUBLAS_OP_@Trans_a,
k, n, m, &alpha,
b->ptr<T>(), '@Trans_b' == 'N' ? k : m,
a->ptr<T>(), '@Trans_a' == 'N' ? m : n, &beta,
c->ptr<T>(), k));
// checkCudaErrors(cublas@op@@gemm(handle_,
// CUBLAS_OP_@Trans_b, CUBLAS_OP_@Trans_a,
// k, n, m, &alpha,
// b->ptr<T>(), '@Trans_b' == 'N' ? k : m,
// a->ptr<T>(), '@Trans_a' == 'N' ? m : n, &beta,
// c->ptr<T>(), k));
#endif
}
#endif // JIT


@ -175,6 +175,11 @@ void CudnnConvOp::jit_run() {
checkCudaErrors( cudnnSetConvolutionMathType(cudnnConvDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION) );
}
if (x->dtype() == ns_float16
|| y->dtype() == ns_float16 || w->dtype() == ns_float16) {
checkCudaErrors( cudnnSetConvolutionMathType(cudnnConvDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION) );
}
int dimY[] = {
(int)y->shape[findc("@YFORMAT", 'a')], // n
(int)y->shape[findc("@YFORMAT", 'b')], // c
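A sketch of an fp16 convolution that would take the math-type branch added above; this assumes a CUDA build with cuDNN available:
import jittor as jt
from jittor import nn
if jt.has_cuda:
    jt.flags.use_cuda = 1
    conv = nn.Conv2d(3, 8, 3, padding=1).float16()   # cast parameters to fp16
    x = jt.random([1, 3, 32, 32]).float16()
    y = conv(x)
    print(y.dtype)                                   # expected: float16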


@ -90,7 +90,7 @@ void CuttTransposeOp::jit_run() {
for (int i=0; i<dim; i++)
x_shape[i] = new_shape[dim-1-i];
if (dim == 1 || x->num==1) {
checkCudaErrors(cudaMemcpyAsync(yp, xp, x->size, cudaMemcpyDefault, 0));
checkCudaErrors(cudaMemcpyAsync(yp, xp, x->size, cudaMemcpyDeviceToDevice, 0));
return;
}
JK& jk = get_jk();


@ -25,7 +25,9 @@
#include <stdlib.h>
#include <string.h>
#ifdef IS_CUDA
#include <helper_string.h>
#endif
#ifndef EXIT_WAIVED
#define EXIT_WAIVED 2
@ -129,6 +131,9 @@ void check(T result, char const *const func, const char *const file,
}
}
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
#define peekCudaErrors(val) peek((val), #val, __FILE__, __LINE__)
#ifdef __DRIVER_TYPES_H__
// This will output the proper CUDA error strings in the event
// that a CUDA host call returns an error

View File

@ -47,9 +47,9 @@
#include <unordered_map>
#include <vector>
#include <mkldnn.hpp>
#include <dnnl.hpp>
using namespace mkldnn;
using namespace dnnl;
using namespace std;
@ -159,8 +159,8 @@ void simple_net(int times = 100) {
if (conv1_prim_desc.src_desc() != user_src_memory.get_desc()) {
conv1_src_memory = memory(conv1_prim_desc.src_desc(), eng);
net.push_back(reorder(user_src_memory, conv1_src_memory));
net_args.push_back({ { MKLDNN_ARG_FROM, user_src_memory },
{ MKLDNN_ARG_TO, conv1_src_memory } });
net_args.push_back({ { DNNL_ARG_FROM, user_src_memory },
{ DNNL_ARG_TO, conv1_src_memory } });
}
auto conv1_weights_memory = user_weights_memory;
@ -181,10 +181,10 @@ void simple_net(int times = 100) {
/// @snippet cpu_cnn_inference_f32.cpp Create memory for output
//[Create convolution primitive]
net.push_back(convolution_forward(conv1_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv1_src_memory },
{ MKLDNN_ARG_WEIGHTS, conv1_weights_memory },
{ MKLDNN_ARG_BIAS, conv1_user_bias_memory },
{ MKLDNN_ARG_DST, conv1_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv1_src_memory },
{ DNNL_ARG_WEIGHTS, conv1_weights_memory },
{ DNNL_ARG_BIAS, conv1_user_bias_memory },
{ DNNL_ARG_DST, conv1_dst_memory } });
//[Create convolution primitive]
// AlexNet: relu1
@ -204,8 +204,8 @@ void simple_net(int times = 100) {
auto relu1_prim_desc = eltwise_forward::primitive_desc(relu1_desc, eng);
net.push_back(eltwise_forward(relu1_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv1_dst_memory },
{ MKLDNN_ARG_DST, conv1_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv1_dst_memory },
{ DNNL_ARG_DST, conv1_dst_memory } });
//[Create relu primitive]
// AlexNet: lrn1
@ -226,8 +226,8 @@ void simple_net(int times = 100) {
auto lrn1_dst_memory = memory(lrn1_prim_desc.dst_desc(), eng);
net.push_back(lrn_forward(lrn1_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv1_dst_memory },
{ MKLDNN_ARG_DST, lrn1_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv1_dst_memory },
{ DNNL_ARG_DST, lrn1_dst_memory } });
// AlexNet: pool1
// {batch, 96, 55, 55} -> {batch, 96, 27, 27}
@ -255,8 +255,8 @@ void simple_net(int times = 100) {
auto pool1_dst_memory = memory(pool1_pd.dst_desc(), eng);
net.push_back(pooling_forward(pool1_pd));
net_args.push_back({ { MKLDNN_ARG_SRC, lrn1_dst_memory },
{ MKLDNN_ARG_DST, pool1_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, lrn1_dst_memory },
{ DNNL_ARG_DST, pool1_dst_memory } });
//[Create pooling primitive]
// AlexNet: conv2
@ -296,8 +296,8 @@ void simple_net(int times = 100) {
if (conv2_prim_desc.src_desc() != conv2_src_memory.get_desc()) {
conv2_src_memory = memory(conv2_prim_desc.src_desc(), eng);
net.push_back(reorder(pool1_dst_memory, conv2_src_memory));
net_args.push_back({ { MKLDNN_ARG_FROM, pool1_dst_memory },
{ MKLDNN_ARG_TO, conv2_src_memory } });
net_args.push_back({ { DNNL_ARG_FROM, pool1_dst_memory },
{ DNNL_ARG_TO, conv2_src_memory } });
}
auto conv2_weights_memory = conv2_user_weights_memory;
@ -312,10 +312,10 @@ void simple_net(int times = 100) {
// create convolution primitive and add it to net
net.push_back(convolution_forward(conv2_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv2_src_memory },
{ MKLDNN_ARG_WEIGHTS, conv2_weights_memory },
{ MKLDNN_ARG_BIAS, conv2_user_bias_memory },
{ MKLDNN_ARG_DST, conv2_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv2_src_memory },
{ DNNL_ARG_WEIGHTS, conv2_weights_memory },
{ DNNL_ARG_BIAS, conv2_user_bias_memory },
{ DNNL_ARG_DST, conv2_dst_memory } });
// AlexNet: relu2
// {batch, 256, 27, 27} -> {batch, 256, 27, 27}
@ -328,8 +328,8 @@ void simple_net(int times = 100) {
auto relu2_prim_desc = eltwise_forward::primitive_desc(relu2_desc, eng);
net.push_back(eltwise_forward(relu2_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv2_dst_memory },
{ MKLDNN_ARG_DST, conv2_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv2_dst_memory },
{ DNNL_ARG_DST, conv2_dst_memory } });
// AlexNet: lrn2
// {batch, 256, 27, 27} -> {batch, 256, 27, 27}
@ -349,8 +349,8 @@ void simple_net(int times = 100) {
auto lrn2_dst_memory = memory(lrn2_prim_desc.dst_desc(), eng);
net.push_back(lrn_forward(lrn2_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv2_dst_memory },
{ MKLDNN_ARG_DST, lrn2_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv2_dst_memory },
{ DNNL_ARG_DST, lrn2_dst_memory } });
// AlexNet: pool2
// {batch, 256, 27, 27} -> {batch, 256, 13, 13}
@ -372,8 +372,8 @@ void simple_net(int times = 100) {
// create pooling primitive and add it to net
net.push_back(pooling_forward(pool2_pd));
net_args.push_back({ { MKLDNN_ARG_SRC, lrn2_dst_memory },
{ MKLDNN_ARG_DST, pool2_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, lrn2_dst_memory },
{ DNNL_ARG_DST, pool2_dst_memory } });
// AlexNet: conv3
// {batch, 256, 13, 13} (x) {384, 256, 3, 3}; -> {batch, 384, 13, 13};
@ -412,8 +412,8 @@ void simple_net(int times = 100) {
if (conv3_prim_desc.src_desc() != conv3_src_memory.get_desc()) {
conv3_src_memory = memory(conv3_prim_desc.src_desc(), eng);
net.push_back(reorder(pool2_dst_memory, conv3_src_memory));
net_args.push_back({ { MKLDNN_ARG_FROM, pool2_dst_memory },
{ MKLDNN_ARG_TO, conv3_src_memory } });
net_args.push_back({ { DNNL_ARG_FROM, pool2_dst_memory },
{ DNNL_ARG_TO, conv3_src_memory } });
}
auto conv3_weights_memory = conv3_user_weights_memory;
@ -428,10 +428,10 @@ void simple_net(int times = 100) {
// create convolution primitive and add it to net
net.push_back(convolution_forward(conv3_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv3_src_memory },
{ MKLDNN_ARG_WEIGHTS, conv3_weights_memory },
{ MKLDNN_ARG_BIAS, conv3_user_bias_memory },
{ MKLDNN_ARG_DST, conv3_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv3_src_memory },
{ DNNL_ARG_WEIGHTS, conv3_weights_memory },
{ DNNL_ARG_BIAS, conv3_user_bias_memory },
{ DNNL_ARG_DST, conv3_dst_memory } });
// AlexNet: relu3
// {batch, 384, 13, 13} -> {batch, 384, 13, 13}
@ -444,8 +444,8 @@ void simple_net(int times = 100) {
auto relu3_prim_desc = eltwise_forward::primitive_desc(relu3_desc, eng);
net.push_back(eltwise_forward(relu3_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv3_dst_memory },
{ MKLDNN_ARG_DST, conv3_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv3_dst_memory },
{ DNNL_ARG_DST, conv3_dst_memory } });
// AlexNet: conv4
// {batch, 384, 13, 13} (x) {2, 192, 192, 3, 3}; ->
@ -485,8 +485,8 @@ void simple_net(int times = 100) {
if (conv4_prim_desc.src_desc() != conv4_src_memory.get_desc()) {
conv4_src_memory = memory(conv4_prim_desc.src_desc(), eng);
net.push_back(reorder(conv3_dst_memory, conv4_src_memory));
net_args.push_back({ { MKLDNN_ARG_FROM, conv3_dst_memory },
{ MKLDNN_ARG_TO, conv4_src_memory } });
net_args.push_back({ { DNNL_ARG_FROM, conv3_dst_memory },
{ DNNL_ARG_TO, conv4_src_memory } });
}
auto conv4_weights_memory = conv4_user_weights_memory;
@ -501,10 +501,10 @@ void simple_net(int times = 100) {
// create convolution primitive and add it to net
net.push_back(convolution_forward(conv4_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv4_src_memory },
{ MKLDNN_ARG_WEIGHTS, conv4_weights_memory },
{ MKLDNN_ARG_BIAS, conv4_user_bias_memory },
{ MKLDNN_ARG_DST, conv4_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv4_src_memory },
{ DNNL_ARG_WEIGHTS, conv4_weights_memory },
{ DNNL_ARG_BIAS, conv4_user_bias_memory },
{ DNNL_ARG_DST, conv4_dst_memory } });
// AlexNet: relu4
// {batch, 384, 13, 13} -> {batch, 384, 13, 13}
@ -517,8 +517,8 @@ void simple_net(int times = 100) {
auto relu4_prim_desc = eltwise_forward::primitive_desc(relu4_desc, eng);
net.push_back(eltwise_forward(relu4_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv4_dst_memory },
{ MKLDNN_ARG_DST, conv4_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv4_dst_memory },
{ DNNL_ARG_DST, conv4_dst_memory } });
// AlexNet: conv5
// {batch, 384, 13, 13} (x) {2, 128, 192, 3, 3}; -> {batch, 256, 13, 13};
@ -557,8 +557,8 @@ void simple_net(int times = 100) {
if (conv5_prim_desc.src_desc() != conv5_src_memory.get_desc()) {
conv5_src_memory = memory(conv5_prim_desc.src_desc(), eng);
net.push_back(reorder(conv4_dst_memory, conv5_src_memory));
net_args.push_back({ { MKLDNN_ARG_FROM, conv4_dst_memory },
{ MKLDNN_ARG_TO, conv5_src_memory } });
net_args.push_back({ { DNNL_ARG_FROM, conv4_dst_memory },
{ DNNL_ARG_TO, conv5_src_memory } });
}
auto conv5_weights_memory = conv5_user_weights_memory;
@ -573,10 +573,10 @@ void simple_net(int times = 100) {
// create convolution primitive and add it to net
net.push_back(convolution_forward(conv5_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv5_src_memory },
{ MKLDNN_ARG_WEIGHTS, conv5_weights_memory },
{ MKLDNN_ARG_BIAS, conv5_user_bias_memory },
{ MKLDNN_ARG_DST, conv5_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv5_src_memory },
{ DNNL_ARG_WEIGHTS, conv5_weights_memory },
{ DNNL_ARG_BIAS, conv5_user_bias_memory },
{ DNNL_ARG_DST, conv5_dst_memory } });
// AlexNet: relu5
// {batch, 256, 13, 13} -> {batch, 256, 13, 13}
@ -589,8 +589,8 @@ void simple_net(int times = 100) {
auto relu5_prim_desc = eltwise_forward::primitive_desc(relu5_desc, eng);
net.push_back(eltwise_forward(relu5_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv5_dst_memory },
{ MKLDNN_ARG_DST, conv5_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv5_dst_memory },
{ DNNL_ARG_DST, conv5_dst_memory } });
// AlexNet: pool5
// {batch, 256, 13, 13} -> {batch, 256, 6, 6}
@ -615,8 +615,8 @@ void simple_net(int times = 100) {
// create pooling primitive and add it to net
net.push_back(pooling_forward(pool5_pd));
net_args.push_back({ { MKLDNN_ARG_SRC, conv5_dst_memory },
{ MKLDNN_ARG_DST, pool5_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv5_dst_memory },
{ DNNL_ARG_DST, pool5_dst_memory } });
// fc6 inner product {batch, 256, 6, 6} (x) {4096, 256, 6, 6}-> {batch,
@ -651,8 +651,8 @@ void simple_net(int times = 100) {
if (fc6_prim_desc.src_desc() != fc6_src_memory.get_desc()) {
fc6_src_memory = memory(fc6_prim_desc.src_desc(), eng);
net.push_back(reorder(pool5_dst_memory, fc6_src_memory));
net_args.push_back({ { MKLDNN_ARG_FROM, pool5_dst_memory },
{ MKLDNN_ARG_TO, fc6_src_memory } });
net_args.push_back({ { DNNL_ARG_FROM, pool5_dst_memory },
{ DNNL_ARG_TO, fc6_src_memory } });
}
auto fc6_weights_memory = fc6_user_weights_memory;
@ -666,10 +666,10 @@ void simple_net(int times = 100) {
// create inner product primitive and add it to net
net.push_back(inner_product_forward(fc6_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, fc6_src_memory },
{ MKLDNN_ARG_WEIGHTS, fc6_weights_memory },
{ MKLDNN_ARG_BIAS, fc6_user_bias_memory },
{ MKLDNN_ARG_DST, fc6_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, fc6_src_memory },
{ DNNL_ARG_WEIGHTS, fc6_weights_memory },
{ DNNL_ARG_BIAS, fc6_user_bias_memory },
{ DNNL_ARG_DST, fc6_dst_memory } });
// fc7 inner product {batch, 4096} (x) {4096, 4096}-> {batch, 4096}
@ -708,10 +708,10 @@ void simple_net(int times = 100) {
// create inner product primitive and add it to net
net.push_back(inner_product_forward(fc7_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, fc6_dst_memory },
{ MKLDNN_ARG_WEIGHTS, fc7_weights_memory },
{ MKLDNN_ARG_BIAS, fc7_user_bias_memory },
{ MKLDNN_ARG_DST, fc7_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, fc6_dst_memory },
{ DNNL_ARG_WEIGHTS, fc7_weights_memory },
{ DNNL_ARG_BIAS, fc7_user_bias_memory },
{ DNNL_ARG_DST, fc7_dst_memory } });
// fc8 inner product {batch, 4096} (x) {1000, 4096}-> {batch, 1000}
memory::dims fc8_weights_tz = { 1000, 4096 };
@ -750,17 +750,17 @@ void simple_net(int times = 100) {
// create inner product primitive and add it to net
net.push_back(inner_product_forward(fc8_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, fc7_dst_memory },
{ MKLDNN_ARG_WEIGHTS, fc8_weights_memory },
{ MKLDNN_ARG_BIAS, fc8_user_bias_memory },
{ MKLDNN_ARG_DST, fc8_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, fc7_dst_memory },
{ DNNL_ARG_WEIGHTS, fc8_weights_memory },
{ DNNL_ARG_BIAS, fc8_user_bias_memory },
{ DNNL_ARG_DST, fc8_dst_memory } });
// create reorder between internal and user data if it is needed and
// add it to net after pooling
if (fc8_dst_memory != user_dst_memory) {
net.push_back(reorder(fc8_dst_memory, user_dst_memory));
net_args.push_back({ { MKLDNN_ARG_FROM, fc8_dst_memory },
{ MKLDNN_ARG_TO, user_dst_memory } });
net_args.push_back({ { DNNL_ARG_FROM, fc8_dst_memory },
{ DNNL_ARG_TO, user_dst_memory } });
}
/// @page cpu_cnn_inference_f32_cpp

View File

@ -13,9 +13,9 @@
#include "var.h"
#include "mkl_conv_backward_w_op.h"
#include <mkldnn.hpp>
#include <dnnl.hpp>
using namespace mkldnn;
using namespace dnnl;
using namespace std;
namespace jittor {
@ -143,8 +143,8 @@ void MklConvBackwardWOp::jit_run() {
if (conv_pd.src_desc() != conv_user_src_memory.get_desc()) {
conv_src_memory = memory(conv_pd.src_desc(), eng);
net_bwd.push_back(reorder(conv_user_src_memory, conv_src_memory));
net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_user_src_memory},
{MKLDNN_ARG_TO, conv_src_memory}});
net_bwd_args.push_back({{DNNL_ARG_FROM, conv_user_src_memory},
{DNNL_ARG_TO, conv_src_memory}});
}
auto conv_user_diff_dst_memory
@ -169,8 +169,8 @@ void MklConvBackwardWOp::jit_run() {
if (conv_bwd_weights_pd.src_desc() != conv_src_memory.get_desc()) {
conv_bwd_src_memory = memory(conv_bwd_weights_pd.src_desc(), eng);
net_bwd.push_back(reorder(conv_src_memory, conv_bwd_src_memory));
net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_src_memory},
{MKLDNN_ARG_TO, conv_bwd_src_memory}});
net_bwd_args.push_back({{DNNL_ARG_FROM, conv_src_memory},
{DNNL_ARG_TO, conv_bwd_src_memory}});
}
auto conv_diff_dst_memory = conv_user_diff_dst_memory;
@ -178,13 +178,13 @@ void MklConvBackwardWOp::jit_run() {
!= conv_user_diff_dst_memory.get_desc()) {
conv_diff_dst_memory = memory(conv_bwd_weights_pd.diff_dst_desc(), eng);
net_bwd.push_back(reorder(conv_user_diff_dst_memory, conv_diff_dst_memory));
net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_user_diff_dst_memory},
{MKLDNN_ARG_TO, conv_diff_dst_memory}});
net_bwd_args.push_back({{DNNL_ARG_FROM, conv_user_diff_dst_memory},
{DNNL_ARG_TO, conv_diff_dst_memory}});
}
net_bwd.push_back(convolution_backward_weights(conv_bwd_weights_pd));
net_bwd_args.push_back({{MKLDNN_ARG_SRC, conv_bwd_src_memory},
{MKLDNN_ARG_DIFF_DST, conv_diff_dst_memory}});
net_bwd_args.push_back({{DNNL_ARG_SRC, conv_bwd_src_memory},
{DNNL_ARG_DIFF_DST, conv_diff_dst_memory}});
auto conv_diff_weights_memory = conv_user_diff_weights_memory;
if (conv_bwd_weights_pd.diff_weights_desc()
@ -192,15 +192,15 @@ void MklConvBackwardWOp::jit_run() {
conv_diff_weights_memory
= memory(conv_bwd_weights_pd.diff_weights_desc(), eng);
net_bwd_args.back().insert(
{MKLDNN_ARG_DIFF_WEIGHTS, conv_diff_weights_memory});
{DNNL_ARG_DIFF_WEIGHTS, conv_diff_weights_memory});
net_bwd.push_back(reorder(
conv_diff_weights_memory, conv_user_diff_weights_memory));
net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_diff_weights_memory},
{MKLDNN_ARG_TO, conv_user_diff_weights_memory}});
net_bwd_args.push_back({{DNNL_ARG_FROM, conv_diff_weights_memory},
{DNNL_ARG_TO, conv_user_diff_weights_memory}});
} else {
net_bwd_args.back().insert(
{MKLDNN_ARG_DIFF_WEIGHTS, conv_diff_weights_memory});
{DNNL_ARG_DIFF_WEIGHTS, conv_diff_weights_memory});
}
ASSERTop(net_bwd.size(),==,net_bwd_args.size());

View File

@ -13,9 +13,9 @@
#include "var.h"
#include "mkl_conv_backward_x_op.h"
#include <mkldnn.hpp>
#include <dnnl.hpp>
using namespace mkldnn;
using namespace dnnl;
using namespace std;
namespace jittor {
@ -142,8 +142,8 @@ void MklConvBackwardXOp::jit_run() {
conv_weights_memory = memory(conv_pd.weights_desc(), eng);
net_bwd.push_back(
reorder(conv_user_weights_memory, conv_weights_memory));
net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_user_weights_memory},
{MKLDNN_ARG_TO, conv_weights_memory}});
net_bwd_args.push_back({{DNNL_ARG_FROM, conv_user_weights_memory},
{DNNL_ARG_TO, conv_weights_memory}});
}
auto conv_user_diff_dst_memory
@ -168,21 +168,21 @@ void MklConvBackwardXOp::jit_run() {
!= conv_user_diff_dst_memory.get_desc()) {
conv_diff_dst_memory = memory(conv_bwd_data_pd.diff_dst_desc(), eng);
net_bwd.push_back(reorder(conv_user_diff_dst_memory, conv_diff_dst_memory));
net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_user_diff_dst_memory},
{MKLDNN_ARG_TO, conv_diff_dst_memory}});
net_bwd_args.push_back({{DNNL_ARG_FROM, conv_user_diff_dst_memory},
{DNNL_ARG_TO, conv_diff_dst_memory}});
}
auto conv_bwd_weights_memory = conv_weights_memory;
if (conv_bwd_data_pd.weights_desc() != conv_weights_memory.get_desc()) {
conv_bwd_weights_memory = memory(conv_bwd_data_pd.weights_desc(), eng);
net_bwd.push_back(reorder(conv_weights_memory, conv_bwd_weights_memory));
net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_weights_memory},
{MKLDNN_ARG_TO, conv_bwd_weights_memory}});
net_bwd_args.push_back({{DNNL_ARG_FROM, conv_weights_memory},
{DNNL_ARG_TO, conv_bwd_weights_memory}});
}
net_bwd.push_back(convolution_backward_data(conv_bwd_data_pd));
net_bwd_args.push_back({{MKLDNN_ARG_WEIGHTS, conv_bwd_weights_memory},
{MKLDNN_ARG_DIFF_DST, conv_diff_dst_memory}});
net_bwd_args.push_back({{DNNL_ARG_WEIGHTS, conv_bwd_weights_memory},
{DNNL_ARG_DIFF_DST, conv_diff_dst_memory}});
auto conv_diff_src_memory = conv_user_diff_src_memory;
if (conv_bwd_data_pd.diff_src_desc()
@ -190,15 +190,15 @@ void MklConvBackwardXOp::jit_run() {
conv_diff_src_memory
= memory(conv_bwd_data_pd.diff_src_desc(), eng);
net_bwd_args.back().insert(
{MKLDNN_ARG_DIFF_SRC, conv_diff_src_memory});
{DNNL_ARG_DIFF_SRC, conv_diff_src_memory});
net_bwd.push_back(reorder(
conv_diff_src_memory, conv_user_diff_src_memory));
net_bwd_args.push_back({{MKLDNN_ARG_FROM, conv_diff_src_memory},
{MKLDNN_ARG_TO, conv_user_diff_src_memory}});
net_bwd_args.push_back({{DNNL_ARG_FROM, conv_diff_src_memory},
{DNNL_ARG_TO, conv_user_diff_src_memory}});
} else {
net_bwd_args.back().insert(
{MKLDNN_ARG_DIFF_SRC, conv_diff_src_memory});
{DNNL_ARG_DIFF_SRC, conv_diff_src_memory});
}
ASSERTop(net_bwd.size(),==,net_bwd_args.size());

View File

@ -7,12 +7,12 @@
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#include <mkldnn.hpp>
#include <dnnl.hpp>
#include "var.h"
#include "mkl_conv_op.h"
using namespace mkldnn;
using namespace dnnl;
using namespace std;
namespace jittor {
@ -110,7 +110,7 @@ void MklConvOp::jit_run() {
auto n = ws[3];
auto k = xs[3];
// x: [m,k], w: [k,n], y: [m,n]
ASSERTop(0,==,mkldnn_sgemm('N', 'N', m, n, k,
ASSERTop(0,==,dnnl_sgemm('N', 'N', m, n, k,
1.f, x->ptr<float32>(), k,
w->ptr<float32>(), n,
0.f, y->ptr<float32>(), n));
@ -162,27 +162,27 @@ void MklConvOp::jit_run() {
if (conv1_prim_desc.src_desc() != user_src_memory.get_desc()) {
conv1_src_memory = memory(conv1_prim_desc.src_desc(), eng);
net.push_back(reorder(user_src_memory, conv1_src_memory));
net_args.push_back({ { MKLDNN_ARG_FROM, user_src_memory },
{ MKLDNN_ARG_TO, conv1_src_memory } });
net_args.push_back({ { DNNL_ARG_FROM, user_src_memory },
{ DNNL_ARG_TO, conv1_src_memory } });
}
auto conv1_weights_memory = user_weights_memory;
if (conv1_prim_desc.weights_desc() != user_weights_memory.get_desc()) {
conv1_weights_memory = memory(conv1_prim_desc.weights_desc(), eng);
net.push_back(reorder(user_weights_memory, conv1_weights_memory));
net_args.push_back({ { MKLDNN_ARG_FROM, user_weights_memory }, { MKLDNN_ARG_TO, conv1_weights_memory } });
net_args.push_back({ { DNNL_ARG_FROM, user_weights_memory }, { DNNL_ARG_TO, conv1_weights_memory } });
}
auto conv1_dst_memory = memory(conv1_prim_desc.dst_desc(), eng);
net.push_back(convolution_forward(conv1_prim_desc));
net_args.push_back({ { MKLDNN_ARG_SRC, conv1_src_memory },
{ MKLDNN_ARG_WEIGHTS, conv1_weights_memory },
{ MKLDNN_ARG_DST, conv1_dst_memory } });
net_args.push_back({ { DNNL_ARG_SRC, conv1_src_memory },
{ DNNL_ARG_WEIGHTS, conv1_weights_memory },
{ DNNL_ARG_DST, conv1_dst_memory } });
if (conv1_dst_memory != user_dst_memory) {
net.push_back(reorder(conv1_dst_memory, user_dst_memory));
net_args.push_back({ { MKLDNN_ARG_FROM, conv1_dst_memory },{ MKLDNN_ARG_TO, user_dst_memory } });
net_args.push_back({ { DNNL_ARG_FROM, conv1_dst_memory },{ DNNL_ARG_TO, user_dst_memory } });
}
ASSERTop(net.size(),==,net_args.size());

View File

@ -7,12 +7,12 @@
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#include <mkldnn.hpp>
#include <dnnl.hpp>
#include "var.h"
#include "mkl_matmul_op.h"
using namespace mkldnn;
using namespace dnnl;
using namespace std;
namespace jittor {
@ -66,7 +66,7 @@ void MklMatmulOp::jit_run() {
k = bs[0];
}
// a: [n,m], b: [m,k], c: [n,k]
ASSERTop(0,==,mkldnn_sgemm('@Trans_a', '@Trans_b', n, k, m,
ASSERTop(0,==,dnnl_sgemm('@Trans_a', '@Trans_b', n, k, m,
1.f, a->ptr<T>(), '@Trans_a'=='N'? m : n,
b->ptr<T>(), '@Trans_b' == 'N' ? k : m,
0.f, c->ptr<T>(), k));

View File

@ -706,7 +706,7 @@ def _no_grad_trunc_normal_(var, mean, std, a, b):
"The distribution of values may be incorrect.",
stacklevel=2)
with jt.no_grad():
# Values are generated by using a truncated uniform distribution and
# then using the inverse CDF for the normal distribution.
# Get upper and lower cdf values
@ -716,16 +716,16 @@ def _no_grad_trunc_normal_(var, mean, std, a, b):
# Uniformly fill tensor with values from [l, u], then translate to
# [2l-1, 2u-1].
# var.uniform(2 * l - 1, 2 * u - 1)
jt.init.uniform_(var, low=2 * l - 1, high=2 * u - 1)
var.uniform_(low=2 * l - 1, high=2 * u - 1)
# Use inverse cdf transform for normal distribution to get truncated
# standard normal
var.erfinv()
var = var.erfinv()
# Transform to proper mean, std
var.multiply(std * math.sqrt(2.))
var.add(mean)
var = var.multiply(std * math.sqrt(2.))
var = var.add(mean)
# Clamp to ensure it's in the proper range
var.clamp(min_v=a, max_v=b)
var = var.clamp(min_v=a, max_v=b)
return var
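The point of the reassignments above, sketched: jt.Var math ops such as erfinv and clamp return new Vars rather than mutating in place, so dropping the result silently loses the update (uniform_ is assumed to be the one in-place, PyTorch-style initializer here):

    import jittor as jt
    v = jt.random([4])
    v.erfinv()                        # result discarded, v is unchanged
    v = v.erfinv()                    # the pattern the patch switches to
    v.uniform_(low=-1.0, high=1.0)    # in-place initializer, no reassignment needed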

View File

@ -488,18 +488,11 @@ def arctan2(y,x):
angle = jt.zeros(x.shape,dtype=x.dtype)
x = (x!=0.0).ternary(x, x+1e-30)
angle = (y/x).arctan()
mask = (y<0) & (x<0)
if angle[mask].numel()>0:
angle[mask] -= np.pi
mask = (y>=0) &(x<0)
if angle[mask].numel()>0:
angle[mask] +=np.pi
mask = (y<0) | ((y==0) & (x<0))
angle = angle + mask*np.pi
return angle
def nonzero(x):
r'''
Return the index of the elements of input tensor which are not equal to zero.

View File

@ -143,7 +143,7 @@ class ResNet(nn.Module):
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = self.avgpool(x).float_auto()
x = jt.reshape(x, (x.shape[0], -1))
x = self.fc(x)
return x
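A sketch of what float_auto does to the pooled feature map, assuming it follows the global amp preference (float16 when half precision is preferred, float32 otherwise):

    import jittor as jt
    x = jt.random([1, 512, 1, 1])
    print(x.float_auto().dtype)            # float32 with amp disabled (the default)
    with jt.flag_scope(amp_reg=2):         # prefer fp16
        print(x.float_auto().dtype)        # float16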

View File

@ -37,6 +37,7 @@ def matmul_transpose(a, b):
assert len(a.shape) == 2 and len(b.shape) == 2
shape = list(a.shape)[:-1] + list(b.shape)
with jt.flag_scope(amp_reg = jt.flags.amp_reg | 4):
a = a.broadcast(shape, [len(shape)-2])
b = b.broadcast(shape)
return (a*b).sum(len(shape)-1)
@ -108,6 +109,7 @@ Example::
c = jt.matmul(a, b)
assert c.shape == [8, 10, 3, 5]
'''
with jt.flag_scope(amp_reg = jt.flags.amp_reg | 4):
len_a = len(a.shape)
len_b = len(b.shape)
if len_b == 1:
@ -488,19 +490,22 @@ class BCEWithLogitsLoss(Module):
def execute(self, output, target):
return binary_cross_entropy_with_logits(output,target,self.weight,self.pos_weight,self.size_average)
def softmax(x, dim = None):
def softmax(x, dim=None, log=False):
import jittor.other.code_softmax as code_softmax
if code_softmax.can_softmax_v1(x, dim):
return code_softmax.softmax_v1(x, log)
if dim is None:
x = (x - x.max()).exp()
ret = x / x.sum()
else:
x = (x-x.max(dim, keepdims=True)).exp()
ret = x / x.sum(dim, keepdims=True)
if log: return ret.log()
return ret
jt.Var.softmax = softmax
def log_softmax(x,dim=None):
x = softmax(x,dim=dim)
return jt.log(x)
return softmax(x,dim=dim, log=True)
jt.Var.log_softmax = log_softmax
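A quick check of the new log path: softmax(..., log=True) and log_softmax should agree with taking the log afterwards (a sketch; the fused CUDA kernel in code_softmax is only used when its constraints are met):

    import jittor as jt
    from jittor import nn
    x = jt.random([4, 1000])
    p = nn.softmax(x, dim=-1)
    logp = nn.log_softmax(x, dim=-1)
    print((logp - p.log()).abs().max())    # expected to be ~0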
def log_sigmoid(x):
@ -829,6 +834,7 @@ class Conv(Module):
oh = (H+self.padding[0]*2-Kh*self.dilation[0]+self.dilation[0]-1)//self.stride[0]+1
ow = (W+self.padding[1]*2-Kw*self.dilation[1]+self.dilation[1]-1)//self.stride[1]+1
assert oh>0 and ow>0
with jt.flag_scope(amp_reg = jt.flags.amp_reg | 4):
xx = x.reindex([N,self.out_channels,C,oh,ow,Kh,Kw], [
'i0', # Nid
'i2', # Cid
@ -1005,6 +1011,18 @@ class Conv3d(Module):
def execute(self, x):
return conv3d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
class Conv1d_sp(Linear):
def __init__(self, inchannels, outchannels, kernel_size=1, bias=True):
super().__init__(inchannels, outchannels, bias=bias)
assert kernel_size == 1
def execute(self, x):
x = x.transpose(0, 2, 1)
x = super().execute(x)
x = x.transpose(0, 2, 1)
return x
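Usage sketch for Conv1d_sp, a 1x1 1-D convolution expressed as a Linear over the channel dimension (assuming it is reachable as jittor.nn.Conv1d_sp):

    import jittor as jt
    from jittor import nn
    conv = nn.Conv1d_sp(16, 32)            # kernel_size must be 1
    x = jt.random([2, 16, 100])            # (N, C_in, L)
    y = conv(x)
    print(y.shape)                         # expected [2, 32, 100]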
def conv2d(x, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
''' Applies a 2D convolution over an input signal composed of several input planes.
@ -1045,6 +1063,7 @@ def conv2d(x, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
Kh, Kw = weight.shape[-2:]
oh = (H+padding[0]*2-Kh*dilation[0]+dilation[0]-1)//stride[0]+1
ow = (W+padding[1]*2-Kw*dilation[1]+dilation[1]-1)//stride[1]+1
with jt.flag_scope(amp_reg = jt.flags.amp_reg | 4):
xx = x.reindex([N,out_channels,C,oh,ow,Kh,Kw], [
'i0', # Nid
'i2', # Cid

View File

@ -1,7 +1,7 @@
#!python3
import os, json
from pathlib import Path
notebook_dir = os.path.join(str(Path.home()), ".cache","jittor","notebook")
import jittor_utils as jit_utils
notebook_dir = os.path.join(jit_utils.home(), ".cache","jittor","notebook")
if not os.path.isdir(notebook_dir):
os.mkdir(notebook_dir)
dirname = os.path.dirname(__file__)

View File

@ -0,0 +1,130 @@
import jittor as jt
from jittor import nn
def can_softmax_v1(a, dim):
if not jt.flags.use_cuda:
return False
if dim != -1 and dim != len(a.shape)-1:
return False
if a.shape[len(a.shape)-1] > 10000:
return False
return True
def softmax_v1(a, log=False):
assert can_softmax_v1(a, -1)
length = a.shape[-1]
# tnum = 1024
tnum = 500 if length % 500 == 0 else 512
tnum = 125 if length % 125 == 0 else 128
# tnum = 125
# tnum = 1000 if length % 1000 == 0 else 1024
# tnum = 250
per_thread = (length-1) // tnum + 1
ILP = 1
for ilp in [8,4,2]:
if length % tnum == 0 and per_thread % ilp == 0:
ILP = ilp
per_thread //= ILP
break
for_loop = f"""
#pragma unroll
for (int i=0; i<{per_thread}; i++)
"""
if length % tnum != 0:
for_loop += f"if ((i*{tnum}+threadIdx.x)*{ILP} < len)\n"
return jt.code(a.shape, a.dtype, [a], cuda_header=f'''
#include <{jt.compile_extern.cub_home}cub/cub.cuh>
#include <type/fp16_compute.h>
''', cuda_src=f'''
__global__ void kernel(in0_type* x, out0_type* y, int len) {{
typedef cub::BlockReduce<float, {tnum}> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
int id = blockIdx.x * len;
in0_type v[{per_thread}][{ILP}];
{for_loop}
vload<sizeof(in0_type)*{ILP}>(v[i], &x[id+(i*{tnum}+threadIdx.x)*{ILP}]);
// v[i] = x[id+i*{tnum}+threadIdx.x];
float v1 = -1e30;
{for_loop}
#pragma unroll
for (int j=0; j<{ILP}; j++) {{
v1 = max(v1, float(v[i][j]));
}}
__shared__ float vmax;
auto tmp = BlockReduce(temp_storage).Reduce(v1, cub::Max());
if (threadIdx.x == 0)
vmax = tmp;
__syncthreads();
v1 = 0;
{for_loop}
#pragma unroll
for (int j=0; j<{ILP}; j++) {{
v[i][j] = expf(float(v[i][j]) - vmax);
v1 += float(v[i][j]);
}}
tmp = BlockReduce(temp_storage).Sum(v1);
__shared__ float vsum;
if (threadIdx.x == 0)
vsum = tmp;
__syncthreads();
{for_loop}
#pragma unroll
for (int j=0; j<{ILP}; j++)
v[i][j] = {
"@expand_op(log,@in0_type,float(v[i][j])/vsum)" if log
else "float(v[i][j])/vsum"
};
{for_loop}
vload<sizeof(in0_type)*{ILP}>(&y[id+(i*{tnum}+threadIdx.x)*{ILP}], v[i]);
}}
int len = in0->shape[in0->shape.size()-1];
int bnum = in0->numel() / len;
cudaGetLastError();
kernel<<<bnum, {tnum}>>>(in0_p, out0_p, len);
CHECK(0 == cudaGetLastError());
''', cuda_grad_src=[f"""
__global__ void kernel(pout0_type* x, dout_type* y, out0_type* z, int len) {{
int id = blockIdx.x * len;
in0_type vx[{per_thread}][{ILP}];
in0_type vy[{per_thread}][{ILP}];
{for_loop} {{
vload<sizeof(in0_type)*{ILP}>(vx[i], &x[id+(i*{tnum}+threadIdx.x)*{ILP}]);
vload<sizeof(in0_type)*{ILP}>(vy[i], &y[id+(i*{tnum}+threadIdx.x)*{ILP}]);
}}
float v1 = 0;
{for_loop}
#pragma unroll
for (int j=0; j<{ILP}; j++)
v1 += {"float(vy[i][j]);" if log else "float(vx[i][j]*vy[i][j]);"}
typedef cub::BlockReduce<float, {tnum}> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
auto tmp = BlockReduce(temp_storage).Sum(v1);
__shared__ float reduce_var;
if (threadIdx.x == 0)
reduce_var = tmp;
__syncthreads();
{for_loop}
#pragma unroll
for (int j=0; j<{ILP}; j++)
vx[i][j] = {
"vy[i][j] - in0_type(expf(vx[i][j]) * reduce_var);" if log
else "vx[i][j] * (vy[i][j] - in0_type(reduce_var));"
}
{for_loop}
vload<sizeof(in0_type)*{ILP}>(&z[id+(i*{tnum}+threadIdx.x)*{ILP}],
vx[i]);
}}
int len = in0->shape[in0->shape.size()-1];
int bnum = in0->numel() / len;
cudaGetLastError();
kernel<<<bnum, {tnum}>>>(pout0_p, dout_p, out0_p, len);
CHECK(0 == cudaGetLastError());
"""])

View File

@ -60,14 +60,14 @@ class Pool(Module):
'''
if not self.return_indices:
forward_body += f'''
@out(i0, i1, i2, i3) = init_{self.op}(out_type);
@out(i0, i1, i2, i3) = @expand_op(init_{self.op}, @out_type);
for (int p = k2; p < k2_; ++p)
for (int q = k3; q < k3_; ++q)
@out(i0, i1, i2, i3) = {self.op}(out_type, @out(i0, i1, i2, i3), @in0(i0, i1, p, q));
@out(i0, i1, i2, i3) = @expand_op({self.op}, @out_type, @out(i0, i1, i2, i3), @out_type, @in0(i0, i1, p, q), @in0_type);
'''
else:
forward_body += f'''
auto out_value = init_{self.op}(out_type);
auto out_value = @expand_op(init_{self.op}, @out_type);
int out_index = -1;
for (int p = k2; p < k2_; ++p)
for (int q = k3; q < k3_; ++q)
@ -105,7 +105,6 @@ class Pool(Module):
return_dtypes = x.dtype
out = jt.code(return_shapes, return_dtypes, [x],
cuda_header="""
#include <ops/binary_op_defs.h>
#include <misc/cuda_limits.h>
""",
cuda_src=f'''
@ -121,8 +120,8 @@ class Pool(Module):
for (int i2 = p2; i2 < out_shape2; i2 += s2)
{{ {forward_body} }}
}}
int tx = min(1024, out_shape3);
int ty = min(1024 / tx, out_shape2);
int tx = std::min(1024, out_shape3);
int ty = std::min(1024 / tx, out_shape2);
int bx = (out_shape2 - 1) / ty + 1;
int by = out_shape1;
int bz = out_shape0;
@ -144,8 +143,8 @@ class Pool(Module):
{{ {backward_body} }}
}}
cudaMemsetAsync(out_p, 0, out->size);
int tx = min(1024, pout_shape3);
int ty = min(1024 / tx, pout_shape2);
int tx = std::min(1024, pout_shape3);
int ty = std::min(1024 / tx, pout_shape2);
int bx = (pout_shape2 - 1) / ty + 1;
int by = pout_shape1;
int bz = pout_shape0;
@ -153,7 +152,7 @@ class Pool(Module):
dim3 s2_(tx, ty);
kernel3<<<s1_, s2_>>>(@ARGS);
'''],
cpu_header='#include <ops/binary_op_defs.h>',
cpu_header='',
cpu_src=f'''
using namespace std;
for (int i0=0; i0<out_shape0; i0++)
@ -242,15 +241,15 @@ class Pool3d(Module):
'''
if not self.return_indices:
forward_body += f'''
@out(i0, i1, i2, i3, i4) = init_{self.op}(out_type);
@out(i0, i1, i2, i3, i4) = @expand_op(init_{self.op}, @out_type);
for (int p = k2; p < k2_; ++p)
for (int q = k3; q < k3_; ++q)
for (int r = k4; r < k4_; ++r)
@out(i0, i1, i2, i3, i4) = {self.op}(out_type, @out(i0, i1, i2, i3, i4), @in0(i0, i1, p, q, r));
@out(i0, i1, i2, i3, i4) = @expand_op({self.op}, @out_type, @out(i0, i1, i2, i3, i4), @out_type, @in0(i0, i1, p, q, r), @in0_type);
'''
else:
forward_body += f'''
auto out_value = init_{self.op}(out_type);
auto out_value = @expand_op(init_{self.op}, @out_type);
int out_index = -1;
for (int p = k2; p < k2_; ++p)
for (int q = k3; q < k3_; ++q)
@ -293,7 +292,6 @@ class Pool3d(Module):
return_dtypes = x.dtype
out = jt.code(return_shapes, return_dtypes, [x],
cuda_header="""
#include <ops/binary_op_defs.h>
#include <misc/cuda_limits.h>
""",
cuda_src=f'''
@ -312,9 +310,9 @@ class Pool3d(Module):
for (int i2 = p2; i2 < out_shape2; i2 += s2)
{{ {forward_body} }}
}}
int tx = min(1024, out_shape4);
int ty = min(1024 / tx, out_shape3);
int tz = min(1024 / tx / ty, out_shape2);
int tx = std::min(1024, out_shape4);
int ty = std::min(1024 / tx, out_shape3);
int tz = std::min(1024 / tx / ty, out_shape2);
int bx = (out_shape2 - 1) / tz + 1;
int by = out_shape1;
int bz = out_shape0;
@ -339,9 +337,9 @@ class Pool3d(Module):
{{ {backward_body} }}
}}
cudaMemsetAsync(out_p, 0, out->size);
int tx = min(1024, pout_shape4);
int ty = min(1024 / tx, pout_shape3);
int tz = min(1024 / tx / ty, pout_shape2);
int tx = std::min(1024, pout_shape4);
int ty = std::min(1024 / tx, pout_shape3);
int tz = std::min(1024 / tx / ty, pout_shape2);
int bx = (pout_shape2 - 1) / tz + 1;
int by = pout_shape1;
int bz = pout_shape0;
@ -349,7 +347,7 @@ class Pool3d(Module):
dim3 s2(tx, ty, tz);
kernel3<<<s1, s2>>>(@ARGS);
'''],
cpu_header='#include <ops/binary_op_defs.h>',
cpu_header='',
cpu_src=f'''
using namespace std;
for (int i0=0; i0<out_shape0; i0++)

View File

@ -509,6 +509,11 @@ void Executor::run_sync(vector<Var*> vars, bool device_sync) {
var->alloc(cpu_allocator);
}
}
} else {
for (Var* v : op->inputs()) {
if (!v->allocator->is_cuda())
migrate_to_gpu(v, allocator);
}
}
#endif
#ifdef NODE_MEMCHECK

View File

@ -22,6 +22,10 @@ struct Executor {
Allocator* temp_allocator;
bool last_is_cuda = false;
void run_sync(vector<Var*> vars, bool device_sync);
inline Allocation alloc_temp(size_t size) {
return Allocation(temp_allocator, size);
}
};
EXTERN_LIB Executor exe;

View File

@ -9,6 +9,7 @@
#include "op_compiler.h"
#include "profiler/profiler.h"
#include "misc/fast_shared_ptr.h"
#include "misc/cuda_flags.h"
namespace jittor {
@ -42,6 +43,7 @@ void FusedOp::update_ops() {
loop_options_tuned.clear();
loop_options = loop_options_origin = nullptr;
_inputs.clear();
_outputs.clear();
for (Op* op : ops) {
for (Var* o : op->outputs()) {
@ -101,6 +103,7 @@ void FusedOp::update_ops() {
if (!(c&2)) {
c += 2 + vars.size()*4;
vars.push_back({i, 0});
_inputs.emplace_back((Node*)i);
}
}
for (Var* o : opi->outputs()) {
@ -135,6 +138,7 @@ FusedOp::FusedOp(const FusedOp& other) {
}
FusedOp::~FusedOp() {
_inputs.clear();
_outputs.clear();
Op::number_of_lived_ops++;
}
@ -159,20 +163,15 @@ void FusedOp::statistics(uint64_t& in, uint64_t& out, uint64_t& compute) {
void FusedOp::do_jit_prepare(JK& jk) {
jk.clear();
int8 flags = 3;
for (uint i=0; i<ops.size(); i++) {
Op* op = ops[i];
jk << "[opkey" << i << JK::val;
op->do_jit_prepare(jk);
jk << op->name();
op->jit_prepare(jk);
jk << JK::end;
if (op->flags.get(NodeFlags::_cpu))
flags &= 1; // only cpu
else
flags &= 2; // only gpu
}
ASSERT(flags) << "FusedOp cannot contain both cpu and cuda ops.";
jk << _CS("[JIT:1]");
if (flags==1) {
if (!use_cuda) {
// only cpu
jk << _CS("[JIT_cpu:1]");
this->flags.set(NodeFlags::_cuda, 0);
@ -189,9 +188,17 @@ void FusedOp::do_jit_prepare(JK& jk) {
jk << JK::hex2(i) << JK::hex1(j) << JK::hex2(k) << JK::hex1(l) << ',';
}
jk << _CS("][var_info:") << JK::val;
for (auto& vi : vars)
bool use_int64_t = false;
for (auto& vi : vars) {
jk << JK::hex1(vi.type) << JK::hex1(vi.var->shape.size());
if (vi.type != 1 && vi.var->num >= std::numeric_limits<int32_t>::max())
use_int64_t = true;
}
jk << JK::end;
if (use_int64_t)
jk << _CS("[index_t:int64]");
else
jk << _CS("[index_t:int32]");
if (loop_options->size()) {
if (get_loop_option("compile_shapes")) {
jk << _CS("[shapes:");

View File

@ -39,11 +39,24 @@ template<class T> struct StackIniter {
#define STACK_ALLOC2(T, a, n) T a[n]
#endif
struct AmpGradGuard {
int amp_reg_bk;
AmpGradGuard(Op* op) {
amp_reg_bk = amp_reg;
amp_reg |= (op->flags.flags >> NodeFlags::_prefer_32);
}
~AmpGradGuard() {
amp_reg = amp_reg_bk;
}
};
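A sketch of the behaviour this guard provides: ops record their amp bits (amp_reg & 7) at creation, and the guard re-applies those bits while their grad ops are built, so a backward pass started outside the flag scope still sees the forward precision policy (the dtype expectation below assumes the inference rules shown later in this diff):

    import jittor as jt
    with jt.flag_scope(amp_reg=2):         # prefer fp16 in the forward pass
        x = jt.random([8, 8]).float16()
        y = (x * x).sum()
    gx = jt.grad(y, [x])[0]                # grad built outside the scope
    print(gx.dtype)                        # expected float16: backward ops inherit the bits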
VarPtr make_grad(Op* op, Var* out, Var* dout, Var* x, int x_index) {
if (dout == nullptr) return nullptr;
if (x_index<0) return nullptr;
LOGvvvv << "Make grad op:" >> op->name() << "inputs:" >> op->inputs()
<< "out:" >> out << "dout:" >> dout << "x:" >> x << "xid:" >> x_index;
AmpGradGuard agg(op);
auto dx = op->grad(out, dout, x, x_index);
if (x->loop_options)
dx->loop_options = x->loop_options;
@ -182,7 +195,10 @@ vector<VarPtr> grad(Var* loss, vector<Var*> targets) {
douts[i] = nullptr;
}
trace_grad_op = op;
{
AmpGradGuard agg(op);
op->grads(douts, dins);
}
// dump "for (Var* in : op->inputs())"
for (int i=0; i<n_i; i++,j++) {
auto id = id_buffer[j].second;

View File

@ -43,7 +43,7 @@ void cleanup() {
}
static void init_cuda_devices() {
#ifdef HAS_CUDA
#ifdef IS_CUDA
if (cuda_archs.size()) return;
int count=0;
cudaGetDeviceCount(&count);

View File

@ -20,6 +20,7 @@
#include "utils/flags.h"
#include "fused_op.h"
#include "utils/str_utils.h"
JPU(header)
namespace jittor {
@ -204,6 +205,8 @@ jit_op_entry_t compile(const string& jit_key, const string& src, const bool is_c
// compiler do not allowed filename too long
CHECK(cc_path.size());
string jit_src_path = Op::get_filename_from_jit_key(jit_key, ".cc");
string* src2 = (string*)&src;
JPU(op_compiler(jit_src_path, *src2, is_cuda_op));
#ifdef _WIN32
string jit_lib_path = Op::get_filename_from_jit_key(jit_key, ".dll");
string jit_src_path2 = _to_winstr(jit_src_path);

View File

@ -167,7 +167,7 @@ inline JK& operator<<(JK& jk, int64 c) {
}
#ifdef __linux__
inline JK& operator<<(JK& jk, long long int c) {
inline JK& operator<<(JK& jk, int64_t c) {
return jk << (int64)c;
}
#endif

View File

@ -107,7 +107,7 @@ void migrate_to_cpu(Var* var, Allocator* allocator) {
if (!use_cuda_managed_allocator) {
// must be a device allocator
Allocation a(allocator, var->size);
checkCudaErrors(cudaMemcpy(a.ptr, var->mem_ptr, var->size, cudaMemcpyDefault));
checkCudaErrors(cudaMemcpy(a.ptr, var->mem_ptr, var->size, cudaMemcpyDeviceToHost));
var->allocator->free(var->mem_ptr, var->size, var->allocation);
var->mem_ptr = a.ptr;
var->allocation = a.allocation;

View File

@ -62,7 +62,7 @@ void display_memory_info(const char* fileline, bool dump_var, bool red_color) {
log << "\n=== display_memory_info ===\n";
log << "total_cpu_ram:" <<
FloatOutput{(double)mem_info.total_cpu_ram, " KMG", 1024, "B"};
log << "total_cuda_ram:" <<
log << "total_device_ram:" <<
FloatOutput{(double)mem_info.total_cuda_ram, " KMG", 1024, "B"} >> "\n";
log << "hold_vars:" << hold_vars.size()
<< "lived_vars:" << Var::number_of_lived_vars
@ -105,7 +105,7 @@ void display_memory_info(const char* fileline, bool dump_var, bool red_color) {
auto total = a->used_memory + a->unused_memory;
all_total += total;
a->is_cuda() ? gpu_total += total : cpu_total += total;
log << "name:" << a->name() << "is_cuda:" << a->is_cuda()
log << "name:" << a->name() << "is_device:" << a->is_cuda()
<< "used:" << FloatOutput{(double)a->used_memory, " KMG", 1024, "B"}
>> "(" >> std::setprecision(p) >> a->used_memory*100.0 / total >> "%)"
<< "unused:" << FloatOutput{(double)a->unused_memory, " KMG", 1024, "B"}
@ -117,7 +117,7 @@ void display_memory_info(const char* fileline, bool dump_var, bool red_color) {
auto total = a->used_memory + a->unused_memory;
all_total += total;
a->is_cuda() ? gpu_total += total : cpu_total += total;
log << "name:" << a->name() << "is_cuda:" << a->is_cuda()
log << "name:" << a->name() << "is_device:" << a->is_cuda()
<< "used:" << FloatOutput{(double)a->used_memory, " KMG", 1024, "B"}
>> "(" >> std::setprecision(p) >> a->used_memory*100.0 / total >> "%)"
<< "unused:" << FloatOutput{(double)a->unused_memory, " KMG", 1024, "B"}
@ -227,9 +227,9 @@ MemInfo::MemInfo() {
total_cuda_ram = 0;
#ifdef HAS_CUDA
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
total_cuda_ram = prop.totalGlobalMem;
size_t gpu_free = 0, _gpu_total = 0;
cudaMemGetInfo(&gpu_free, &_gpu_total);
total_cuda_ram = _gpu_total;
#endif
sigquit_callback.push_back(&meminfo_callback);
}

View File

@ -24,7 +24,7 @@ inline int get_device_count() {
} // jittor
#if CUDART_VERSION < 10000
#if defined(CUDART_VERSION) && CUDART_VERSION < 10000
#define _cudaLaunchHostFunc(a,b,c) \
cudaStreamAddCallback(a,b,c,0)
#define CUDA_HOST_FUNC_ARGS cudaStream_t stream, cudaError_t status, void*

View File

@ -13,7 +13,8 @@ namespace jittor {
struct Deleter {
std::function<void()> del;
inline Deleter(std::function<void()>&& func) : del(move(func)) {}
inline ~Deleter() { del(); }
inline Deleter() {}
inline ~Deleter() { if (del) del(); }
};
} // jittor

View File

@ -17,7 +17,7 @@
namespace jittor {
#ifdef HAS_CUDA
#ifdef IS_CUDA
EXTERN_LIB void check_nan_float32(float32* ptr, int64 num);
EXTERN_LIB void check_nan_float64(float64* ptr, int64 num);
#endif
@ -28,7 +28,7 @@ bool check_nan(Var* v) {
v->input()->name() == string("empty") ||
v->input()->name() == string("setitem")))
return true;
#ifdef HAS_CUDA
#ifdef IS_CUDA
if (v->allocator->is_cuda()) {
if (v->dtype() == ns_float32) {
check_nan_float32((float32*)v->mem_ptr, v->num);

View File

@ -9,6 +9,17 @@
namespace jittor {
DEFINE_FLAG(int, amp_reg, 0, "Auto mixed-precision control registers, bit 0: prefer 32; bit 1: prefer 16; bit 2: keep reduce type; bit 3: keep white list type; bit 4: array-like ops apply the preference too");
DEFINE_FLAG_WITH_SETTER(int, auto_mixed_precision_level, 0, "Auto mixed-precision optimization level, 0: do not use fp16; 1-3: preserve level, do not use fp16 for now; 4: prefer fp16, but some ops use fp32, e.g. sum, exp; 5: similar to 4, and array ops will automatically convert to fp16; 6: all ops prefer fp16");
void setter_auto_mixed_precision_level(int value) {
if (value <= 3) amp_reg = 0; else
if (value == 4) amp_reg = amp_prefer16; else
if (value == 5) amp_reg = amp_prefer16 | amp_array_prefer; else
if (value == 6) amp_reg = amp_prefer16 | amp_array_prefer | amp_keep_reduce | amp_keep_white;
}
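A small sketch of the mapping, using the amp_* bit values defined later in this diff (amp_prefer16 = 2, amp_array_prefer = 16):

    import jittor as jt
    jt.flags.auto_mixed_precision_level = 5
    print(jt.flags.amp_reg)                # expected 18 == amp_prefer16 | amp_array_prefer
    jt.flags.auto_mixed_precision_level = 0
    print(jt.flags.amp_reg)                # back to 0, pure float32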
#define FOR_ALL_TYPES(m) \
m(bool) \
m(int8) \
@ -89,15 +100,18 @@ static unordered_set<string> unary_ops = {
"erfinv"
};
static unordered_set<string> unary_float_ops = {
static unordered_set<string> float_ops = {
"log",
"exp",
"sqrt",
"mean",
"divide",
};
static unordered_set<string> unary_int_ops = {
static unordered_set<string> int_ops = {
"round_int",
"floor_int",
"ceil_int",
"floor_divide",
};
static unordered_set<string> binary_ops = {
@ -127,6 +141,13 @@ static unordered_set<string> binary_ops = {
"mean",
};
static unordered_set<string> white_ops = {
// "log",
"exp",
"pow",
};
#define DEFINE_NS(T) NanoString ns_##T;
FOR_ALL_NS(DEFINE_NS);
@ -135,6 +156,9 @@ char __ns_to_string[ns_max_size*ns_max_len];
int __ns_len[ns_max_size];
static void init_ns() {
dsize_map["float16"] = 1;
is_float_map["float16"] = 1;
is_unsigned["float16"] = 0;
NanoString::ns_t i=0;
auto func = [&](const char* name, NanoString& ns) {
ns.set(NanoString::_index, i++, NanoString::_index_nbits);
@ -149,13 +173,16 @@ static void init_ns() {
if (unary_ops.count(name)) {
ns.set(NanoString::_type, NanoString::_unary, NanoString::_type_nbits);
ns.set(NanoString::_bool, is_bool.count(name));
ns.set(NanoString::_int, unary_int_ops.count(name));
ns.set(NanoString::_float, unary_float_ops.count(name));
ns.set(NanoString::_int, int_ops.count(name));
ns.set(NanoString::_float, float_ops.count(name));
} else
if (binary_ops.count(name)) {
ns.set(NanoString::_type, NanoString::_binary, NanoString::_type_nbits);
ns.set(NanoString::_bool, is_bool.count(name));
ns.set(NanoString::_int, int_ops.count(name));
ns.set(NanoString::_float, float_ops.count(name));
}
ns.set(NanoString::_white_list, white_ops.count(name));
__string_to_ns[name] = ns;
auto name2 = ns.to_cstring();
int len=0;
@ -171,6 +198,7 @@ static void init_ns() {
__string_to_ns["sum"] = ns_add;
__string_to_ns["min"] = ns_minimum;
__string_to_ns["max"] = ns_maximum;
__string_to_ns["half"] = ns_float16;
__string_to_ns["float"] = ns_float32;
__string_to_ns["double"] = ns_float64;
__string_to_ns["int"] = ns_int32;

View File

@ -24,6 +24,7 @@ constexpr int ns_max_len = 16;
m(uint16) \
m(uint32) \
m(uint64) \
m(float16) \
m(float32) \
m(float64) \
\
@ -100,7 +101,7 @@ struct NanoString {
typedef uint16 ns_t;
enum Flags {
// bit0~7: index
_index=0, _index_nbits=8,
_index=0, _index_nbits=7,
_n=_index_nbits,
// bit0-1: type
@ -116,6 +117,8 @@ struct NanoString {
_float=_n+5,
// bit6-7: dsize(1,2,4,8 byte)
_dsize=_n+6, _dsize_nbits=2,
// bit8: white list
_white_list=_n+8,
};
ns_t data=0;
@ -130,11 +133,16 @@ struct NanoString {
inline ns_t index() const { return get(_index, _index_nbits); }
inline int len() const { return __ns_len[index()]; }
inline ns_t type() const { return get(_type, _type_nbits); }
inline ns_t is_bool() const { return get(_bool); }
inline ns_t is_int() const { return get(_int); }
inline ns_t is_unsigned() const { return get(_unsigned); }
inline ns_t is_float() const { return get(_float); }
// @pyjt(is_bool)
inline bool is_bool() const { return get(_bool); }
// @pyjt(is_int)
inline bool is_int() const { return get(_int); }
inline bool is_unsigned() const { return get(_unsigned); }
// @pyjt(is_float)
inline bool is_float() const { return get(_float); }
inline ns_t is_white() const { return get(_white_list); }
inline ns_t dsize() const { return 1<<get(_dsize, _dsize_nbits); }
inline ns_t dsize_() const { return get(_dsize, _dsize_nbits); }
inline ns_t is_dtype() const { return get(_type, _type_nbits)==_dtype; }
inline ns_t is_binary() const { return get(_type, _type_nbits)==_binary; }
inline ns_t is_unary() const { return get(_type, _type_nbits)==_unary; }
@ -156,28 +164,6 @@ struct NanoString {
{ return __ns_to_string+index()*ns_max_len; }
};
// force_type = 1 for int, 2 for float
inline
NanoString dtype_infer(NanoString v1, NanoString v2, int force_type=0, NanoString op=ns_void) {
bool is_float = v1.is_float() || v2.is_float();
int dsize = std::max(v1.dsize(), v2.dsize());
if (force_type == 1)
is_float = false;
else if (force_type == 2)
is_float = true;
if (is_float) {
if (dsize==4) return ns_float32;
return ns_float64;
} else {
if (dsize==8) return ns_int64;
if (dsize==4) return ns_int32;
if (dsize==2) return ns_int16;
if (op.data == ns_add.data || op.data == ns_subtract.data)
return ns_int8;
return v1;
}
}
// @pyjt(NanoString.__eq__)
inline bool eq(const NanoString& a, const NanoString& b) {
return a.data == b.data;
@ -199,4 +185,72 @@ inline std::ostream& operator<<(std::ostream& os, const NanoString& v) {
return os << v.to_cstring();
}
EXTERN_LIB int amp_reg;
constexpr int amp_prefer32 = 1;
constexpr int amp_prefer16 = 2;
constexpr int amp_keep_reduce = 4;
constexpr int amp_keep_white = 8;
constexpr int amp_array_prefer = 16;
inline NanoString float_dtype(int dsize_) {
if (amp_reg & amp_prefer32) return ns_float32;
if (amp_reg & amp_prefer16) return ns_float16;
return (dsize_ == 3) ? ns_float64 :
(dsize_ == 2 ) ? ns_float32 : ns_float16;
}
inline NanoString int_dtype(int dsize_) {
return (dsize_ == 3) ? ns_int64 :
(dsize_ == 2) ? ns_int32 :
(dsize_ == 1) ? ns_int16 : ns_int8;
}
inline NanoString dtype_infer(NanoString x, NanoString y) {
int dsize_ = std::max(x.dsize_(), y.dsize_());
bool is_float = x.is_float() || y.is_float();
if (is_float)
return float_dtype(dsize_);
else {
return int_dtype(dsize_);
}
}
inline NanoString binary_dtype_infer(NanoString op, NanoString x, NanoString y) {
if (op.is_bool()) return ns_bool;
int dsize_ = std::max(x.dsize_(), y.dsize_());
bool is_float = !op.is_int() &&
(x.is_float() || y.is_float() || op.is_float());
if (is_float) {
if (op.is_white() && !(amp_reg & amp_keep_white))
return (dsize_ == 3) ? ns_float64 : ns_float32;
return float_dtype(dsize_);
} else {
return int_dtype(dsize_);
}
}
inline NanoString unary_dtype_infer(NanoString op, NanoString x) {
if (op.is_bool()) return ns_bool;
int dsize_ = x.dsize_();
if (op.is_float()) {
if (op.is_white() && !(amp_reg & amp_keep_white))
return (dsize_ == 3) ? ns_float64 : ns_float32;
return float_dtype(dsize_);
}
if (op.is_int()) return int_dtype(dsize_);
return x;
}
inline NanoString reduce_dtype_infer(NanoString op, NanoString x) {
bool is_float = x.is_float() || op.is_float();
int dsize_ = x.dsize_();
if (is_float) {
if (amp_reg & amp_keep_reduce)
return float_dtype(dsize_);
return (dsize_ == 3) ? ns_float64 : ns_float32;
} else {
return x;
}
}
}
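A rough Python-level check of the inference rules above, assuming default flags (amp_reg == 0) and that the Python dtypes mirror this inference:

    import jittor as jt
    a = jt.random([8, 8]).float16()
    b = jt.random([8, 8]).float16()
    print((a + b).dtype)      # float16: a plain binary op keeps the half type
    print(a.exp().dtype)      # float32: exp is white-listed, promoted unless amp_keep_white is set
    print(a.sum().dtype)      # float32: reductions promote unless amp_keep_reduce is set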

View File

@ -16,12 +16,12 @@
namespace jittor {
#if defined(__clang__)
#if __cplusplus < 201400L || defined(IS_ACL)
using string_view = string;
#elif defined(__clang__)
using std::string_view;
#elif defined(__GNUC__)
using std::experimental::string_view;
#elif __cplusplus < 201400L
using string_view = string;
#else
using std::string_view;
#endif

View File

@ -51,8 +51,14 @@ struct NodeFlags {
_grads=_n+6,
// bit7: has graph optimize
_has_gopt=_n+7,
// bit7: has vary input
// bit8: has vary input
_has_vary_input=_n+8,
// bit9: prefer 32 bit
_prefer_32=_n+9,
// bit10: force 16 bit
_prefer_16=_n+10,
// bit11: reduce keep type unchange
_reduce_keep=_n+11,
};
inline void set(Flags f, int a=1, int nbits=1) {
@ -90,7 +96,7 @@ struct Node {
operator Var*() { return (Var*)node; }
operator var_output_t() { return {(Op*)node, index}; }
};
static int64_t tflag_count;
static int64 tflag_count;
NodeFlags flags;
NanoString ns;
inline bool is_var() const { return flags.get(NodeFlags::_var); }

View File

@ -25,11 +25,12 @@ DEFINE_FLAG(int, try_use_32bit_index, 0,
string_view_map<jit_op_entry_t> jit_ops;
string_view_map<string> jit_key_mapper;
int64_t Op::number_of_lived_ops = 0;
int64 Op::number_of_lived_ops = 0;
Op::Op() {
flags.set(NodeFlags::_var, 0);
flags.set(NodeFlags::_cpu, 1);
flags.flags |= ((amp_reg & 7) << NodeFlags::_prefer_32);
number_of_lived_ops++;
if (PREDICT_BRANCH_NOT_TAKEN(trace_py_var)) trace_data.record_node(this);
}
@ -122,43 +123,24 @@ void Op::do_jit_prepare(JK& jk) {
if (has_cuda && has_cpu && !use_cuda)
flags.set(NodeFlags::_cuda, 0);
} else {
// check use int64_t as index_t if array is too big
int in_id=0, out_id=0;
bool use_int64_t = false;
// TODO: fused op do not have inputs,
// check use_cuda_op from outputs may not be enough
bool use_cuda_op = use_cuda;
for (Var* var : inputs()) {
if (var->mem_ptr) {
/* jit key don't include here, because
parallel compiler don't known
jk << JK::key << "alloc_i" << JK::hex1(in_id)
<< JK::hex1(var->allocator->flags()) << JK::end;
*/
use_cuda_op &= var->allocator->is_cuda();
}
if (var->num >= std::numeric_limits<int32_t>::max())
use_int64_t = true;
in_id ++;
}
for (Var* var : outputs()) {
if (var->mem_ptr) {
/*
jk << JK::key << "alloc_o" << JK::hex1(in_id)
<< JK::hex1(var->allocator->flags()) << JK::end;
*/
use_cuda_op &= var->allocator->is_cuda();
}
if (var->num >= std::numeric_limits<int32_t>::max())
use_int64_t = true;
out_id ++;
}
jk << _CS("[JIT:1]");
if (use_cuda_op && flags.get(NodeFlags::_cuda)) {
jk << _CS("[JIT_cuda:1]");
flags.set(NodeFlags::_cpu, 0);
// TODO: 64bit index in CUDA
use_int64_t = false;
// use_int64_t = false;
} else {
if (use_cuda==2) {
if (flags.get(NodeFlags::_cuda))
@ -268,11 +250,15 @@ void Op::jit_run(JK& jk) {
void Op::statistics(uint64_t& in, uint64_t& out, uint64_t& compute) {
in = out = compute = 0;
for (Var* var : inputs()) {
for (auto& e : _inputs) {
auto var = e.node->var();
if (e.back->index<0) continue;
in += var->size;
compute = std::max(compute, (uint64_t)var->num);
}
for (Var* var : outputs()) {
for (auto& e : _outputs) {
auto var = e.node->var();
if (e.index<0) continue;
out += var->size;
compute = std::max(compute, (uint64_t)var->num);
}

View File

@ -15,7 +15,7 @@ namespace jittor {
enum OpType {other=0, element=1, broadcast=2, reduce=3};
struct Op : Node {
vector<VarPtr> outputs_holder;
static int64_t number_of_lived_ops;
static int64 number_of_lived_ops;
inline Caster<Var*, Node::input_t> inputs() { CHECK_EXIST; return &_inputs; }
inline Caster<Var*, Node::output_t> outputs() { CHECK_EXIST; return &_outputs; }

View File

@ -112,7 +112,7 @@ int OpCompiler::total_member_count() {
return member_count;
}
int64_t OpCompiler::eval(const string& expr, const unordered_map<string,string>& vars) {
int64 OpCompiler::eval(const string& expr, const unordered_map<string,string>& vars) {
if (expr.find("@") != string::npos) {
string new_expr;
for (size_t i=0; i<expr.size(); i++) {
@ -223,6 +223,16 @@ void load_macros(const string& src, unordered_map<string,string>& macros) {
}
}
string expand_op_search(const vector<string>& args) {
for (auto op_type : op_types) {
string ret = op_type->expand_op(args);
if (ret.size())
return ret;
}
LOGf << "No expand op pattern found for args:" << args;
return "";
}
void expand_macro(const string& macro, const vector<string>& args, string& new_src) {
LOGvvvv << "expand_macro" << macro << "args:" << args;
if (macro.size() == 0 || macro[0] != '<') {
@ -434,6 +444,7 @@ string precompile(unordered_map<string,string> defs, string src, unordered_map<s
vector<string> args;
size_t l = k+1;
if (expr == "for" || expr == "if" || expr == "expand_macro" ||
expr == "expand_op" ||
expr == "is_def" || expr == "python" ||
(k<src.size() && src[k]=='(')) {
ASSERT(src[k] == '(');
@ -555,6 +566,18 @@ string precompile(unordered_map<string,string> defs, string src, unordered_map<s
i = l-1;
continue;
} else
if (expr == "expand_op") {
// syntax: @expand_op(args)
for (auto& arg : args) {
uint p=0;
while (p<arg.size() && arg[p] == ' ') p++;
arg = precompile(defs, arg.substr(p), macros);
}
string ns = expand_op_search(args);
new_src += precompile(defs, ns, macros);
i = l-1;
continue;
} else
if (expr == "define") {
// syntax: @define(macro, value)
// ij k l
@ -846,6 +869,9 @@ string OpCompiler::__get_fused_src(
};
auto not_change = [&](const string& s) -> bool {
if (unchanged.count(s)) return true;
for (auto op_type : op_types)
if (op_type->types.count(s))
return true;
return (s.find("::") != string::npos) || (s.find("LOG") != string::npos);
};
// regex find XxxXxxOp::jit_run
@ -1043,7 +1069,7 @@ jit_op_entry_t OpCompiler::compile(const string& jit_key, const string& src) {
if (v->loop_options)
for (auto& kv : v->loop_options.data()) {
if (kv.second && startswith(kv.first, "FLAGS:"))
extra_flags += " "+kv.first.substr(6)+" ";
extra_flags += " " + kv.first.substr(6) + " ";
}
return jit_compiler::compile(jit_key, src, is_cuda, extra_flags);
}
@ -1052,6 +1078,8 @@ jit_op_entry_t OpCompiler::do_compile(Op* op) {
jittor::lock_guard lg;
OpCompiler oc(op);
string* src = &oc.src;
for (auto op_type : op_types)
op_type->post_pass(&oc);
string src_after_passes;
// if is fused op
if (oc.op) {

View File

@ -8,7 +8,6 @@
#include "var.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/binary_op_defs.h"
#include "ops/op_register.h"
namespace jittor {
@ -419,21 +418,13 @@ unordered_set<string> binary_ops = {
"bitwise_xor",
};
NanoString binary_dtype_infer(NanoString op, Var* x, Var* y) {
if (op == ns_mean) return dtype_infer(x->ns, y->ns, 2); // force float
int force_type=0;
if (op == ns_divide) force_type=2; // force float
if (op == ns_floor_divide) force_type=1; // force int
return op.is_bool() ? ns_bool : dtype_infer(x->ns, y->ns, force_type, op);
}
BinaryOp::BinaryOp(Var* x, Var* y, NanoString op) : x(x), y(y) {
flags.set(NodeFlags::_cpu);
flags.set(NodeFlags::_cuda);
set_type(OpType::element);
ns = op;
ASSERT(ns.is_binary());
z = create_output(nullptr, binary_dtype_infer(op, x, y));
z = create_output(nullptr, binary_dtype_infer(op, x->ns, y->ns));
}
VarPtr dirty_clone_broadcast(Var* v) {
@ -554,7 +545,7 @@ void BinaryOp::jit_run() {
auto* __restrict__ zp = z->ptr<Tz>();
index_t num = z->num;
for (index_t i=0; i<num; i++)
zp[i] = @expand_macro(@OP, Tz, xp[i], yp[i]);
zp[i] = @expand_op(@OP, @Tz, xp[i], @Tx, yp[i], @Ty);
}
#endif // JIT

View File

@ -1,61 +0,0 @@
// ***************************************************************
// Copyright (c) 2021 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#pragma once
#include "common.h"
namespace jittor {
#ifdef JIT_cuda
#define pow(T,a,b) ::pow(a,b)
#define maximum(T,a,b) ::max(T(a), T(b))
#define minimum(T,a,b) ::min(T(a), T(b))
#define mod(T,a,b) @if(@strcmp(@T,float32)==0,(a-::floorf((a)/(b))*(b)),@if(@strcmp(@Tx,float64)==0,(a-::floor((a)/(b))*(b)),(a%b)))
#else // JIT_cpu
#define pow(T,a,b) std::pow(a,b)
#define maximum(T,a,b) std::max(T(a), T(b))
#define minimum(T,a,b) std::min(T(a), T(b))
#define mod(T,a,b) @if(@strcmp(@T,float32)==0,(a-std::floor((a)/(b))*(b)),@if(@strcmp(@Tx,float64)==0,(a-std::floor((a)/(b))*(b)),(a%b)))
#endif
#define add(T,a,b) ((a)+(b))
#define subtract(T,a,b) ((a)-(b))
#define multiply(T,a,b) ((a)*(b))
#define divide(T,a,b) (T((T(a))/(T(b))))
#define floor_divide(T,a,b) (T((T(a))/(T(b))))
#define less(T,a,b) ((a)<(b))
#define less_equal(T,a,b) ((a)<=(b))
#define greater(T,a,b) ((a)>(b))
#define greater_equal(T,a,b) ((a)>=(b))
#define equal(T,a,b) ((a)==(b))
#define not_equal(T,a,b) ((a)!=(b))
#define left_shift(T,a,b) ((a)<<(b))
#define right_shift(T,a,b) ((a)>>(b))
#define logical_and(T,a,b) ((a)&&(b))
#define logical_or(T,a,b) ((a)||(b))
#define logical_xor(T,a,b) ((bool(a))!=(bool(b)))
#define bitwise_and(T,a,b) ((a)&(b))
#define bitwise_or(T,a,b) ((a)|(b))
#define bitwise_xor(T,a,b) ((a)^(b))
#define mean(T,a,b) ((a)+T(b)*(T(rcount)))
#ifdef JIT_cuda
#define init_maximum(T) ::numeric_min<T>()
#define init_minimum(T) ::numeric_max<T>()
#else
#define init_maximum(T) std::numeric_limits<T>::lowest()
#define init_minimum(T) std::numeric_limits<T>::max()
#endif
#define init_add(T) T(0)
#define init_multiply(T) T(1)
#define init_logical_and(T) true
#define init_logical_or(T) false
#define init_logical_xor(T) false
#define init_bitwise_and(T) T(-1)
#define init_bitwise_or(T) T(0)
#define init_bitwise_xor(T) T(0)
#define init_mean(T) T(0)
} // jittor

View File

@ -91,7 +91,7 @@ void CandidateOp::jit_run() {
int n=0;
// checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaMemcpy(&n, np, 4, cudaMemcpyDefault));
checkCudaErrors(cudaMemcpy(&n, np, 4, cudaMemcpyDeviceToHost));
y->set_shape({n});
exe.temp_allocator->free(np, 4, n_allocation);
exe.temp_allocator->free(maskp, xshape0, mask_allocation);

View File

@ -40,7 +40,7 @@ void CopyOp::run() {
auto y_ptr = outputs().front()->mem_ptr;
#ifdef HAS_CUDA
if (flags.get(NodeFlags::_cuda)) {
checkCudaErrors(cudaMemcpyAsync(y_ptr, x_ptr, size, cudaMemcpyDefault, 0));
checkCudaErrors(cudaMemcpyAsync(y_ptr, x_ptr, size, cudaMemcpyDeviceToDevice, 0));
} else
#endif
{

View File

@ -121,13 +121,18 @@ void FetchOp::run() {
checkCudaErrors(cudaStreamWaitEvent(stream, event, 0));
new (&allocation) Allocation(&cuda_dual_allocator, v->size);
// mostly device to device
#if IS_CUDA
checkCudaErrors(cudaMemcpyAsync(
allocation.ptr, v->mem_ptr, v->size, cudaMemcpyDefault, stream));
#else
checkCudaErrors(cudaMemcpyAsync(
allocation.ptr, v->mem_ptr, v->size, cudaMemcpyDeviceToDevice, stream));
#endif
auto host_ptr = cuda_dual_allocator.get_dual_allocation(
allocation.allocation).host_ptr;
// device to host
checkCudaErrors(cudaMemcpyAsync(
host_ptr, allocation.ptr, v->size, cudaMemcpyDefault, stream));
host_ptr, allocation.ptr, v->size, cudaMemcpyDeviceToHost, stream));
allocation.ptr = host_ptr;
has_cuda_memcpy = true;
} else

View File

@ -33,4 +33,12 @@ OpInfo get_op_info(const string& name) {
return op_info_map.at(op_file_name);
}
vector<OpByType*> op_types;
int registe_op_type(OpByType* op_type) {
op_types.push_back(op_type);
return 0;
}
} // jittor

View File

@ -32,4 +32,14 @@ void op_registe(const OpInfo& op_info);
bool has_op(const string& name);
OpInfo get_op_info(const string& name);
struct OpCompiler;
struct OpByType {
unordered_set<string> types;
virtual string expand_op(const vector<string>& args) = 0;
virtual void post_pass(OpCompiler*) = 0;
};
extern vector<OpByType*> op_types;
int registe_op_type(OpByType*);
} // jittor
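A hedged sketch (illustrative names, not part of the commit) of how a backend plugs into this new interface, mirroring what common_op_type.cc and fp16_op_type.cc do further down:

#include "ops/op_register.h"
#include "utils/str_utils.h"
namespace jittor {
struct DemoIntOpType : OpByType {
    DemoIntOpType() { types = {"int32"}; }
    string expand_op(const vector<string>& args) {
        // args layout follows the @expand_op call sites:
        // {op, $1 = result type, $2 = lhs, $3 = lhs type, $4 = rhs, $5 = rhs type}
        for (int i=1; i<args.size(); i+=2)
            if (!types.count(args[i])) return "";  // not ours, let another plugin handle it
        if (args.at(0) == "add")
            return format("(($2)+($4))", args);
        return "";
    }
    void post_pass(OpCompiler*) {}  // no extra source rewriting needed here
};
static int _demo = registe_op_type(new DemoIntOpType());
} // jittor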

View File

@ -8,7 +8,6 @@
#include <limits>
#include "var.h"
#include "ops/reduce_op.h"
#include "ops/binary_op_defs.h"
#include "ops/op_register.h"
#include "executor.h"
@ -272,7 +271,7 @@ ReduceOp::ReduceOp(Var* x, NanoString op, NanoVector dims, bool keepdims)
if (x->dtype() == ns_bool)
y = create_output(nullptr, ns_int32);
else
y = create_output(nullptr, binary_dtype_infer(ns, x, x));
y = create_output(nullptr, reduce_dtype_infer(ns, x->ns));
}
ReduceOp::ReduceOp(Var* x, NanoString op, uint dims_mask, uint keepdims_mask)
@ -284,7 +283,7 @@ ReduceOp::ReduceOp(Var* x, NanoString op, uint dims_mask, uint keepdims_mask)
ASSERT(ns.is_binary());
reduce_mask = dims_mask;
this->keepdims_mask = keepdims_mask;
y = create_output(nullptr, binary_dtype_infer(ns, x, x));
y = create_output(nullptr, reduce_dtype_infer(ns, x->ns));
}
ReduceOp::ReduceOp(Var* x, NanoString op, int dim, bool keepdims)
@ -360,18 +359,18 @@ void ReduceOp::jit_run() {
@for(i, DIM-2, -1, -1, auto ystride@i = ystride@{i+1} * yshape@{i+1};)
index_t xstride@{DIM-1} = 1;
@for(i, DIM-2, -1, -1, auto xstride@i = xstride@{i+1} * xshape@{i+1};)
Ty count = Ty(x->num) / Ty(y->num);
Ty rcount = Ty(y->num) / Ty(x->num);
Ty count = x->num*1.0 / y->num;
Ty rcount = y->num*1.0 / x->num;
@for(d, 0, DIM,@if(REDUCE>>d&1,, for (index_t xi@d=0; xi@d < xshape@d; xi@d++))) {
auto yid = 0 @for(d, 0, DIM,@if(REDUCE>>d&1,, + xi@d * ystride@d));
yp[yid] = @expand_macro(init_@OP, Ty);
yp[yid] = @expand_op(init_@OP, @Ty);
}
@for(d, 0, DIM,@if(REDUCE>>d&1,, for (index_t xi@d=0; xi@d < xshape@d; xi@d++))) {
@for(d, 0, DIM,@if(REDUCE>>d&1, for (index_t xi@d=0; xi@d < xshape@d; xi@d++),)) {
auto yid = 0 @for(d, 0, DIM,@if(REDUCE>>d&1,, + xi@d * ystride@d));
auto xid = 0 @for(d, 0, DIM, + xi@d * xstride@d);
yp[yid] = @expand_macro(@OP, Ty, yp[yid], xp[xid]);
yp[yid] = @expand_op(@OP, @Ty, yp[yid], @Ty, xp[xid], @Tx);
}
}
(void)count, (void)rcount, (void)yshape0, (void)ystride0;

View File

@ -132,7 +132,7 @@ void ReindexOp::jit_run() {
@for(d, 0, XDIM, index_t xid@d = @expand_macro(INDEX@d);)
auto xid = @for(d, 0, XDIM, + xid@d * xstride@d);
bool check_overflow = 0 @for(d, 0, XDIM, || xid@d<0 || xid@d>=xshape@d) @for(d, 0, OSIZE, || (@expand_macro(OFD@d)));
yp[yid] = check_overflow ? (@OVERFLOW) : xp[xid];
yp[yid] = check_overflow ? Tx(@OVERFLOW) : xp[xid];
}
}
#endif // JIT

View File

@ -8,7 +8,6 @@
#include <limits>
#include "var.h"
#include "ops/reindex_reduce_op.h"
#include "ops/binary_op_defs.h"
#include "ops/op_register.h"
namespace jittor {
@ -112,7 +111,7 @@ void ReindexReduceOp::jit_run() {
@for(d, 0, XDIM, for (index_t i@d=0; i@d < xshape@d; i@d++)) {
auto xid = @for(d, 0, XDIM, + i@d * xstride@d);
xp[xid] = @expand_macro(init_@OP, Tx);
xp[xid] = @expand_op(init_@OP, @Tx);
}
// generate d-for loop
@for(d, 0, YDIM, for (index_t i@d=0; i@d < yshape@d; i@d++)) {
@ -121,7 +120,7 @@ void ReindexReduceOp::jit_run() {
auto xid = @for(d, 0, XDIM, + xid@d * xstride@d);
bool check_overflow = 0 @for(d, 0, XDIM, || xid@d<0 || xid@d>=xshape@d) @for(d, 0, OSIZE, || (@expand_macro(OFD@d)));
if (!check_overflow)
xp[xid] = @expand_macro(@OP, Tx, xp[xid], yp[yid]);
xp[xid] = @expand_op(@OP, @Tx, xp[xid], @Tx, yp[yid], @Tx);
}
}
#endif // JIT

View File

@ -9,7 +9,6 @@
#include "ops/setitem_op.h"
#include "ops/getitem_op.h"
#ifdef JIT
#include "ops/binary_op_defs.h"
#ifdef JIT_cuda
#include <cuda_runtime.h>
#include "helper_cuda.h"
@ -313,7 +312,7 @@ void SetitemOp::jit_run() {
std::memcpy(op, ip, out->size);
#else
if (op != ip)
checkCudaErrors(cudaMemcpyAsync(op, ip, out->size, cudaMemcpyDefault, 0));
checkCudaErrors(cudaMemcpyAsync(op, ip, out->size, cudaMemcpyDeviceToDevice, 0));
#endif
if (flags.get((NodeFlags::Flags(SetitemOp::_data_inplaced))) &&
@ -340,12 +339,12 @@ void SetitemOp::jit_run() {
@if(@is_def(JIT_cpu),
@if(@strcmp(@OP,void)==0,
op[iid] = (Ti)dp[did],
op[iid] = @expand_macro(@OP, Ti, op[iid], dp[did])
op[iid] = @expand_op(@OP, @Ti, op[iid], @Ti, dp[did], @Td)
);
,
@if(@strcmp(@OP,void)==0, op[iid] = (Ti)dp[did],
@if(@strcmp(@OP,add)==0, atomicAdd(&op[iid], (Ti)dp[did]),
op[iid] = @expand_macro(@OP, Ti, op[iid], dp[did])
op[iid] = @expand_op(@OP, @Ti, op[iid], @Ti, dp[did], @Td)
)
);
)

View File

@ -28,6 +28,12 @@ TransposeOp::TransposeOp(Var* x, NanoVector axes_) : x(x), axes(axes_) {
for (int i=0; i<(int)xdim; i++)
axes.push_back(xdim-1-i);
}
if (axes.size() < xdim || (axes.size() == xdim && axes[xdim-1]==xdim-1)) {
static VarPtr(*fuse_transpose)(Var*, NanoVector) = get_op_info("fuse_transpose").get_constructor<VarPtr, Var*, NanoVector>();
auto var = fuse_transpose(x, axes);
forward(var);
return;
}
#ifdef HAS_CUDA
if (use_cuda) {
static VarPtr(*cutt_transpose)(Var*, NanoVector) = nullptr;

View File

@ -8,7 +8,6 @@
#include "misc/cpu_math.h"
#include "var.h"
#include "ops/unary_op.h"
#include "ops/unary_op_defs.h"
#include "ops/op_register.h"
namespace jittor {
@ -33,6 +32,7 @@ static unordered_set<string> unary_ops = {
"uint16",
"uint32",
"uint64",
"float16",
"float32",
"float64",
// please keep float64 the last type
@ -534,22 +534,15 @@ UnaryOp::UnaryOp(Var* x, NanoString op) : x(x) {
ns = op;
ASSERT(ns.is_unary() | ns.is_dtype());
NanoString dtype;
if (ns.is_dtype()) {
if (ns == x->dtype()) {
forward(x);
return;
}
if (ns.is_dtype()) {
dtype = ns;
ns = ns_cast;
} else if (ns.is_bool())
dtype = ns_bool;
else if (ns.is_float())
dtype = dtype_infer(x->ns, x->ns, 2);
else if (ns.is_int())
dtype = dtype_infer(x->ns, x->ns, 1);
else {
dtype = x->ns;
}
} else
dtype = unary_dtype_infer(ns, x->ns);
y = create_output(nullptr, dtype);
}
@ -688,7 +681,7 @@ void UnaryOp::jit_run() {
auto* __restrict__ yp = y->ptr<Ty>();
index_t num = y->num;
for (index_t i=0; i<num; i++)
yp[i] = @expand_macro(@OP, Ty, xp[i]);
yp[i] = @expand_op(@OP, @Ty, xp[i], @Tx);
}
#endif // JIT

View File

@ -1,84 +0,0 @@
// ***************************************************************
// Copyright (c) 2021 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#pragma once
#include "common.h"
namespace jittor {
#define logical_not(T,x) (!(x))
#define bitwise_not(T,x) (~(x))
#define negative(T,x) (-(x))
#ifdef JIT_cuda
// TODO: add float64 version
#define abs(T,x) ::abs(x)
#define log(T,x) ::logf((T)(x))
#define exp(T,x) ::expf((T)(x))
#define sqrt(T,x) ::sqrtf((T)(x))
#define round(T,x) ((T) ::roundf((x)))
#define floor(T,x) ((T) ::floorf((x)))
#define ceil(T,x) ((T) ::ceilf((x)))
#define round_int(T,x) ((T) ::roundf((x)))
#define floor_int(T,x) ((T) ::floorf((x)))
#define ceil_int(T,x) ((T) ::ceilf((x)))
#define sin(T,x) ((T) ::sinf((x)))
#define asin(T,x) ((T) ::asinf((x)))
#define sinh(T,x) ((T) ::sinhf((x)))
#define asinh(T,x) ((T) ::asinhf((x)))
#define cos(T,x) ((T) ::cosf((x)))
#define acos(T,x) ((T) ::acosf((x)))
#define cosh(T,x) ((T) ::coshf((x)))
#define acosh(T,x) ((T) ::acoshf((x)))
#define tan(T,x) ((T) ::tanf((x)))
#define atan(T,x) ((T) ::atanf((x)))
#define tanh(T,x) ((T) ::tanhf((x)))
#define atanh(T,x) ((T) ::atanhf((x)))
#define sigmoid(T,x) ((T) (1.0f/(1.0f+::expf((::min(T(-(x)), T(@if(@strcmp(@T,float32)==0,30,300))))))))
#define erf(T,x) ((T) ::erff((x)))
#define erfinv(T,x) ((T) ::erfinvf((T)(x)))
#else
#define abs(T,x) std::abs(x)
#define log(T,x) std::log((T)(x))
#define exp(T,x) std::exp((T)(x))
#define sqrt(T,x) std::sqrt((T)(x))
#define round(T,x) ((T)std::round((x)))
#define floor(T,x) ((T)std::floor((x)))
#define ceil(T,x) ((T)std::ceil((x)))
#define round_int(T,x) ((T)std::round((x)))
#define floor_int(T,x) ((T)std::floor((x)))
#define ceil_int(T,x) ((T)std::ceil((x)))
#define sin(T,x) ((T) std::sin((x)))
#define asin(T,x) ((T) std::asin((x)))
#define sinh(T,x) ((T) std::sinh((x)))
#define asinh(T,x) ((T) std::asinh((x)))
#define cos(T,x) ((T) std::cos((x)))
#define acos(T,x) ((T) std::acos((x)))
#define cosh(T,x) ((T) std::cosh((x)))
#define acosh(T,x) ((T) std::acosh((x)))
#define tan(T,x) ((T) std::tan((x)))
#define atan(T,x) ((T) std::atan((x)))
#define tanh(T,x) ((T) std::tanh((x)))
#define atanh(T,x) ((T) std::atanh((x)))
#define sigmoid(T,x) ((T) (1.0f/(1.0f+std::exp(std::min(T(-(x)), T(@if(@strcmp(@T,float32)==0,30,300)))))))
#define erf(T,x) ((T) std::erf((x)))
#define erfinv(T,x) (jittor::_erfinv(x))
#endif
#define cast(T,x) ((T)(x))
} // jittor

View File

@ -230,7 +230,7 @@ void WhereOp::jit_run() {
int n=0;
// checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaMemcpy(&n, np, 4, cudaMemcpyDefault));
checkCudaErrors(cudaMemcpy(&n, np, 4, cudaMemcpyDeviceToHost));
@for(i, 0, NDIM, outs[@i]->set_shape({n});)
exe.temp_allocator->free(np, 4, n_allocation);
}

View File

@ -25,6 +25,7 @@
namespace jittor {
using namespace expr;
extern int use_cuda;
struct OpInspector {
// binary mask for
@ -229,9 +230,14 @@ void ConvTuner::forwardTune(FusedOp* fop) {
if (!(bop->y->input() && bop->x->input() && fop->has(bop->x->input()) && fop->has(bop->y->input()))) continue;
if (!(bop->x->input()->type()==OpType::broadcast && bop->y->input()->type()==OpType::broadcast)) return;
// only support float32 currently
// only support float32,float16 currently
if (use_cuda) {
if (bop->z->dtype() != ns_float32 && bop->z->dtype() != ns_float16)
continue;
} else {
if (bop->z->dtype() != ns_float32)
continue;
}
Op* ops[3] = {op, bop->x->input(), bop->y->input()};
int ok = 0;
LOGvvvv << "conv like op" << fop << fop->get_jit_key(get_jk());

View File

@ -23,6 +23,7 @@
#include "fused_op.h"
#include "profiler/memory_checker.h"
#include "misc/deleter.h"
#include "executor.h"
namespace jittor {
@ -30,6 +31,8 @@ Profiler profiler;
DEFINE_FLAG(int, profiler_warmup, 0, "Profiler warmup.");
DEFINE_FLAG(int, profiler_rerun, 0, "Profiler rerun.");
DEFINE_FLAG(int, profiler_record_peek, 0, "Profiler record peek mem bandwidth.");
DEFINE_FLAG(int, profiler_record_shape, 0, "Profiler record shape for op.");
DEFINE_FLAG(int, profiler_hide_relay, 0, "Profiler hide relayed op.");
DEFINE_FLAG_WITH_SETTER(int, profiler_enable, 0, "Enable profiler.");
@ -54,6 +57,8 @@ void Profiler::start(int64 warmup, int64 rerun) {
profiler.records.clear();
profiler.warmup = warmup;
profiler.rerun = rerun;
profiler.relay_extra_cost = 0;
profiler.relay_fop = 0;
}
void Profiler::stop() {
@ -138,6 +143,60 @@ static string get_stack_info(Op* op) {
}
}
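// Estimate peak memory bandwidth by timing repeated memcpys of roughly the
// op's data volume; the elapsed nanoseconds accumulate into peek_time_total.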
static void stat_peek_bandwidth(uint64 in, uint64 out, uint64 loop, uint64& peek_time_total) {
auto size = (in+out) / 2;
// memcpy performance can drop in some unaligned cases
size &= ~((1 << 12)-1);
// size = 7680000*4;
auto temp1 = exe.alloc_temp(size);
auto temp2 = exe.alloc_temp(size);
loop = 1 << loop;
int warmup = std::max(loop/8, (uint64)1);
for (int i=0; i<warmup; i++)
#ifdef HAS_CUDA
if (use_cuda)
cudaMemcpyAsync(temp1.ptr, temp2.ptr, size, cudaMemcpyDeviceToDevice, 0);
else
#endif
std::memcpy(temp1.ptr, temp2.ptr, size);
#ifdef HAS_CUDA
if (use_cuda)
checkCudaErrors(cudaDeviceSynchronize());
#endif
auto start = std::chrono::high_resolution_clock::now();
for (int i=0; i<loop; i++)
#ifdef HAS_CUDA
if (use_cuda)
cudaMemcpyAsync(temp1.ptr, temp2.ptr, size, cudaMemcpyDeviceToDevice, 0);
else
#endif
std::memcpy(temp1.ptr, temp2.ptr, size);
#ifdef HAS_CUDA
if (use_cuda)
checkCudaErrors(cudaDeviceSynchronize());
#endif
auto finish = std::chrono::high_resolution_clock::now();
auto total_ns = (int64_t)std::chrono::duration_cast<std::chrono::nanoseconds>(finish-start).count();
peek_time_total += total_ns;
}
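// RAII timer: when ck is set, the wall-clock time between construction and
// destruction is added to profiler.relay_extra_cost, so it can later be
// subtracted from the relayed fused op's measurement.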
struct RecordExtraCost {
int ck;
std::chrono::high_resolution_clock::time_point start;
RecordExtraCost(int ck) : ck(ck) {
if (!ck) return;
start = std::chrono::high_resolution_clock::now();
}
~RecordExtraCost() {
if (!ck) return;
auto finish = std::chrono::high_resolution_clock::now();
auto total_ns = (int64_t)std::chrono::duration_cast<std::chrono::nanoseconds>(finish-start).count();
profiler.relay_extra_cost += total_ns;
}
};
void Profiler::record_and_run(
jit_op_entry_t jit_entry,
Op* op,
@ -151,6 +210,9 @@ void Profiler::record_and_run(
jit_key : ikey->second.c_str();
auto iter = profiler.records.find(key);
uint64_t in, out, compute;
if (profiler.relay_fop)
profiler.relay_fop->statistics(in, out, compute);
else
op->statistics(in, out, compute);
if (iter == profiler.records.end()) {
profiler.records[key] = Info{
@ -165,7 +227,7 @@ void Profiler::record_and_run(
bool is_fused = op->name() == string("fused");
uint64* shape_time = nullptr;
if (trace_py_var) {
if (trace_py_var || profiler_record_shape) {
// record shape
NanoVector shape;
int64 num = 0;
@ -193,41 +255,62 @@ void Profiler::record_and_run(
iter->second.shapes[shape].second += 1;
shape_time = &iter->second.shapes[shape].first;
}
int loop = (is_fused &&
((FusedOp*)op)->get_loop_option("insert_profile_loop")) ? 10 : 0;
int64_t warmup = profiler.warmup ? std::max(profiler.warmup>>loop, (int64_t)1) : 0;
int64_t rerun = std::max((profiler.rerun+1)>>loop, (int64_t)1);
// prevent relayed op being rerun
auto warmup_bk = profiler.warmup;
auto rerun_bk = profiler.rerun;
profiler.warmup = profiler.rerun = 0;
Deleter del([&]() {
profiler.warmup = warmup_bk;
profiler.rerun = rerun_bk;
});
int64_t warmup = profiler.warmup;
int64_t rerun = profiler.rerun + 1;
rerun = std::max(NanoVector::get_nbits(rerun) - 2, 0);
int loop = 0;
Deleter _d;
if (is_fused) {
auto fop = ((FusedOp*)op);
if (fop->context && fop->context->vrm.relay_groups.size()) {
// relay op
loop = rerun;
profiler.relay_extra_cost = 0;
profiler.relay_fop = fop;
_d.del = [&]() {
profiler.relay_extra_cost = 0;
profiler.relay_fop = 0;
};
} else
loop = fop->get_loop_option("insert_profile_loop") ? 10 : 0;
}
int64 num = 1<<(rerun - loop);
{
profiler_enable = 0;
Deleter del([&]() { profiler_enable = 1;});
RecordExtraCost rec(profiler.relay_fop && profiler.relay_fop != op);
for (int64_t i=0; i<warmup; i++) {
jit_entry(op);
}
#ifdef HAS_CUDA
if (use_cuda)
checkCudaErrors(cudaDeviceSynchronize());
#endif
}
for (int64_t i=0; i<rerun; i++) {
auto start = std::chrono::high_resolution_clock::now();
for (int64_t i=0; i<num; i++) {
jit_entry(op);
}
#ifdef HAS_CUDA
if (use_cuda)
checkCudaErrors(cudaDeviceSynchronize());
#endif
auto finish = std::chrono::high_resolution_clock::now();
auto total_ns = (int64_t)std::chrono::duration_cast<std::chrono::nanoseconds>(finish-start).count();
if (profiler.relay_fop == op) {
total_ns -= profiler.relay_extra_cost;
}
// 24ns function call overhead
total_ns = std::max((int64_t)1, total_ns-24);
iter->second.update(loop, total_ns, in, out, compute);
iter->second.update(rerun, total_ns, in, out, compute);
if (shape_time) shape_time[0] += total_ns;
RecordExtraCost rec(profiler.relay_fop && profiler.relay_fop != op);
if (profiler_record_peek)
stat_peek_bandwidth(in, out, rerun, iter->second.peek_time_total);
LOGvvvv << "Duration" << total_ns >> "ns running" << op;
}
if (is_fused &&
((FusedOp*)op)->get_loop_option("check_cache")) {
auto fname = Op::get_filename_from_jit_key(key, ".so");
@ -239,6 +322,8 @@ void Profiler::record_and_run(
vector<vector<string>> Profiler::report(const string& sort_key) {
vector<vector<string>> rep = {{"Name", "FileName", "Count", "TotalTime", "AvgTime", "MinTime", "MaxTime", "Input", "Output", "InOut", "Compute"}};
if (profiler_record_peek)
rep[0].push_back("Peek");
vector<string> names, fnames;
vector<vector<double>> info;
vector<int> order;
@ -295,6 +380,10 @@ vector<vector<string>> Profiler::report(const string& sort_key) {
(double)(kinfo.in_total+kinfo.out_total)*1e9 / kinfo.time_total, // InOut
(double)kinfo.compute_total*1e9 / kinfo.time_total, // Compute
});
if (profiler_record_peek)
info.back().push_back(
(double)(kinfo.in_total+kinfo.out_total)*1e9 / kinfo.peek_time_total // Peek
);
}
if (sort_key_id>=2)
std::sort(order.begin(), order.end(), [&](int i, int j) {
@ -363,7 +452,7 @@ vector<vector<string>> Profiler::report(const string& sort_key) {
<< std::setw(3)
<< std::setprecision(p) << cum_time / total_time * 100 << "%)";
}
} else if (j<=7) {
} else if (j<=7 || j==9) {
// output throughput
output_float(" KMG", 1024, "B/s", k);
} else {

View File

@ -24,6 +24,8 @@ struct Profiler {
uint64_t in_total, out_total;
// compute throughput in ops
uint64_t compute_total;
// time of an equivalent memcpy, used to estimate peak bandwidth
uint64_t peek_time_total;
// cache test info
unique_ptr<CacheInfo> cache_info;
cstr stack_info;
@ -56,6 +58,9 @@ struct Profiler {
int64_t warmup=0, rerun=0;
unordered_map<string, Info> records;
int64 relay_extra_cost;
FusedOp* relay_fop;
~Profiler();
};

View File

@ -267,7 +267,7 @@ void TraceData::release_node(Node* node) {
return;
auto node_id = iter->second;
id_map.erase(node);
if (trace_py_var < 2) {
if (trace_py_var < 2 || execute_op_info.size() > 100000) {
node_data.erase(node_id);
}
}
@ -312,6 +312,7 @@ void TraceData::record_op(Op* op) {
}
void TraceData::record_execution(Op* op, bool is_fused_op, JK& jk) {
if (execute_op_info.size() > 100000) return;
ExecuteOpInfo& einfo = execute_op_info[execute_op_info_cnt++];
if (is_fused_op) {
FusedOp* fop = (FusedOp*)op;

View File

@ -21,7 +21,9 @@ NanoString npy2ns[] = {
ns_int64, ns_uint64,
ns_float32, ns_float64, ns_float64,
ns_void, ns_void, ns_void,
ns_void
ns_void, // 17
ns_void, ns_void, ns_void, ns_void, ns_void, // 22
ns_float16, // 23
};
NPY_TYPES ns2npy[] = {
@ -34,7 +36,7 @@ NPY_TYPES ns2npy[] = {
NPY_BYTE, NPY_SHORT, NPY_INT, NPY_LONGLONG,
NPY_UBYTE, NPY_USHORT, NPY_UINT, NPY_ULONGLONG,
#endif
NPY_FLOAT, NPY_DOUBLE
NPY_HALF, NPY_FLOAT, NPY_DOUBLE
};
void** PyArray_API;

View File

@ -48,6 +48,8 @@ enum NPY_TYPES {
NPY_FLOAT, NPY_DOUBLE, NPY_LONGDOUBLE,
NPY_CFLOAT, NPY_CDOUBLE, NPY_CLONGDOUBLE,
NPY_OBJECT=17,
NPY_HALF=23,
NPY_END=24,
};
EXTERN_LIB NanoString npy2ns[];
@ -60,11 +62,11 @@ EXTERN_LIB NPY_TYPES ns2npy[];
inline bool is_c_style(PyArray_Proxy* obj) { return obj->flags & 1; }
inline NanoString get_type_str(PyArray_Proxy* obj) {
NanoString type = ns_void;
if (obj->descr->type_num < NPY_OBJECT)
if (obj->descr->type_num < NPY_END)
type = npy2ns[obj->descr->type_num];
CHECK(type != ns_void) << "Numpy type not supported, type_num:"
<< obj->descr->type_num
<< "type_char:" << obj->descr->type;
<< "type_char:" << obj->descr->type << NPY_END << npy2ns[obj->descr->type_num];
return type;
}

View File

@ -141,7 +141,7 @@ ArrayOp::ArrayOp(PyObject* obj) {
} else {
// this is non-continue numpy array
#if defined(__linux__) || defined(_WIN32)
STACK_ALLOC(int64, dims, args.shape.size());
STACK_ALLOC(int64_t, dims, args.shape.size());
#elif defined(__APPLE__)
long dims[args.shape.size()];
#endif

View File

@ -15,7 +15,7 @@
#include "misc/nano_string.h"
#include "misc/fast_shared_ptr.h"
#include "profiler/simple_profiler.h"
#ifdef HAS_CUDA
#ifdef IS_CUDA
#include "misc/cuda_flags.h"
#endif
@ -274,7 +274,7 @@ DEF_IS(ArrayArgs, bool) is_type(PyObject* obj) {
DEF_IS(ArrayArgs, PyObject*) to_py_object(const T& a) {
#if defined(__linux__) || defined(_WIN32)
STACK_ALLOC(int64, dims, a.shape.size());
STACK_ALLOC(int64_t, dims, a.shape.size());
#elif defined(__APPLE__)
long dims[a.shape.size()];
#endif
@ -390,7 +390,7 @@ DEF_IS(VarHolder*, T) from_py_object(PyObject* obj, unique_ptr<VarHolder>& holde
struct DataView;
DEF_IS(DataView, PyObject*) to_py_object(T a) {
#if defined(__linux__) || defined(_WIN32)
STACK_ALLOC(int64, dims, a.shape.size());
STACK_ALLOC(int64_t, dims, a.shape.size());
#elif defined(__APPLE__)
long dims[a.shape.size()];
#endif
@ -652,7 +652,7 @@ DEF_IS(NumpyFunc, T) from_py_object(PyObject* obj) {
[obj](typename T::R* result) {
// import numpy
string npstr="numpy";
#ifdef HAS_CUDA
#ifdef IS_CUDA
if (use_cuda) npstr="cupy";
#endif
@ -669,7 +669,7 @@ DEF_IS(NumpyFunc, T) from_py_object(PyObject* obj) {
PyTuple_SET_ITEM(args.obj, 0, np.release());
PyTuple_SET_ITEM(args.obj, 1, data.release());
#ifdef HAS_CUDA
#ifdef IS_CUDA
if (npstr=="cupy") {
PyObjHolder jt(PyImport_ImportModule("jittor"));
PyObjHolder pFunc(PyObject_GetAttrString(jt.obj,"numpy2cupy"));

View File

@ -110,7 +110,7 @@ static void push_py_object(RingBuffer* rb, PyObject* obj, uint64& __restrict__ o
rb->push(size, offset);
args.ptr = rb->get_ptr(size, offset);
#if defined(__linux__) || defined(_WIN32)
STACK_ALLOC(int64, dims, args.shape.size());
STACK_ALLOC(int64_t, dims, args.shape.size());
#elif defined(__APPLE__)
long dims[args.shape.size()];
#endif

View File

@ -0,0 +1,165 @@
// ***************************************************************
// Copyright (c) 2021 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#include "common.h"
#include "utils/str_utils.h"
#include "ops/op_register.h"
namespace jittor {
extern int use_cuda;
unordered_map<string,string> common_op_type_cuda_map = {
{"logical_not", "(!($2))"},
{"bitwise_not", "(~($2))"},
{"negative", "(-($2))"},
{"abs", "::abs($2)"},
{"log", "::logf(($1)($2))"},
{"exp", "::expf(($1)($2))"},
{"sqrt", "::sqrtf(($1)($2))"},
{"round", "(($1) ::roundf(($2)))"},
{"floor", "(($1) ::floorf(($2)))"},
{"ceil", "(($1) ::ceilf(($2)))"},
{"round_int", "(($1) ::roundf(($2)))"},
{"floor_int", "(($1) ::floorf(($2)))"},
{"ceil_int", "(($1) ::ceilf(($2)))"},
{"sin", "(($1) ::sinf(($2)))"},
{"asin", "(($1) ::asinf(($2)))"},
{"sinh", "(($1) ::sinhf(($2)))"},
{"asinh", "(($1) ::asinhf(($2)))"},
{"cos", "(($1) ::cosf(($2)))"},
{"acos", "(($1) ::acosf(($2)))"},
{"cosh", "(($1) ::coshf(($2)))"},
{"acosh", "(($1) ::acoshf(($2)))"},
{"tan", "(($1) ::tanf(($2)))"},
{"atan", "(($1) ::atanf(($2)))"},
{"tanh", "(($1) ::tanhf(($2)))"},
{"atanh", "(($1) ::atanhf(($2)))"},
{"sigmoid", "(($1) (1.0f/(1.0f+::expf((::min($1(-($2)), $1(@if(@strcmp($1,float32)==0,30,300))))))))"},
{"erf", "(($1) ::erff(($2)))"},
{"erfinv", "(($1) ::erfinvf(($1)($2)))"},
{"cast", "(($1)($2))"},
{"pow", "::pow(($2),($4))"},
{"maximum", "::max($1($2), $1($4))"},
{"minimum", "::min($1($2), $1($4))"},
{"mod", "@if(@strcmp($1,float32)==0,(($2)-::floorf(($2)/($4))*($4)),@if(@strcmp(@Tx,float64)==0,(($2)-::floor(($2)/($4))*($4)),(($2)%($4))))"},
{"init_maximum", "::numeric_min<$1>()"},
{"init_minimum", "::numeric_max<$1>()"},
};
struct CommonOpType : OpByType {
CommonOpType() {
types = {
"bool",
"int8",
"int16",
"int32",
"int64",
"uint8",
"uint16",
"uint32",
"uint64",
"float32",
"float64",
};
}
string expand_op(const vector<string>& args) {
for (int i=1; i<args.size(); i+=2) {
if (!types.count(args[i]))
return "";
}
auto& cuda_map = common_op_type_cuda_map;
static unordered_map<string,string> cpu_map = {
{"logical_not", "(!($2))"},
{"bitwise_not", "(~($2))"},
{"negative", "(-($2))"},
{"abs", "std::abs($2)"},
{"log", "std::log(($1)($2))"},
{"exp", "std::exp(($1)($2))"},
{"sqrt", "std::sqrt(($1)($2))"},
{"round", "(($1)std::round(($2)))"},
{"floor", "(($1)std::floor(($2)))"},
{"ceil", "(($1)std::ceil(($2)))"},
{"round_int", "(($1)std::round(($2)))"},
{"floor_int", "(($1)std::floor(($2)))"},
{"ceil_int", "(($1)std::ceil(($2)))"},
{"sin", "(($1) std::sin(($2)))"},
{"asin", "(($1) std::asin(($2)))"},
{"sinh", "(($1) std::sinh(($2)))"},
{"asinh", "(($1) std::asinh(($2)))"},
{"cos", "(($1) std::cos(($2)))"},
{"acos", "(($1) std::acos(($2)))"},
{"cosh", "(($1) std::cosh(($2)))"},
{"acosh", "(($1) std::acosh(($2)))"},
{"tan", "(($1) std::tan(($2)))"},
{"atan", "(($1) std::atan(($2)))"},
{"tanh", "(($1) std::tanh(($2)))"},
{"atanh", "(($1) std::atanh(($2)))"},
{"sigmoid", "(($1) (1.0f/(1.0f+std::exp(std::min($1(-($2)), $1(@if(@strcmp($1,float32)==0,30,300)))))))"},
{"erf", "(($1) std::erf(($2)))"},
{"erfinv", "(jittor::_erfinv($2))"},
{"cast", "(($1)($2))"},
{"pow", "std::pow(($2),($4))"},
{"maximum", "std::max($1($2), $1($4))"},
{"minimum", "std::min($1($2), $1($4))"},
{"mod", "@if(@strcmp($1,float32)==0,(($2)-std::floor(($2)/($4))*($4)),@if(@strcmp(@Tx,float64)==0,(($2)-std::floor(($2)/($4))*($4)),(($2)%($4))))"},
{"init_maximum", "std::numeric_limits<$1>::lowest()"},
{"init_minimum", "std::numeric_limits<$1>::max()"},
};
static unordered_map<string,string> both_map {
{"add", "(($2)+($4))"},
{"subtract", "(($2)-($4))"},
{"multiply", "(($2)*($4))"},
{"divide", "($1(($1($2))/($1($4))))"},
{"floor_divide", "($1(($1($2))/($1($4))))"},
{"less", "(($2)<($4))"},
{"less_equal", "(($2)<=($4))"},
{"greater", "(($2)>($4))"},
{"greater_equal", "(($2)>=($4))"},
{"equal", "(($2)==($4))"},
{"not_equal", "(($2)!=($4))"},
{"left_shift", "(($2)<<($4))"},
{"right_shift", "(($2)>>($4))"},
{"logical_and", "(($2)&&($4))"},
{"logical_or", "(($2)||($4))"},
{"logical_xor", "((bool($2))!=(bool($4)))"},
{"bitwise_and", "(($2)&($4))"},
{"bitwise_or", "(($2)|($4))"},
{"bitwise_xor", "(($2)^($4))"},
{"mean", "(($2)+$1($4)*($1(rcount)))"},
{"init_add", "$1(0)"},
{"init_multiply", "$1(1)"},
{"init_logical_and", "true"},
{"init_logical_or", "false"},
{"init_logical_xor", "false"},
{"init_bitwise_and", "$1(-1)"},
{"init_bitwise_or", "$1(0)"},
{"init_bitwise_xor", "$1(0)"},
{"init_mean", "$1(0)"},
};
string ret;
if (both_map.count(args.at(0)))
ret = both_map[args.at(0)];
else if (use_cuda)
ret = cuda_map[args.at(0)];
else
ret = cpu_map[args.at(0)];
return format(ret, args);
}
void post_pass(OpCompiler*) {
return;
}
};
static int _ = registe_op_type(new CommonOpType());
}
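A hedged trace (not part of the commit) of what expand_op above yields for a CPU "maximum" on float32 operands, reproduced with the format() helper it calls:

#include "utils/str_utils.h"
namespace jittor {
static string demo_expand_maximum() {
    // argument layout: {op, $1 = result type, $2 = lhs, $3 = lhs type, $4 = rhs, $5 = rhs type}
    vector<string> args = {"maximum", "float32", "xp[i]", "float32", "yp[i]", "float32"};
    return format("std::max($1($2), $1($4))", args);
    // -> "std::max(float32(xp[i]), float32(yp[i]))"
}
} // jittor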

View File

@ -0,0 +1,164 @@
// ***************************************************************
// Copyright (c) 2021 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#pragma once
#include "common.h"
#ifdef JIT_cuda
#include <driver_types.h>
#include <cuda_fp16.h>
namespace jittor {
typedef __half float16;
#if CUDA_ARCH >= 800
inline __device__ float16 max(float16 a, float16 b) { return __hmax(a, b); }
inline __device__ float16 min(float16 a, float16 b) { return __hmin(a, b); }
#else
inline __device__ float16 max(float16 a, float16 b) { return a<b?b:a; }
inline __device__ float16 min(float16 a, float16 b) { return a<b?a:b; }
#endif
inline __device__ float16 pow(float16 a, float16 b) { return ::pow(float32(a), float32(b)); }
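// Compile-time vectorized copy: moves nbyte bytes from b to a using the widest
// available loads (16/8/4/2/1 bytes), recursing on the remainder.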
template<int nbyte, class T>
__device__ inline void vload(T* __restrict__ a, T* __restrict__ b) {
if constexpr (nbyte<=0) return;
if constexpr (nbyte>=16) {
auto __restrict__ aa = (float4* __restrict__)a;
auto __restrict__ bb = (float4* __restrict__)b;
aa[0] = bb[0];
return vload<nbyte-16>(aa+1, bb+1);
}
if constexpr (nbyte>=8) {
auto __restrict__ aa = (float2* __restrict__)a;
auto __restrict__ bb = (float2* __restrict__)b;
aa[0] = bb[0];
return vload<nbyte-8>(aa+1, bb+1);
}
if constexpr (nbyte>=4) {
auto __restrict__ aa = (float* __restrict__)a;
auto __restrict__ bb = (float* __restrict__)b;
aa[0] = bb[0];
return vload<nbyte-4>(aa+1, bb+1);
}
if constexpr (nbyte>=2) {
auto __restrict__ aa = (__half* __restrict__)a;
auto __restrict__ bb = (__half* __restrict__)b;
aa[0] = bb[0];
return vload<nbyte-2>(aa+1, bb+1);
}
if constexpr (nbyte>=1) {
auto __restrict__ aa = (int8_t* __restrict__)a;
auto __restrict__ bb = (int8_t* __restrict__)b;
aa[0] = bb[0];
return vload<nbyte-1>(aa+1, bb+1);
}
}
}
using jittor::max;
using jittor::min;
using jittor::pow;
#else
namespace jittor {
struct float16 {
uint16 x;
inline float16(float32 f) {
unsigned x = *((int*)(void*)(&f));
unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
unsigned sign, exponent, mantissa;
// Get rid of +NaN/-NaN case first.
if (u > 0x7f800000) {
this->x = 0x7fffU;
return;
}
sign = ((x >> 16) & 0x8000);
// Get rid of +Inf/-Inf, +0/-0.
if (u > 0x477fefff) {
this->x = sign | 0x7c00U;
return;
}
if (u < 0x33000001) {
this->x = sign | 0x0000U;
return;
}
exponent = ((u >> 23) & 0xff);
mantissa = (u & 0x7fffff);
if (exponent > 0x70) {
shift = 13;
exponent -= 0x70;
} else {
shift = 0x7e - exponent;
exponent = 0;
mantissa |= 0x800000;
}
lsb = (1 << shift);
lsb_s1 = (lsb >> 1);
lsb_m1 = (lsb - 1);
// Round to nearest even.
remainder = (mantissa & lsb_m1);
mantissa >>= shift;
if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
++mantissa;
if (!(mantissa & 0x3ff)) {
++exponent;
mantissa = 0;
}
}
this->x = (sign | (exponent << 10) | mantissa);
}
inline operator float() {
unsigned sign = ((x >> 15) & 1);
unsigned exponent = ((x >> 10) & 0x1f);
unsigned mantissa = ((x & 0x3ff) << 13);
if (exponent == 0x1f) { /* NaN or Inf */
mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
exponent = 0xff;
} else if (!exponent) { /* Denorm or Zero */
if (mantissa) {
unsigned int msb;
exponent = 0x71;
do {
msb = (mantissa & 0x400000);
mantissa <<= 1; /* normalize */
--exponent;
} while (!msb);
mantissa &= 0x7fffff; /* 1.mantissa is implicit */
}
} else {
exponent += 0x70;
}
int temp = ((sign << 31) | (exponent << 23) | mantissa);
return reinterpret_cast<float&>(temp);
}
};
}
#endif
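A hedged host-side sketch (assumes jittor's include path is set up) of round-tripping a value through the software float16 above; under JIT_cuda the __half path is used instead:

#include "type/fp16_compute.h"
#include <cstdio>
int main() {
    jittor::float16 h(3.14159f);  // float32 -> half: 10-bit mantissa, round to nearest even
    float back = float(h);        // half -> float32
    std::printf("%f\n", back);    // prints 3.140625
    return 0;
}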

View File

@ -0,0 +1,188 @@
// ***************************************************************
// Copyright (c) 2021 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#include "common.h"
#include "utils/str_utils.h"
#include "ops/op_register.h"
#include "op_compiler.h"
namespace jittor {
extern int use_cuda;
extern unordered_map<string,string> common_op_type_cuda_map;
static bool isvar(char x) { return isalnum(x) || x == '_' || x == ':'; }
struct FP16OpType : OpByType {
FP16OpType() {
types = {
"float16",
};
}
string expand_op(const vector<string>& args) {
bool found_fp16 = 0;
for (int i=1; i<args.size(); i+=2) {
if (types.count(args[i]))
found_fp16 = 1;
}
if (!found_fp16) return "";
static unordered_map<string,string> cuda_map = {
{"logical_not", "(!($2))"},
{"bitwise_not", "(~($2))"},
{"negative", "(-($2))"},
{"abs", "::abs($2)"},
{"log", "::hlog(($1)($2))"},
{"exp", "::hexp(($1)($2))"},
{"sqrt", "::hsqrt(($1)($2))"},
{"round", "(($1) ::roundf(($2)))"},
{"floor", "(($1) ::floorf(($2)))"},
{"ceil", "(($1) ::ceilf(($2)))"},
{"round_int", "(($1) ::roundf(($2)))"},
{"floor_int", "(($1) ::floorf(($2)))"},
{"ceil_int", "(($1) ::ceilf(($2)))"},
{"sin", "(($1) ::sinf(($2)))"},
{"asin", "(($1) ::asinf(($2)))"},
{"sinh", "(($1) ::sinhf(($2)))"},
{"asinh", "(($1) ::asinhf(($2)))"},
{"cos", "(($1) ::cosf(($2)))"},
{"acos", "(($1) ::acosf(($2)))"},
{"cosh", "(($1) ::coshf(($2)))"},
{"acosh", "(($1) ::acoshf(($2)))"},
{"tan", "(($1) ::tanf(($2)))"},
{"atan", "(($1) ::atanf(($2)))"},
{"tanh", "(($1) ::tanhf(($2)))"},
{"atanh", "(($1) ::atanhf(($2)))"},
{"sigmoid", "(($1) (1.0f/(1.0f+::expf((::min($1(-($2)), $1(@if(@strcmp($1,float16)==0,30,300))))))))"},
{"erf", "(($1) ::erff(($2)))"},
{"erfinv", "(($1) ::erfinvf(($1)($2)))"},
{"cast", "(($1)($2))"},
{"pow", "::pow(($2),($4))"},
{"maximum", "::max($1($2), $1($4))"},
{"minimum", "::min($1($2), $1($4))"},
{"mod", "$1(($2)-::hfloor(($2)/($4))*($4))"},
{"init_maximum", "-32768.0f"},
{"init_minimum", "32768.0f"},
};
static unordered_map<string,string> cpu_map = {
{"logical_not", "(!($2))"},
{"bitwise_not", "(~($2))"},
{"negative", "(-($2))"},
{"abs", "std::abs($2)"},
{"log", "std::log(($1)($2))"},
{"exp", "std::exp(($1)($2))"},
{"sqrt", "std::sqrt(($1)($2))"},
{"round", "(($1)std::round(($2)))"},
{"floor", "(($1)std::floor(($2)))"},
{"ceil", "(($1)std::ceil(($2)))"},
{"round_int", "(($1)std::round(($2)))"},
{"floor_int", "(($1)std::floor(($2)))"},
{"ceil_int", "(($1)std::ceil(($2)))"},
{"sin", "(($1) std::sin(($2)))"},
{"asin", "(($1) std::asin(($2)))"},
{"sinh", "(($1) std::sinh(($2)))"},
{"asinh", "(($1) std::asinh(($2)))"},
{"cos", "(($1) std::cos(($2)))"},
{"acos", "(($1) std::acos(($2)))"},
{"cosh", "(($1) std::cosh(($2)))"},
{"acosh", "(($1) std::acosh(($2)))"},
{"tan", "(($1) std::tan(($2)))"},
{"atan", "(($1) std::atan(($2)))"},
{"tanh", "(($1) std::tanh(($2)))"},
{"atanh", "(($1) std::atanh(($2)))"},
{"sigmoid", "(($1) (1.0f/(1.0f+std::exp(std::min($1(-($2)), $1(@if(@strcmp($1,float32)==0,30,300)))))))"},
{"erf", "(($1) std::erf(($2)))"},
{"erfinv", "(jittor::_erfinv($2))"},
{"cast", "(($1)($2))"},
{"pow", "std::pow(($2),($4))"},
{"maximum", "std::max($1($2), $1($4))"},
{"minimum", "std::min($1($2), $1($4))"},
{"mod", "$1(($2)-std::floor(($2)/($4))*($4))"},
{"init_maximum", "-32768.0f"},
{"init_minimum", "32768.0f"},
};
static unordered_map<string,string> both_map {
{"add", "(($2)+($4))"},
{"subtract", "(($2)-($4))"},
{"multiply", "(($2)*($4))"},
{"divide", "($1(($1($2))/($1($4))))"},
{"floor_divide", "($1(($1($2))/($1($4))))"},
{"less", "(($2)<($4))"},
{"less_equal", "(($2)<=($4))"},
{"greater", "(($2)>($4))"},
{"greater_equal", "(($2)>=($4))"},
{"equal", "(($2)==($4))"},
{"not_equal", "(($2)!=($4))"},
{"left_shift", "(($2)<<($4))"},
{"right_shift", "(($2)>>($4))"},
{"logical_and", "(($2)&&($4))"},
{"logical_or", "(($2)||($4))"},
{"logical_xor", "((bool($2))!=(bool($4)))"},
{"bitwise_and", "(($2)&($4))"},
{"bitwise_or", "(($2)|($4))"},
{"bitwise_xor", "(($2)^($4))"},
{"mean", "(($2)+($4)*($1(rcount)))"},
{"init_add", "$1(0)"},
{"init_multiply", "$1(1)"},
{"init_logical_and", "true"},
{"init_logical_or", "false"},
{"init_logical_xor", "false"},
{"init_bitwise_and", "$1(-1)"},
{"init_bitwise_or", "$1(0)"},
{"init_bitwise_xor", "$1(0)"},
{"init_mean", "$1(0)"},
};
string ret;
if (both_map.count(args.at(0)))
ret = both_map[args.at(0)];
else if (use_cuda)
ret = cuda_map[args.at(0)];
else
ret = cpu_map[args.at(0)];
if (use_cuda) {
if (args[1] == "float32" && !both_map.count(args.at(0))) {
ret = common_op_type_cuda_map[args.at(0)];
}
if (args[1] == "float16" || args[1] == "float32") {
for (int i=3; i<args.size(); i+=2) {
if (args[i] != args[1]) {
ret = replace(ret, "$"+S(i-1),
args[1]+"($"+S(i-1)+")");
}
}
} else {
for (int i=3; i<args.size(); i+=2) {
if (args[i] != "float16") {
ret = replace(ret, "$"+S(i-1),
"float16($"+S(i-1)+")");
}
}
}
}
return format(ret, args);
}
void post_pass(OpCompiler* oc) {
string& src = oc->src;
if (src.find("float16") == string::npos)
return;
int i = src.rfind("#include");
if (i<0) i=0;
i = src.find('\n', i) + 1;
src = src.substr(0, i) + "#include \"type/fp16_compute.h\"\n" +
src.substr(i);
return;
}
};
static int _ = registe_op_type(new FP16OpType());
}
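A hedged trace (not part of the commit) of the cast insertion above for a mixed float16/float32 add on CUDA, reproduced with the same str_utils helpers FP16OpType calls:

#include "utils/str_utils.h"
namespace jittor {
static string demo_fp16_mixed_add() {
    vector<string> args = {"add", "float16", "xp[i]", "float16", "yp[i]", "float32"};
    string pat = "(($2)+($4))";               // both_map["add"]
    pat = replace(pat, "$4", "float16($4)");  // rhs is float32, so cast it down to half
    return format(pat, args);
    // -> "((xp[i])+(float16(yp[i])))"
}
} // jittor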

View File

@ -18,7 +18,7 @@ namespace jittor {
typedef int8_t int8;
typedef int16_t int16;
typedef int int32;
typedef int64_t int64;
typedef long long int64;
typedef uint8_t uint8;
typedef uint16_t uint16;
typedef uint32_t uint32;
@ -239,4 +239,6 @@ std::ostream& operator<<(std::ostream& os, const Caster<T,To>& input) {
return os << ']';
}
#define JPU(x) ;
} // jittor

View File

@ -167,14 +167,14 @@ void process(string src, vector<string>& input_names, string& cmd) {
// #include "a.h"
// i jk l
auto j=i+1;
while (j<src.size() && (src[j] != ' ' && src[j] != '\n')) j++;
while (j<src.size() && (src[j] != ' ' && src[j] != '\n' && src[j] != '\r')) j++;
if (j>=src.size()) return;
if (j-i != 8 && j-i != 6) continue;
auto k=j+1;
while (k<src.size() && src[k] == ' ') k++;
if (k>=src.size()) return;
auto l=k+1;
while (l<src.size() && (src[l] != ' ' && src[l] != '\n')) l++;
while (l<src.size() && (src[l] != ' ' && src[l] != '\n' && src[l] != '\r')) l++;
if (src[k] == '"' && src[l-1] == '"' && j-i==8 && src.substr(i,j-i) == "#include") {
auto inc = src.substr(k+1, l-k-2);
if (inc != "test.h" && inc != "helper_cuda.h") {

View File

@ -47,4 +47,151 @@ string strip(const string& s) {
return s.substr(i,j-i);
}
string format(const string& s, const vector<string>& v) {
string ss;
for (int i=0; i<s.size(); i++) {
if (s[i] == '$') {
int j = s[i+1] - '0';
ss += v.at(j);
i ++;
continue;
} else
ss += s[i];
}
return ss;
}
string join(const vector<string>& vs, const string& x) {
string s;
for (int i=0; i<vs.size(); i++) {
s += vs[i];
if (i!=vs.size()-1)
s += x;
}
return s;
}
string replace(const string& a, const string& b, const string& c) {
auto vs = split(a, b);
return join(vs, c);
}
static inline bool isvar(char x) { return isalnum(x) || x == '_' || x == ':'; }
vector<string> token_split(const string& s) {
vector<string> ss;
if (!s.size()) return ss;
ss.push_back(string()+s[0]);
for (int i=1; i<s.size(); i++) {
if (isvar(s[i]) != isvar(s[i-1]))
ss.push_back("");
ss.back() += s[i];
}
return ss;
}
static void parse_reg(const string& src,
vector<string>& patterns,
vector<int>& arg_id) {
patterns.clear();
arg_id.clear();
patterns.push_back("");
for (int j=0; j<src.size(); j++) {
if (src[j] == '$') {
j++;
arg_id.push_back(src[j]-'0');
patterns.push_back("");
continue;
}
patterns.back() += src[j];
}
}
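// Replace the first occurrence of pattern src (with $N capture groups) by dst,
// matching across token boundaries starting at tokens[i].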
void token_replace(vector<string>& tokens, int i, const string& src, const string& dst) {
ASSERT(src.at(0) != '$' && src.at(src.size()-1) != '$' &&
src.at(src.size()-2) != '$') << "illegal src:" << src;
vector<string> patterns;
vector<int> arg_id;
vector<string> patterns2;
vector<int> arg_id2;
unordered_map<int, string> args;
parse_reg(src, patterns, arg_id);
parse_reg(dst, patterns2, arg_id2);
int start_i, start_pos, end_i, end_pos;
int c_i = i, c_pos = 0;
int match_i, match_pos;
string c_arg;
auto match = [&](int c_i, int c_pos, const string& pat) -> bool {
for (int i=0; i<pat.size(); i++) {
if (tokens[c_i][c_pos] != pat[i])
return false;
c_pos ++;
if (c_pos >= tokens[c_i].size()) {
c_pos = 0;
c_i ++;
if (c_i >= tokens.size())
return false;
}
}
match_i = c_i;
match_pos = c_pos;
return true;
};
for (int j=0; j<patterns.size(); j++) {
int ok = 0;
while (c_i < tokens.size()) {
while (c_pos < tokens[c_i].size()) {
if (match(c_i, c_pos, patterns[j])) {
ok = 1;
break;
}
c_arg += tokens[c_i][c_pos];
c_pos ++;
}
if (ok) break;
c_i ++;
c_pos = 0;
}
CHECK(ok) << "Pattern not matched:" << patterns[j] << j;
if (j == 0) {
start_i = c_i;
start_pos = c_pos;
}
if (j) {
args[arg_id[j-1]] = c_arg;
}
c_arg = "";
c_i = match_i;
c_pos = match_pos;
if (j == patterns.size()-1) {
end_i = c_i;
end_pos = c_pos;
}
}
string new_src;
for (int j=0; j<patterns2.size(); j++) {
if (j) new_src += args[arg_id2.at(j-1)];
new_src += patterns2[j];
}
if (start_i == end_i) {
tokens[start_i] = tokens[start_i].substr(0, start_pos) +
new_src + tokens[start_i].substr(end_pos);
} else {
tokens[start_i] = tokens[start_i].substr(0, start_pos)
+ new_src;
tokens[end_i] = tokens[end_i].substr(end_pos);
for (int j=start_i+1; j<end_i; j++)
tokens[j] = "";
}
}
string token_replace(const string& s, const string& src, const string& dst) {
vector<string> ss{s};
token_replace(ss, 0, src, dst);
return join(ss, "");
}
} // jittor
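A hedged usage sketch of the string overload added above ($N in src captures a span that dst re-emits):

#include "utils/str_utils.h"
namespace jittor {
static string demo_token_replace() {
    return token_replace("cudaMemcpy(dst, src, n);",
                         "cudaMemcpy($1)",
                         "cudaMemcpyAsync($1, 0)");
    // -> "cudaMemcpyAsync(dst, src, n, 0);"
}
} // jittor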

View File

@ -27,4 +27,16 @@ vector<string> split(const string& s, const string& sep, int max_split=0);
string strip(const string& s);
string format(const string& s, const vector<string>& v);
string replace(const string& a, const string& b, const string& c);
string join(const vector<string>& vs, const string& x);
vector<string> token_split(const string& s);
void token_replace(vector<string>& tokens, int i, const string& src, const string& dst);
string token_replace(const string& s, const string& src, const string& dst);
} // jittor

View File

@ -14,7 +14,7 @@
namespace jittor {
int64_t Var::number_of_lived_vars = 0;
int64 Var::number_of_lived_vars = 0;
DEFINE_FLAG(fast_shared_ptr<loop_options_t>, compile_options, {},
"Override the default loop transfrom options");
@ -42,7 +42,7 @@ string Var::to_string() {
return s;
}
int64_t Var::numel() {
int64 Var::numel() {
if (!shape.size()) return size=num=-1;
bool negtive = 0;
num=1;

View File

@ -18,13 +18,13 @@ struct Var : Node {
NanoVector shape;
cstr name;
fast_shared_ptr<loop_options_t> loop_options;
static int64_t number_of_lived_vars;
static int64 number_of_lived_vars;
// this var will be generated after alloc.
void* mem_ptr = nullptr;
Allocator* allocator = nullptr;
size_t allocation;
int64_t size, num;
int64 size, num;
inline bool is_float() const { CHECK_EXIST; return ns.is_float(); }
inline int dsize() const { CHECK_EXIST; return ns.dsize(); }
inline NanoString dtype() const { CHECK_EXIST; return ns; }
@ -40,7 +40,7 @@ struct Var : Node {
Var(NanoVector shape, NanoString dtype);
string to_string();
int64_t numel();
int64 numel();
void set_shape(NanoVector shape);
bool alloc(Allocator* allocator);
inline void share_with(Var* x, size_t offset = 0) { CHECK_EXIST; allocator = (Allocator*)x; allocation = offset; }

View File

@ -15,6 +15,7 @@ if __name__ == "__main__":
skip_l = int(os.environ.get("test_skip_l", "0"))
skip_r = int(os.environ.get("test_skip_r", "1000000"))
skip = os.environ.get("test_skip", "").split(",")
test_only = None
if "test_only" in os.environ:
test_only = set(os.environ.get("test_only").split(","))
@ -34,6 +35,9 @@ if __name__ == "__main__":
continue
if test_only and test_name not in test_only:
continue
if any(s and s in test_name for s in skip):
    continue
print("Add Test", _, test_name)
suite.addTest(tests)

View File

@ -0,0 +1,374 @@
from copy import deepcopy
from pathlib import Path
import jittor as jt
import jittor.nn as nn
import numpy as np
import os
split_size = 1000000
conv_opt = int(os.environ.get("conv_opt", "0"))
if conv_opt:
Conv1d_sp = nn.Conv1d_sp
else:
Conv1d_sp = nn.Conv1d
def MLP(channels: list, do_bn=True):
""" Multi-layer perceptron """
n = len(channels)
layers = []
for i in range(1, n):
layers.append(Conv1d_sp(channels[i - 1], channels[i], kernel_size=1, bias=True))
if i < (n - 1):
if do_bn:
layers.append(nn.BatchNorm(channels[i]))
# layers.append(nn.InstanceNorm1d(channels[i]))
# layers.append(nn.LayerNorm(channels[i]))
layers.append(nn.ReLU())
return nn.Sequential(*layers)
def normalize_keypoints(kpts, image_shape):
size = image_shape.flip(1) # shape=(b,2) ;h w -> w, h
center = size / 2
scaling = size.float32().max(1, keepdims=True) * 0.7
return (kpts - center[:, None, :]) / scaling[:, None, :]
class KeypointEncoder(nn.Module):
""" Joint encoding of visual appearance and location using MLPs"""
def __init__(self, feature_dim, layers, keypoint_position_dim=2):
super().__init__()
# self.keypoint_position_dim = keypoint_position_dim
self.encoder = MLP([keypoint_position_dim + 1] + layers + [feature_dim])
nn.init.constant_(self.encoder[-1].bias, 0.0)
def execute(self, kpts, scores):
inputs = jt.concat([kpts.t(), scores.unsqueeze(1)], dim=1)
return self.encoder(inputs)
cnt = 0
def attention(query, key, value):
global cnt
cnt += 1
b, d, h, n = query.shape
# print("attention", b,d,h,n, cnt)
dim_factor = (1.0 / d)**0.5
query = query.transpose(0, 2, 3, 1).reshape(b * h, -1, d) * dim_factor
key = key.transpose(0, 2, 1, 3).reshape(b * h, d, -1)
value = value.transpose(0, 2, 3, 1).reshape(b * h, -1, d)
# print("attention", query.shape, key.shape, value.shape)
data = []
for i in range(0, query.shape[0], split_size):
end = min(i + split_size, query.shape[0])
tmp1 = nn.bmm(query[i:end], key[i:end])
tmp2 = nn.softmax(tmp1, dim=-1)
tmp3 = nn.bmm(tmp2, value[i:end])
tmp3.sync()
data.append(tmp3)
tmp3 = jt.concat(data)
# for i in range(0, query.shape[0], split_size):
# end = min(i + split_size, query.shape[0])
# tmp1 = nn.bmm(query[:,i:end], key[:,i:end])
# tmp2 = nn.softmax(tmp1, dim=-1)
# tmp3 = nn.bmm(tmp2, value[:,i:end])
# tmp3.sync()
# data.append(tmp3)
# tmp3 = jt.concat(data, dim=1)
# tmp1 = nn.bmm(query, key)
# print(tmp1.shape)
# tmp2 = nn.softmax(tmp1, dim=-1)
# print(tmp2.shape)
# tmp3 = nn.bmm(tmp2, value)
# print(tmp3.shape)
return tmp3.reshape(b, h, -1, d).transpose(0, 3, 1, 2)
return nn.bmm(nn.softmax(nn.bmm(query, key), dim=-1), value).reshape(b, h, -1, d).transpose(0, 3, 1, 2)
class MultiHeadedAttention(nn.Module):
""" Multi-head attention to increase model expressivitiy """
def __init__(self, num_heads: int, d_model: int):
super().__init__()
assert d_model % num_heads == 0
self.dim = d_model // num_heads
self.num_heads = num_heads
self.merge = Conv1d_sp(d_model, d_model, kernel_size=1)
self.proj = nn.ModuleList([deepcopy(self.merge) for _ in range(3)])
def execute(self, query, key, value):
batch_dim = query.size(0)
query, key, value = [l(x).reshape(batch_dim, self.dim, self.num_heads, -1) for l, x in zip(self.proj, (query, key, value))]
x = attention(query, key, value)
# x = attention_chunk(query, key, value)
return self.merge(x.reshape(batch_dim, self.dim * self.num_heads, -1))
class AttentionalPropagation(nn.Module):
def __init__(self, feature_dim: int, num_heads: int):
super().__init__()
self.attn = MultiHeadedAttention(num_heads, feature_dim)
self.mlp = MLP([feature_dim * 2, feature_dim * 2, feature_dim])
nn.init.constant_(self.mlp[-1].bias, 0.0)
def execute(self, x, source):
message = self.attn(x, source, source)
return self.mlp(jt.concat([x, message], dim=1))
class AttentionalGNN(nn.Module):
def __init__(self, feature_dim: int, layer_names: list):
super().__init__()
self.layers = nn.ModuleList([AttentionalPropagation(feature_dim, 4) for _ in range(len(layer_names))])
self.is_cross = [x == 'cross' for x in layer_names]
def execute(self, desc0, desc1):
for layer, is_cross in zip(self.layers, self.is_cross):
layer.attn.prob = []
if is_cross:
src0, src1 = desc1, desc0
else: # if name == 'self':
src0, src1 = desc0, desc1
# delta0, delta1 = layer(desc0, src0), layer(desc1, src1)
delta0 = layer(desc0, src0)
# print(delta0.numel()*4)
# breakpoint()
jt.sync_all()
delta1 = layer(desc1, src1)
jt.sync_all()
desc0, desc1 = (desc0 + delta0), (desc1 + delta1)
jt.sync_all()
return desc0, desc1
def log_sinkhorn_iterations(Z, log_mu, log_nu, iters: int):
""" Perform Sinkhorn Normalization in Log-space for stability"""
u, v = jt.zeros_like(log_mu), jt.zeros_like(log_nu)
for _ in range(iters):
u = log_mu - (Z + v.unsqueeze(1)).exp().sum(dim=2).log()
v = log_nu - (Z + u.unsqueeze(2)).exp().sum(dim=1).log()
return Z + u.unsqueeze(2) + v.unsqueeze(1)
def log_optimal_transport(scores, alpha, iters: int):
""" Perform Differentiable Optimal Transport in Log-space for stability"""
b, m, n = scores.shape
ms, ns = jt.float(m, requires_grad=False), jt.float(n, requires_grad=False)
bins0 = alpha.broadcast([b, m, 1])
bins1 = alpha.broadcast([b, 1, n])
alpha = alpha.broadcast([b, 1, 1])
couplings = jt.concat([jt.concat([scores, bins0], -1), jt.concat([bins1, alpha], -1)], 1)
norm = -(ms + ns).log()
log_mu = jt.concat([norm.broadcast([m]), ns.log() + norm])
log_nu = jt.concat([norm.broadcast([n]), ms.log() + norm])
log_mu, log_nu = log_mu[None].broadcast([b, m + 1]), log_nu[None].broadcast([b, n + 1])
Z = log_sinkhorn_iterations(couplings, log_mu, log_nu, iters)
Z = Z - norm # multiply probabilities by M+N
return Z
def arange_like(x, dim: int):
return jt.ones(x.shape[dim], dtype=x.dtype)[None].cumsum()[0] - 1 # traceable in 1.1
default_config = {
'descriptor_dim': 256, # SuperPoint
'weights': 'indoor',
'keypoint_encoder': [32, 64, 128, 256], # SuperPoint
'GNN_layers': ['self', 'cross'] * 9,
'sinkhorn_iterations': 100,
'match_threshold': 0.2,
}
def get_weighted_loss_batch(scores, all_matches):
matches0, matches1 = all_matches.chunk(chunks=2, dim=2)
batchIdx = jt.arange(all_matches.shape[0]).unsqueeze(1).repeat(1, all_matches.shape[1])
batchIdx, matches0, matches1 = batchIdx.view(-1), matches0.view(-1), matches1.view(-1)
valid_index0, valid_index1 = matches0 >= 0, matches1 >= 0
valid_match = jt.logical_and(valid_index0, valid_index1)
valid_unmatch = jt.logical_xor(valid_index0, valid_index1)
num_match = valid_match.sum().maximum(1e-9)
num_unmatch = valid_unmatch.sum().maximum(1e-9)
score_ = scores[batchIdx, matches0, matches1]
score_match_ = (score_*valid_match).float32().sum() / num_match
score_umatch_ = (score_*valid_unmatch).float32().sum() / num_unmatch
return -(num_unmatch * score_match_ + num_match * score_umatch_) / (num_match + num_unmatch)
# print(score_umatch_, score_match_)
# return -(score_match + score_umatch) / (num_match + num_unmatch)
score_match = scores[(batchIdx[valid_match], matches0[valid_match], matches1[valid_match])].float32().mean() if num_match > 0 else 0
score_umatch = scores[(batchIdx[valid_unmatch], matches0[valid_unmatch], matches1[valid_unmatch])].float32().mean() if num_unmatch > 0 else 0
# print(score_match, score_umatch)
return -(num_unmatch * score_match + num_match * score_umatch) / (num_match + num_unmatch)
def add_dustbin(scores, alpha):
b, m, n = scores.shape
bins0 = jt.broadcast(alpha, (b, m, 1))
bins1 = jt.broadcast(alpha, (b, 1, n))
alpha = jt.broadcast(alpha, (b, 1, 1))
couplings = jt.concat([jt.concat([scores, bins0], -1), jt.concat([bins1, alpha], -1)], 1)
return couplings
class SuperGlue(nn.Module):
def __init__(self, config):
super().__init__()
config = {**default_config, **config}
self.descriptor_dim = config['descriptor_dim']
self.keypoint_encoder = config['keypoint_encoder']
self.GNN_layers = config['GNN_layers']
self.sinkhorn_iterations = config['sinkhorn_iterations']
self.match_threshold = config['match_threshold']
self.keypoint_position_dim = config['keypoint_position_dim']
self.use_dual_softmax = config['use_dual_softmax']
self.scale = jt.float(self.descriptor_dim**-0.5).stop_grad()
# self.scale.requires_grad = False
# self.des_extend = MLP([128, 256])
self.kenc = KeypointEncoder(self.descriptor_dim, self.keypoint_encoder, keypoint_position_dim=self.keypoint_position_dim)
self.gnn = AttentionalGNN(self.descriptor_dim, self.GNN_layers)
self.final_proj = Conv1d_sp(self.descriptor_dim, self.descriptor_dim, kernel_size=1, bias=True)
self.bin_score = jt.float(1.0)
def execute(self, data):
"""Run SuperGlue on a pair of keypoints and descriptors"""
kpts0, kpts1 = data['keypoints0'], data['keypoints1']
desc0, desc1 = data['descriptors0'], data['descriptors1']
all_matches = data['all_matches']
# match_num = data['match_num']
if kpts0.shape[1] == 0 or kpts1.shape[1] == 0 or all_matches.shape[1] == 0: # no keypoints or no matches/unmatches
shape0, shape1 = kpts0.shape[:-1], kpts1.shape[:-1]
return {
'matches0': jt.ones(shape0, dtype=jt.int),
'matches1': jt.ones(shape1, dtype=jt.int),
'matching_scores0': jt.zeros(shape0, dtype=jt.float),
'matching_scores1': jt.zeros(shape1, dtype=jt.float),
'skip_train': True
}
# Keypoint normalization.
kpts0 = normalize_keypoints(kpts0, data['shape0'])
kpts1 = normalize_keypoints(kpts1, data['shape1'])
# Keypoint MLP encoder.
# desc0 = self.des_extend(desc0) + self.kenc(kpts0, data['scores0'])
# desc1 = self.des_extend(desc1) + self.kenc(kpts1, data['scores1'])
desc0 = desc0 + self.kenc(kpts0, data['scores0'])
desc1 = desc1 + self.kenc(kpts1, data['scores1'])
# Multi-layer Transformer network.
desc0, desc1 = self.gnn(desc0, desc1)
# Final MLP projection.
desc0, desc1 = self.final_proj(desc0), self.final_proj(desc1)
desc0_t = desc0.t()
losses = []
for i in range(0, desc1.shape[0], split_size):
end = min(desc1.shape[0], i + split_size)
# Compute matching descriptor distance.
scores = nn.bmm(desc0_t[i:end], desc1[i:end]) * self.scale # 457.76 MB
scores.sync()
# Run the optimal transport.
if self.use_dual_softmax:
scores = add_dustbin(scores, self.bin_score) # 458.68 MB
scores.sync()
dual_softmax0, dual_softmax1 = nn.log_softmax(scores, 1), nn.log_softmax(scores, 2)
scores = dual_softmax0 + dual_softmax1 # 458.22 MB
scores.sync()
else:
scores = log_optimal_transport(scores, self.bin_score, iters=self.config['sinkhorn_iterations'])
# loss = torch.stack([get_match_score(scores[b], all_matches[b]) for b in range(all_matches.shape[0])])
loss = get_weighted_loss_batch(scores, all_matches[i:end])
loss.sync()
losses.append(loss)
loss = jt.concat(losses)
'''
# Compute matching descriptor distance.
scores = nn.bmm(desc0.t(), desc1) * self.scale # 457.76 MB
scores.sync()
# Run the optimal transport.
if self.use_dual_softmax:
scores = add_dustbin(scores, self.bin_score) # 458.68 MB
scores.sync()
dual_softmax0, dual_softmax1 = nn.log_softmax(scores, 1), nn.log_softmax(scores, 2)
scores = dual_softmax0 + dual_softmax1 # 458.22 MB
scores.sync()
else:
scores = log_optimal_transport(scores, self.bin_score, iters=self.config['sinkhorn_iterations'])
# loss = torch.stack([get_match_score(scores[b], all_matches[b]) for b in range(all_matches.shape[0])])
loss = get_weighted_loss_batch(scores, all_matches)
# print(scores.shape, all_matches.shape, loss.shape)
'''
# matches0, matches1 = all_matches.chunk(chunks=2, dim=2)
# batchIdx = jt.arange(0, b).unsqueeze(1).repeat(1, num)
# batchIdx, matches0, matches1 = batchIdx.view(-1), matches0.view(-1), matches1.view(-1)
# validmatch = (matches0 >= 0) | (matches1 >= 0)
# batchIdx, matches0, matches1 = batchIdx[validmatch], matches0[validmatch], matches1[validmatch]
# matches0[matches0 == -1] = n
# matches1[matches1 == -1] = m
# loss_mean = -scores[(batchIdx, matches0, matches1)].mean()
# loss_mean = nn.l1_loss(loss_mean, jt.float(0.0))
if not data['return_match']:
return {'loss': loss}
with jt.no_grad():
b, n, m = scores.shape
# Get the matches with score above "match_threshold".
indices0, max0 = scores[:, :-1, :-1].argmax(2)
indices1, max1 = scores[:, :-1, :-1].argmax(1)
mutual0 = jt.arange(0, n)[None] == indices1.gather(1, indices0)
mutual1 = jt.arange(0, m)[None] == indices0.gather(1, indices1)
# zero = scores.new_tensor(0)
# mscores0 = torch.where(mutual0, max0.values.exp(), zero)
mscores0 = max0.exp()
mscores0[mutual0.logical_not()] = 0
# mscores1 = torch.where(mutual1, mscores0.gather(1, indices1), zero)
mscores1 = mscores0.gather(1, indices1)
mscores1[mutual1.logical_not()] = 0
valid0 = mutual0 & (mscores0 > self.match_threshold)
valid1 = mutual1 & valid0.gather(1, indices1)
# indices0 = torch.where(valid0, indices0, indices0.new_tensor(-1))
# indices1 = torch.where(valid1, indices1, indices1.new_tensor(-1))
indices0[valid0.logical_not()] = -1
indices1[valid1.logical_not()] = -1
return {
'matches0': indices0, # use -1 for invalid match
'matches1': indices1, # use -1 for invalid match
'matching_scores0': mscores0,
'matching_scores1': mscores1,
'loss': loss,
}
# does a larger or a smaller score indicate higher confidence? log cannot take negative values

View File

@ -4,8 +4,8 @@ suffix = ""
import jittor as jt
import time
from pathlib import Path
home_path = str(Path.home())
import jittor_utils as jit_utils
home_path = jit_utils.home()
perf_path = os.path.join(home_path, ".cache", "jittor_perf")
def main():

View File

@ -0,0 +1,31 @@
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Maintainers: Dun Liang <randonlang@gmail.com>.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import unittest
import jittor as jt
from .test_core import expect_error
import numpy as np
@unittest.skipIf(not jt.compiler.has_acl, "No ACL found")
class TestACL(unittest.TestCase):
@jt.flag_scope(use_acl=1)
def test_array(self):
print("use_acl", jt.flags.use_acl)
a = jt.array([1,2,3])
np.testing.assert_allclose(a.numpy(), [1,2,3])
@jt.flag_scope(use_acl=1)
def test_add(self):
a = jt.array([1,2,3])
b = a+a
np.testing.assert_allclose(b.numpy(), [2,4,6])
def test_meminfo(self):
jt.display_memory_info()
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,344 @@
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Maintainers: Dun Liang <randonlang@gmail.com>.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import unittest
import jittor as jt
import numpy as np
import os
n = 400000000
# n = 4000000
n = 7680000
def get_mem_band():
a = jt.rand((n)).float32()
for i in range(100):
a.copy().sync()
jt.sync_all(True)
import time
t = time.time()
for i in range(1000):
a.copy().sync()
jt.sync_all(True)
dt = time.time() - t
band = a.numel() * 4 * 2000 / dt / 1024**3
print("Mem band: ", band)
return band
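# Worked example of the formula above: with n = 7,680,000 float32 elements one
# copy reads and writes n*4 bytes each, i.e. roughly 61.4 MB of traffic per
# iteration; 1000 timed iterations therefore move numel*4*2*1000 bytes (the
# 2000 factor), and dividing by the elapsed time and 1024**3 gives GiB/s.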
def check_simple_add_band():
# copy: 816
# S=1 128,1024, ILP=1 634
# S=0 128,1024, ILP=1 734
# S=0 128,512, ILP=1 716
# S=0 64,1024, ILP=1 706
# S=0 256,1024, ILP=1 706
def test(S=0, B=128, T=1024, ILP=1):
a = jt.rand((n)).float32()
jt.sync_all(True)
jt.flags.log_silent = 1
with jt.profile_scope(100, 1000) as rep:
b = jt.code(a.shape, a.dtype, [a],
cuda_header="#include \"type/fp16_compute.h\"",
cuda_src=f"""
__global__ void kernel(in0_type * __restrict__ a, in0_type* __restrict__ b, int num) {{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tnum = blockDim.x * gridDim.x;
#define ILP {ILP}
for (int i=tid*ILP; i<num; i+=tnum*ILP) {{
// b[i] = a[i];
vload<ILP*sizeof(in0_type)>(b+i, a+i);
{"__syncthreads();" if S else ""}
}}
}}
kernel<<<{B},{T}>>>(in0_p, out0_p, in0->num);
""")
b.sync()
bw = float(rep[-1][9]) / 1024**3
s = f"S={S}, B={B}, T={T}, ILP={ILP} BW={bw}"
print(s)
return s, bw
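# Note: `rep` is the report table returned by jt.profile_scope; the code above
# assumes the last row's column 9 holds the measured memory throughput in
# bytes per second, so dividing by 1024**3 reports GiB/s.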
def test2(S=0, B=128, T=1024, ILP=1):
a = jt.rand((n)).float32()
jt.sync_all(True)
# jt.flags.log_silent = 0
with jt.profile_scope(10, 1000) as rep:
b = jt.code(a.shape, a.dtype, [a],
cuda_header="#include \"type/fp16_compute.h\"",
cuda_src=f"""
__global__ void kernel(float2 * __restrict__ a, float2* __restrict__ b, int num) {{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tnum = blockDim.x * gridDim.x;
#define ILP 1
for (int i=tid*ILP; i<num; i+=tnum*ILP) {{
b[i] = a[i];
// b[i+1] = a[i+1];
// vload<ILP*sizeof(in0_type)>(b+i, a+i);
{"__syncthreads();" if S else ""}
}}
}}
kernel<<<{B},{T}>>>((float2*)in0_p, (float2*)out0_p, in0->num/2);
""")
b.sync()
bw = float(rep[-1][9]) / 1024**3
s = f"T2: S={S}, B={B}, T={T}, ILP={ILP} BW={bw}"
print(s)
return s, bw
def test3(S=0, B=128, T=1024, ILP=1, C=0):
a = jt.rand((n)).float32()
b = jt.rand(B)
jt.sync_all(True)
jt.flags.log_silent = 1
with jt.profile_scope(100, 1000) as rep:
b = jt.code(a.shape, a.dtype, [a, b],
cuda_header="#include \"type/fp16_compute.h\"",
cuda_src=f"""
__global__ void kernel(in0_type * __restrict__ a, in0_type* __restrict__ b, int num) {{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tnum = blockDim.x * gridDim.x;
#define ILP {ILP}
for (int i=tid*ILP; i<num; i+=tnum*ILP) {{
// b[i] = a[i];
vload<ILP*sizeof(in0_type)>(b+i, a+i);
{"__syncthreads();" if S else ""}
}}
{"__syncthreads();" if C else ""}
}}
kernel<<<in1->shape[0],{T}>>>(in0_p, out0_p, in0->num);
""")
b.compile_options = {"FLAGS: -Xptxas -dlcm=ca ": C}
# b.compile_options = {"FLAGS: Xptxas dlcm=ca ": 1}
b.sync()
bw = float(rep[-1][9]) / 1024**3
s = f"T3: S={S}, B={B}, T={T}, ILP={ILP} C={C} BW={bw}"
print(s)
return s, bw
def test4(S=0, B=128, T=1024, ILP=1, C=0, name="b.png"):
a = jt.rand((n)).float32()
b = jt.rand(B*4).uint32()
jt.sync_all(True)
# jt.flags.log_silent = 1
with jt.profile_scope(100, 10000) as rep:
_ = jt.code(a.shape, a.dtype, [a, b],
cuda_header="#include \"type/fp16_compute.h\"",
cuda_src=f"""
__device__ uint get_smid(void) {{
uint ret;
asm("mov.u32 %0, %smid;" : "=r"(ret) );
return ret;
}}
__device__ uint get_time(void) {{
uint ret;
asm volatile("mov.u32 %0, %%globaltimer_lo;" : "=r"(ret));
return ret;
}}
__global__ void kernel(in0_type * __restrict__ a, in0_type* __restrict__ b, int num, in1_type* __restrict__ c) {{
uint t = get_time();
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int tnum = blockDim.x * gridDim.x;
#define ILP {ILP}
for (int i=tid*ILP; i<num; i+=tnum*ILP) {{
// b[i] = a[i];
vload<ILP*sizeof(in0_type)>(b+i, a+i);
{"__syncthreads();" if S else ""}
}}
{"__syncthreads();" if C else ""}
if (threadIdx.x == 0)
((uint4* __restrict__)c)[blockIdx.x] =
uint4{{get_smid(), t, get_time(), 0}};
}}
kernel<<<in1->shape[0]/4,{T}>>>(in0_p, out0_p, in0->num, in1_p);
""")
_.compile_options = {"FLAGS: -Xptxas -dlcm=ca ": C}
# b.compile_options = {"FLAGS: Xptxas dlcm=ca ": 1}
_.sync()
bw = float(rep[-1][9]) / 1024**3
b = b.data.reshape(-1, 4)[:,:3]
mint = b[:,1].min()
b[:,1:] -= mint
smmax = int(b[:,0].max())
smmin = int(b[:,0].min())
maxt = b.max()
# print(b)
s = f"T4: S={S}, B={B}, T={T}, ILP={ILP} C={C} BW={bw:.3f} sm={smmin},{smmax} maxt={maxt}"
print(s)
import pylab as pl
pl.figure(figsize=(16,16))
texts = []
pret = np.zeros(200, dtype="uint32")
for i in range(B):
smid, s, t = b[i]
pl.plot([s,t], [smid, smid], 'ro-')
texts.append((s, smid, i))
texts.append((t, smid, i))
texts = sorted(texts)
for (s, smid, bid) in texts:
cpos = max(pret[smid], s)
pl.text(cpos, smid, str(bid))
pret[smid] = cpos + maxt // 30
# print("???")
# adjust_text(texts, arrowprops=dict(arrowstyle='->', color='blue'))
# print("???")
pl.savefig(name)
pl.close()
return s, bw
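# test4 additionally records, per thread block, its SM id and the start/end of
# %globaltimer_lo, then draws one horizontal segment per block so the mapping
# of blocks onto SMs over time can be inspected. The sweep over B in
# range(70, 83) below apparently probes the point where the block count exceeds
# the number of SMs; the "# 82", "# 82*3 = 246" notes further down suggest a
# GPU with 82 SMs.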
# test(S=0, B=128, T=1024, ILP=1)
# test(S=1, B=128, T=1024, ILP=1)
# test(S=0, B=64, T=1024, ILP=1)
# test(S=0, B=256, T=1024, ILP=1)
# test(S=1, B=128, T=512, ILP=1)
# test(S=1, B=128, T=256, ILP=1)
# test(S=0, B=128, T=1024, ILP=2)
# test(S=0, B=128, T=1024, ILP=4)
# test(S=0, B=128, T=512, ILP=2)
# test(S=0, B=128, T=512, ILP=4)
# test(S=1, B=128, T=1024, ILP=2)
# test(S=1, B=128, T=1024, ILP=4)
# test(S=1, B=128, T=1024, ILP=8)
# test(S=1, B=128, T=1024, ILP=16)
# test(S=1, B=128, T=512, ILP=2)
# test(S=1, B=128, T=512, ILP=4)
# test(S=1, B=256, T=1024, ILP=2)
# test(S=1, B=512, T=1024, ILP=2)
# test(S=1, B=256, T=1024, ILP=4)
# test(S=1, B=256, T=1024, ILP=8)
# test(S=1, B=256, T=1024, ILP=16)
# test(S=1, B=256, T=512, ILP=2)
# test(S=1, B=256, T=512, ILP=4)
# test(S=1, B=128, T=256, ILP=2)
# test(S=1, B=128, T=256, ILP=4)
# test(S=0, B=128, T=256, ILP=2)
# test(S=0, B=128, T=256, ILP=4)
# for b in [1, 2, 4, 8, 16, 32, 64, 128,256]:
# test(S=1, B=b, T=512, ILP=2)
import matplotlib as mpl
mpl.use('Agg')
import pylab as pl
import numpy as np
# test4(S=1, B=82, T=1024, ILP=2, C=0, name="b.png")
# test4(S=1, B=83, T=1024, ILP=2, C=0, name="c.png")
# test4(S=1, B=82*3, T=512, ILP=2, C=0, name="d1.png")
# test4(S=1, B=82*3+1, T=512, ILP=2, C=0, name="d2.png")
# test4(S=1, B=82*6+1, T=512, ILP=2, C=0, name="d3.png")
# test4(S=0, B=82*6+1, T=512, ILP=2, C=0, name="d4.png")
for b in range(70, 83):
test4(S=1, B=b, T=1024, ILP=2, C=0, name=f"b-{b}.png")
# data = []
# for b in range(32, 2000, 8):
# _, bw = test3(S=0, B=b, T=32, ILP=2)
# data.append([b, bw])
# data = np.array(data)
# pl.plot(data[:,0], data[:,1])
# for t in [32, 64, 128, 256, 512, 1024]:
# data = []
# for b in range(32, 2000, 8):
# _, bw = test3(S=1, B=b*(1024//t), T=t, ILP=2)
# data.append([b, bw])
# data = np.array(data)
# pl.plot(data[:,0], data[:,1])
# for t in [1024]:
# for c in [0,1]:
# data = []
# # for b in range(32, 1000, 8):
# for b in range(32, 33, 8):
# _, bw = test3(S=c, B=b*(1024//t), T=t, ILP=2, C=0)
# data.append([b, bw])
# data = np.array(data)
# pl.plot(data[:,0], data[:,1])
# for ilp in [2]:
# for s in [1]:
# for t in [1024,512,256,128]:
# data = []
# for b in range(32, 1100, 8):
# _, bw = test3(S=s, B=b*(1024//t), T=t, ILP=ilp)
# data.append([b, bw])
# data = np.array(data)
# pl.plot(data[:,0], data[:,1])
# pl.savefig("a.png")
# pl.close()
# for b in range(80, 90, 1):
# _, bw = test3(S=1, B=b, T=1024, ILP=2)
# # 82
# for b in range(240, 260, 1):
# _, bw = test3(S=1, B=b, T=512, ILP=2)
# # 82*3 = 246
# for b in range(240, 500, 1):
# _, bw = test3(S=1, B=b, T=256, ILP=2)
# # 492 = 82*6
# for b in range(240, 1000, 1):
# _, bw = test3(S=1, B=b, T=128, ILP=2)
# # 984 = 82*12
# for b in [128,256]:
# test(S=1, B=b, T=1024, ILP=2)
# for b in [128,256]:
# test(S=0, B=b, T=512, ILP=2)
# for b in [128,256]:
# test(S=0, B=b, T=1024, ILP=2)
# for b in [128,256]:
# test(S=1, B=b, T=512, ILP=1)
# for b in [128,256]:
# test(S=1, B=b, T=1024, ILP=1)
# for b in [128,256]:
# test(S=0, B=b, T=512, ILP=1)
# for b in [128,256]:
# test(S=0, B=b, T=1024, ILP=1)
# test(S=1, B=128, T=512, ILP=4)
# test(S=1, B=64, T=512, ILP=2)
# test(S=1, B=80, T=512, ILP=2)
# test(S=1, B=100, T=512, ILP=2)
# test(S=1, B=110, T=512, ILP=2)
# test(S=1, B=115, T=512, ILP=2)
# test(S=1, B=120, T=512, ILP=2)
# test(S=1, B=130, T=512, ILP=2)
# test(S=1, B=140, T=512, ILP=2)
# test2(S=1, B=128, T=512, ILP=2)
# test(S=1, B=128, T=256, ILP=4)
# test(S=1, B=128, T=128, ILP=8)
# test(S=1, B=128, T=64, ILP=16)
@unittest.skipIf(not jt.compiler.has_cuda, "No CUDA found")
class TestBenchmarkCUDA(unittest.TestCase):
def setUp(self):
jt.flags.use_cuda = 1
def tearDown(self):
jt.flags.use_cuda = 0
def test_main(self):
return
get_mem_band()
check_simple_add_band()
if __name__ == "__main__":
unittest.main()

View File

@ -19,12 +19,12 @@ def all_eq(x, y):
y = convert(y)
if str(x.dtype).startswith("float"):
return str(y.dtype).startswith("float") and x.shape == y.shape and (x==y).all()
return x.dtype == y.dtype and x.shape == y.shape and (x==y).all()
return x.dtype == y.dtype and x.shape == y.shape and np.testing.assert_allclose(x, y)
def check(op, *args):
x = eval(f"np.{op}(*args)")
y = eval(f"jt.{op}(*args).data")
assert all_eq(x, y), f"{x}\n{y}"
all_eq(x, y)
class TestBinaryOp(unittest.TestCase):
def test_binary_op(self):
@ -47,6 +47,9 @@ class TestBinaryOp(unittest.TestCase):
def test_i(self):
def check(op, a, b):
if isinstance(a, list):
a = np.array(a)
b = np.array(b)
if jt.flags.use_cuda and op == "@":
return
if op=="@":
@ -65,13 +68,13 @@ class TestBinaryOp(unittest.TestCase):
a = np.float32(a)
ja = np.float32(ja)
assert all_eq(ja, a), (ja,a)
all_eq(ja, a)
check("+", 5, 2)
check("-", 5, 2)
check("*", 5, 2)
check("/", 5, 2)
check("//", 5, 2)
check("@", [[5]], [[2]])
# check("@", [[5]], [[2]])
check("%", 5, 2)
check("**", 5, 2)
check("<<", 5, 2)
@ -80,6 +83,15 @@ class TestBinaryOp(unittest.TestCase):
check("^", 5, 2)
check("|", 5, 2)
check("+", [5.0,6.0], [2.0,3.0])
check("-", [5.0,6.0], [2.0,3.0])
check("*", [5.0,6.0], [2.0,3.0])
check("/", [5.0,6.0], [2.0,3.0])
check("//", [5.0,6.0], [2.0,3.0])
check("@", [[5,6],[7,8]], [[2,3],[4,5]])
check("%", [5.0,6.0], [2.0,3.0])
check("**", [5.0,6.0], [2.0,3.0])
def test_r(self):
def check(op, a, b):
a = np.array(a)
@ -97,7 +109,7 @@ class TestBinaryOp(unittest.TestCase):
a = eval(f"a {op} b")
a = np.array(a)
assert all_eq(jc, a), f"\n{jc}\n{a}"
all_eq(jc, a)
check("+", 5, 2)
check("-", 5, 2)
check("*", 5, 2)
@ -118,6 +130,7 @@ class TestBinaryOp(unittest.TestCase):
a = np.random.rand(10)
b = np.random.rand(10)
c = np.random.rand(10)
tol = 1e-2 if jt.flags.amp_reg & 2 else 1e-4
for op in ops:
func = lambda x: eval(f"((x[0]{op}x[1])*x[2]).sum()")
x, grads = ngrad(func, [a,b,c], 1e-8)
@ -127,7 +140,7 @@ class TestBinaryOp(unittest.TestCase):
jx = eval(f"(ja{op}jb)*jc")
jgrads = jt.grad(jx, [ja,jb,jc])
for jd, nd in zip(jgrads, grads):
assert (np.abs(jd.data-nd)<1e-4).all(), f"\n{jd.data}\n{nd}"
np.testing.assert_allclose(jd.data, nd, atol=tol, rtol=tol)
def test_mod_float(self):
a = jt.random((10,))
@ -137,7 +150,8 @@ class TestBinaryOp(unittest.TestCase):
a = jt.random((10,), 'float64')
b = jt.random((10,), 'float64')
c = a % b
assert np.allclose(c.data, a.data % b.data)
assert np.allclose(c.data, a.data % b.data, a.data, b.data)
if jt.flags.amp_reg & 2: return
a = jt.random((10,)) * 1000
b = (jt.random((10,)) * 10).int() + 1
c = a % b
@ -169,5 +183,19 @@ class TestBinaryOp(unittest.TestCase):
class TestBinaryOpCuda(TestBinaryOp, test_cuda(2)):
pass
class TestBinaryOpCpuFp16(TestBinaryOp):
def setUp(self):
jt.flags.amp_reg = 2 | 4 | 8 | 16
def tearDown(self):
jt.flags.amp_reg = 0
class TestBinaryOpCudaFp16(TestBinaryOp):
def setUp(self):
jt.flags.amp_reg = 2 | 4 | 8 | 16
jt.flags.use_cuda = 1
def tearDown(self):
jt.flags.amp_reg = 0
jt.flags.use_cuda = 0
if __name__ == "__main__":
unittest.main()

View File

@ -106,5 +106,18 @@ class TestCore(unittest.TestCase):
a.y = 2
assert a.y == 2
def test_modules(self):
a = jt.Module()
a.x = jt.Module()
a.y = jt.Module()
a.a = jt.array([1,2,3])
a.b = jt.array([1,2,3])
assert list(a._modules.keys()) == ["x", "y"]
assert a._modules['x'] is a.x
assert a._modules['y'] is a.y
assert list(a._parameters.keys()) == ['a', 'b']
assert a._parameters['a'] is a.a
assert a._parameters['b'] is a.b
if __name__ == "__main__":
unittest.main()

View File

@ -101,6 +101,12 @@ class TestCuda(unittest.TestCase):
assert a.shape == [3,4,5] and a.dtype == 'float'
assert (-na.flatten() == range(3*4*5)).all(), na
def test_cuda_fused_op(self):
a = jt.array([1,2,3])
a.sync()
with jt.flag_scope(use_cuda=1):
((a+a)*2).data
@unittest.skipIf(jt.compiler.has_cuda, "Only test without CUDA")
class TestNoCuda(unittest.TestCase):

View File

@ -123,8 +123,8 @@ class TestCudnnConvOp(unittest.TestCase):
logs = find_log_with_re(raw_log, "(Jit op key (not )?found: cudnn_conv.*)")
assert len(logs)==3 and "oihw" in logs[0][0], logs
assert np.allclose(y.data, cy.data)
np.testing.assert_allclose(dx.data, cdx.data, atol=1e-2)
np.testing.assert_allclose(dw.data, cdw.data, atol=1e-2)
np.testing.assert_allclose(dx.data, cdx.data, atol=1e-2, rtol=1e-3)
np.testing.assert_allclose(dw.data, cdw.data, atol=1e-2, rtol=1e-3)
if os.name == 'nt': return
check([10,3,100,100], [5,3,3,3], stride=2, padding=0, dilation=1)
check([10,4,40,50], [5,4,5,5], stride=1, padding=1, dilation=1)

View File

@ -0,0 +1,347 @@
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Maintainers: Dun Liang <randonlang@gmail.com>.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import unittest
import jittor as jt
import numpy as np
import os
def transpose0231(x):
s0, s1, s2, s3 = x.shape
asize = 16
bsize = 16
ILP = 2
return jt.code([s0, s2, s3, s1], x.dtype, [x],
cuda_header="#include <type/fp16_compute.h>\n#include <cassert>",
cuda_src=f"""
__global__ void kernel(in0_type* __restrict__ x, in0_type* __restrict__ y, int s0, int s1, int s2, int s3) {{
__shared__ in0_type t[{asize*ILP}*{bsize*ILP+1}];
int t3 = threadIdx.x % {bsize};
int t1 = threadIdx.x / {bsize};
int b3 = blockIdx.x;
int b2 = blockIdx.y;
int b0 = blockIdx.z;
int x3 = 1;
int x2 = s3;
int x1 = s2*x2;
int x0 = s1*x1;
int y3 = 1;
int y2 = s1;
int y1 = s3*y2;
int y0 = s2*y1;
in0_type tmp[{ILP}];
for (int i=0; i<(s1-1)/{asize*ILP}+1; i++)
{{
int _b3 = b3 * {bsize*ILP} + t3*{ILP};
if (_b3 < s3) {{
#pragma unroll
for (int j=0; j<{ILP}; j++) {{
vload<sizeof(in0_type)*{ILP}>(
tmp,
&x[b0*x0+(t1*{ILP}+j+i*{asize*ILP})*x1+b2*x2+_b3*x3]
);
#pragma unroll
for (int k=0; k<{ILP}; k++)
t[(t1*{ILP}+j)*{bsize*ILP+1}+t3*{ILP}+k] = tmp[k];
}}
}}
__syncthreads();
int t3_ = threadIdx.x % {asize};
int t1_ = threadIdx.x / {asize};
_b3 = b3 * {bsize*ILP} + t1_*{ILP};
if (_b3 < s3) {{
#pragma unroll
for (int j=0; j<{ILP}; j++) {{
#pragma unroll
for (int k=0; k<{ILP}; k++) {{
tmp[k] =
t[(t3*{ILP}+k)*{bsize*ILP+1}+t1_*{ILP}+j];
}}
vload<sizeof(in0_type)*{ILP}>(
&y[b0*y0+b2*y1+(_b3+j)*y2+((t3*{ILP})+i*{asize*ILP})*y3],
tmp
);
}}
}}
__syncthreads();
}}
}}
int s0, s1, s2, s3;
in0->shape.unpack(s0, s1, s2, s3);
kernel<<<{{(s3-1)/{bsize*ILP}+1, s2, s0 }}, {bsize*asize}>>>
(in0_p, out0_p, s0, s1, s2, s3);
""")
def transpose0231_2(x):
s0, s1, s2, s3 = x.shape
asize = 16
bsize = 8
ILP = 2
return jt.code([s0, s2, s3, s1], x.dtype, [x],
cuda_header="#include <type/fp16_compute.h>\n#include <cassert>",
cuda_src=f"""
__global__ __launch_bounds__({asize*bsize}) void kernel(in0_type* __restrict__ x, in0_type* __restrict__ y, int s0, int s1, int s2, int s3) {{
__shared__ in0_type t[{asize*ILP}*{bsize*ILP+1}];
int t3 = threadIdx.x % {bsize};
int t1 = threadIdx.x / {bsize};
int b3 = blockIdx.x;
int b1 = blockIdx.y;
int b2 = 0;
int b0 = blockIdx.z;
int x3 = 1;
int x2 = s3;
int x1 = s2*x2;
int x0 = s1*x1;
int y3 = 1;
int y2 = s1;
int y1 = s3*y2;
int y0 = s2*y1;
in0_type tmp[{ILP}];
{{
int _b3 = b3 * {bsize*ILP} + t3*{ILP};
if (_b3 < s3) {{
#pragma unroll
for (int j=0; j<{ILP}; j++) {{
if (t1*{ILP}+j+b1*{asize*ILP} >= s1)
continue;
vload<sizeof(in0_type)*{ILP}>(
tmp,
&x[b0*x0+(t1*{ILP}+j+b1*{asize*ILP})*x1+b2*x2+_b3*x3]
);
#pragma unroll
for (int k=0; k<{ILP}; k++)
t[(t1*{ILP}+j)*{bsize*ILP+1}+t3*{ILP}+k] = tmp[k];
}}
}}
__syncthreads();
int t3_ = threadIdx.x % {asize};
int t1_ = threadIdx.x / {asize};
_b3 = b3 * {bsize*ILP} + t1_*{ILP};
int yy3 = (t3_*{ILP})+b1*{asize*ILP};
if (_b3 < s3 && yy3 < s1) {{
#pragma unroll
for (int j=0; j<{ILP}; j++) {{
#pragma unroll
for (int k=0; k<{ILP}; k++) {{
tmp[k] =
t[(t3_*{ILP}+k)*{bsize*ILP+1}+t1_*{ILP}+j];
}}
vload<sizeof(in0_type)*{ILP}>(
&y[b0*y0+b2*y1+(_b3+j)*y2+yy3*y3],
tmp
);
// printf("%d %d %d %d %d\\n", b0*y0+b2*y1+(_b3+j)*y2+yy3*y3,
// b0, b2, (_b3+j), yy3);
}}
}}
__syncthreads();
}}
}}
int s0, s1, s2, s3;
in0->shape.unpack(s0, s1, s2, s3);
kernel<<<{{(s3-1)/{bsize*ILP}+1, (s1-1)/{asize*ILP}+1, s0 }}, {bsize*asize}>>>
(in0_p, out0_p, s0, s1, s2, s3);
""")
def check_share():
return
a = jt.rand((30, 32, 4, 2000)).float32()
jt.code(a.shape, a.dtype, [a],
cuda_header="#include <type/fp16_compute.h>\n#include <cassert>",
cuda_src="""
__global__ void kernel(in0_type* __restrict__ a, in0_type* __restrict__ b) {
__shared__ float x[32*33];
for (int i=0; i<3; i++) {
((float2*)&x[i])[0] = ((float2*)&a[i])[0];
((float2*)&b[i])[0] = ((float2*)&x[i+1])[0];
}
}
kernel<<<1024,16*16>>>(in0_p, out0_p);
""").sync()
jt.sync_all(True)
# print(a[0]+1)
print("pass test")
class TestFP16(unittest.TestCase):
def test_array(self):
a = np.array([1,2,3], dtype="float16")
b = jt.array(a)
np.testing.assert_allclose(a, b.data)
def test_add(self):
a = np.array([1,2,3], dtype="float16")
b = jt.array(a)
c = b+b
np.testing.assert_allclose(c.data, a+a)
d = c.sum()
np.testing.assert_allclose(d.data, [12])
c = c+1
print(c)
def test_matmul(self):
a = jt.random((100,100)).float16()
b = jt.random((100,100)).float16()
c = jt.matmul(a, b)
c.sync()
def test_matmul_grad(self):
a = jt.random((100,100)).float16()
b = jt.random((100,100)).float16()
c = jt.matmul(a, b)
c.sync()
da, db = jt.grad(c, [a,b])
jt.sync_all()
assert da.dtype == "float16"
assert db.dtype == "float16"
def test_array_random_auto_cast(self):
a = jt.array([1.0,2.0])
assert a.dtype == "float32"
with jt.flag_scope(amp_reg=2+16):
a = jt.array([1.0,2.0])
assert a.dtype == "float16", a.dtype
a = jt.random([10])
assert a.dtype == "float32"
with jt.flag_scope(amp_reg=2+16):
a = jt.random([10])
assert a.dtype == "float16", a.dtype
def test_conv(self):
a = jt.random((3,4,5,5)).float16()
b = jt.random((4,4,3,3)).float16()
c = jt.nn.conv(a, b)
c.sync()
def test_max(self):
a = jt.random((100,)).float16()
b = jt.random((100,)).float16()
c = a.maximum(b)
c.sync()
def test_reduce_dtype_infer(self):
with jt.flag_scope(amp_reg=1):
a = jt.random((3,4,5,5)).float16()
b = a.sum()
b.sync()
assert b.dtype == "float32"
with jt.flag_scope(amp_reg=2):
a = jt.random((3,4,5,5)).float16()
b = a.sum()
b.sync()
assert b.dtype == "float32"
with jt.flag_scope(amp_reg=0):
a = jt.random((3,4,5,5)).float16()
b = a.sum()
b.sync()
assert b.dtype == "float32"
with jt.flag_scope(amp_reg=2+4):
a = jt.random((3,4,5,5)).float16()
b = a.sum()
b.sync()
assert b.dtype == "float16", b.dtype
def test_white_dtype_infer(self):
with jt.flag_scope(amp_reg=1):
a = jt.random((3,4,5,5)).float16()
b = a**a
b.sync()
assert b.dtype == "float32"
with jt.flag_scope(amp_reg=2):
a = jt.random((3,4,5,5)).float16()
b = a**a
b.sync()
assert b.dtype == "float32"
with jt.flag_scope(amp_reg=0):
a = jt.random((3,4,5,5)).float16()
b = a**a
b.sync()
assert b.dtype == "float32"
with jt.flag_scope(amp_reg=2+8):
a = jt.random((3,4,5,5)).float16()
b = a**a
b.sync()
assert b.dtype == "float16", b.dtype
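# As far as these tests exercise it, the amp_reg bits compose as follows:
# bit 2 prefers float16 (bit 1 would prefer float32), bit 4 lets reductions
# such as sum() keep float16 instead of widening to float32, bit 8 does the
# same for white-listed ops such as pow, and bit 16 (see
# test_array_random_auto_cast) applies the preference to newly created
# jt.array / jt.random tensors as well; 2 | 4 | 8 | 16 therefore keeps as much
# of the computation as possible in float16.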
def test_module_half(self):
a = jt.nn.Linear(10,10)
assert a.weight.dtype == "float32"
a.half()
assert a.weight.dtype == "float16"
@unittest.skipIf(not jt.compiler.has_cuda, "No CUDA found")
class TestFP16CUDA(TestFP16):
def setUp(self):
jt.flags.use_cuda = 1
def tearDown(self):
jt.flags.use_cuda = 0
def test_softmax(self):
a = jt.rand((120, 2000, 2000)).float16()
# a = jt.rand((1, 2000, 2000)).float32()
jt.sync_all()
with jt.profile_scope(10, 100):
a.log_softmax(-1).sync()
def test_transpose(self):
check_share()
# return
a = jt.rand((30, 32, 4, 2000)).float32()
# a = jt.rand((1, 1024, 1, 2000)).float32()
diff = transpose0231(a).data != a.transpose((0,2,3,1)).data
print(np.where(diff))
# return
jt.sync_all()
# with jt.profile_scope(100, 11000):
with jt.profile_scope(100, 11000):
# a.log_softmax(-1).sync()
transpose0231(a).sync()
a.transpose((0,2,3,1)).sync()
# a.transpose((0,2,1,3)).sync()
a.fuse_transpose((0,2,1,3)).sync()
(a+1).sync()
jt.sync_all(True)
diff = transpose0231(a).data != a.transpose((0,2,3,1)).data
print(np.where(diff))
np.testing.assert_allclose(transpose0231(a).data, a.transpose((0,2,3,1)).data)
def test_transpose2(self):
# check_share()
# return
# a = jt.rand((30, 32, 4, 2000)).float32()
# a = jt.rand((1, 10000, 1, 2000)).float32()
a = jt.rand((1, 10000, 1, 2048)).float32()
print("transpose")
transpose0231_2(a).sync()
print("add")
(a+1).sync()
return
# a = jt.arange(32*16).reshape((1, 32, 1, 16))
diff = transpose0231_2(a).data != a.transpose((0,2,3,1)).data
print(np.where(diff))
# return
jt.sync_all()
# with jt.profile_scope(100, 11000):
with jt.profile_scope(100, 1100):
# a.log_softmax(-1).sync()
transpose0231_2(a).sync()
a.transpose((0,2,3,1)).sync()
# a.transpose((0,2,1,3)).sync()
a.fuse_transpose((0,2,1,3)).sync()
(a+1).sync()
jt.sync_all(True)
diff = transpose0231_2(a).data != a.transpose((0,2,3,1)).data
print(np.where(diff))
np.testing.assert_allclose(transpose0231_2(a).data, a.transpose((0,2,3,1)).data)
if __name__ == "__main__":
unittest.main()

View File

@ -10,12 +10,12 @@
import unittest
import os, sys
import jittor as jt
from pathlib import Path
import jittor_utils as jit_utils
class TestLock(unittest.TestCase):
def test(self):
if os.environ.get('lock_full_test', '0') == '1':
cache_path = os.path.join(str(Path.home()), ".cache", "jittor", "lock")
cache_path = os.path.join(jit_utils.home(), ".cache", "jittor", "lock")
assert os.system(f"rm -rf {cache_path}") == 0
cmd = f"cache_name=lock {sys.executable} -m jittor.test.test_example"
else:

View File

@ -75,6 +75,8 @@ class TestPad(unittest.TestCase):
print('pass flip test ...')
def test_cross(self):
def check_equal(a, b, tol):
np.testing.assert_allclose(a.detach().numpy(), b.numpy(), atol=1e-5)
arr1 = np.random.randn(16,3,224,224,3)
arr2 = np.random.randn(16,3,224,224,3)
check_equal(torch.Tensor(arr1).cross(torch.Tensor(arr2), dim=1), jt.array(arr1).cross(jt.array(arr2), dim=1), 1e-1)
@ -257,5 +259,52 @@ class TestOther(unittest.TestCase):
a = jt.arctan2(jt.array([1,1.0,0]), jt.array([1,0.0,-1]))
np.testing.assert_allclose(a.data, [0.7853982,1.5707964,3.1415927])
y = jt.random((100,))
x = jt.random((100,))
z = jt.arctan2(y, x)
z2 = np.arctan2(y.data, x.data)
np.testing.assert_allclose(z.data, z2)
def test_code_softmax(self):
if not jt.has_cuda: return
def softmax(x, dim = None, log=False):
if dim is None:
x = (x - x.max()).exp()
ret = x / x.sum()
else:
x = (x-x.max(dim, keepdims=True)).exp()
ret = x / x.sum(dim, keepdims=True)
if log: return ret.log()
return ret
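# Subtracting the per-row max before exp() does not change the result
# (softmax(x) == softmax(x - c) for any constant c) but keeps exp() from
# overflowing, which matters in particular for float16 inputs.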
from jittor.other.code_softmax import softmax_v1
with jt.flag_scope(use_cuda = 1):
shape = (120, 2000, 2000)
shape = (3,3)
for log in [0,1]:
for shape in [(3,3),
(12, 200, 2000),
(12, 200, 2048),
(12, 200, 2049)]:
print(shape)
a = jt.rand(shape)
c = jt.rand(shape)
b = softmax(a, -1, log=log)
bb = softmax_v1(a, log=log)
err = (bb - b).abs().max()
assert err.item() < 1e-5, (err, bb, b)
d1 = jt.grad(b*c, a)
d2 = jt.grad(bb*c, a)
err = (d1 - d2).abs().max()
if log:
assert err.item() < 1e-2, (err.item())
else:
assert err.item() < 1e-5, (err.item())
if __name__ == "__main__":
unittest.main()

View File

@ -8,10 +8,10 @@ import unittest, os
import jittor as jt
from jittor import LOG
import sys
from pathlib import Path
import jittor_utils as jit_utils
dirname = os.path.join(jt.flags.jittor_path, "notebook")
notebook_dir = os.path.join(str(Path.home()), ".cache","jittor","notebook")
notebook_dir = os.path.join(jit_utils.home(), ".cache","jittor","notebook")
tests = []
for mdname in os.listdir(dirname):
if not mdname.endswith(".src.md"): continue

View File

@ -111,17 +111,6 @@ class TestOpCompiler(unittest.TestCase):
check("@{a^b == 7}", "2")
check("@{(a^b) == 7}", "1")
check("@{b<<a == 5*4}", "1")
check('''#include "ops/binary_op_defs.h"
#define OP1(a, b) a+b
OP1
@expand_macro(OP1,1,2)
@expand_macro(maximum, T, 1, 2)
@expand_macro(@OP,T,1,2)''',
''' #define OP1(a, b) a+b
OP1
1+2
std::max(T(1), T(2))
((1)+T(2)*(T(rcount)))''')
expect_error(lambda: jit_precompile(vars, "@{a"))
expect_error(lambda: jit_precompile(vars, "@for(a"))
expect_error(lambda: jit_precompile(vars, "@for(i,l,r)"))

View File

@ -0,0 +1,24 @@
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Maintainers: Dun Liang <randonlang@gmail.com>.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import unittest
import jittor as jt
import numpy as np
import os
class TestProfiler(unittest.TestCase):
def test_profiler(self):
a = jt.rand(1000,1000)
b = jt.rand(1000,1000)
jt.sync_all()
with jt.profile_scope(10, 100, profiler_record_peek=1) as rep:
jt.matmul(a, b).sync()
x = float(rep[-1][4])
y = float(rep[-2][4])
assert abs(x-y)/x < 1e-3
if __name__ == "__main__":
unittest.main()

View File

@ -36,19 +36,7 @@ class MnistNet(Module):
return x
@unittest.skipIf(skip_this_test, "skip_this_test")
class TestResnet(unittest.TestCase):
@classmethod
def setUpClass(self):
# hyper-parameters
self.batch_size = int(os.environ.get("TEST_BATCH_SIZE", "100"))
self.weight_decay = 0.0001
self.momentum = 0.9
self.learning_rate = 0.1
# mnist dataset
self.train_loader = MNIST(train=True, transform=trans.Resize(224)) \
.set_attrs(batch_size=self.batch_size, shuffle=True)
self.train_loader.num_workers = 4
class TestResnetFp32(unittest.TestCase):
# setup random seed
def setup_seed(self, seed):
np.random.seed(seed)
@ -59,6 +47,19 @@ class TestResnet(unittest.TestCase):
@jt.flag_scope(use_cuda=1, use_stat_allocator=1)
def test_resnet(self):
self.setup_seed(1)
# hyper-parameters
self.batch_size = int(os.environ.get("TEST_BATCH_SIZE", "100"))
self.weight_decay = 0.0001
self.momentum = 0.9
self.learning_rate = 0.1
if jt.flags.amp_reg:
self.learning_rate = 0.01
# mnist dataset
self.train_loader = MNIST(train=True, transform=trans.Resize(224)) \
.set_attrs(batch_size=self.batch_size, shuffle=True)
self.train_loader.num_workers = 4
loss_list=[]
acc_list=[]
mnist_net = MnistNet()
@ -70,6 +71,7 @@ class TestResnet(unittest.TestCase):
for data, target in self.train_loader:
batch_id = self.train_loader.batch_id
epoch_id = self.train_loader.epoch_id
data = data.float_auto()
# train step
# with jt.log_capture_scope(
@ -120,6 +122,8 @@ class TestResnet(unittest.TestCase):
# Train Epoch: 0 [40/100 (40%)] Loss: 2.286762 Acc: 0.130000
# Train Epoch: 0 [50/100 (50%)] Loss: 2.055014 Acc: 0.290000
if jt.flags.amp_reg:
continue
if jt.in_mpi:
assert jt.core.number_of_lived_vars() < 8100, jt.core.number_of_lived_vars()
else:
@ -131,5 +135,14 @@ class TestResnet(unittest.TestCase):
assert np.mean(loss_list[-50:])<0.5
assert np.mean(acc_list[-50:])>0.8
@unittest.skipIf(skip_this_test, "skip_this_test")
class TestResnetFp16(TestResnetFp32):
def setUp(self):
jt.flags.auto_mixed_precision_level = 5
def tearDown(self):
jt.flags.auto_mixed_precision_level = 0
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,121 @@
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Maintainers: Dun Liang <randonlang@gmail.com>.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import unittest
import jittor as jt
import numpy as np
import os
from jittor.test.misc import superglue
from jittor.test.misc.superglue import SuperGlue
import time
@jt.flag_scope(use_cuda=1)
def main():
global superglue
superglue.split_size = int(os.environ.get("split_size", "12"))
# superglue.split_size = 1000000
batch = 30
num = 2000
dim = 128
# jt.display_memory_info()
# os.system("nvidia-smi")
# breakpoint()
with jt.no_grad():
config = {
'superglue': {
'sinkhorn_iterations': 25,
'match_threshold': 0.01,
'keypoint_position_dim': 2,
'descriptor_dim': dim,
'use_dual_softmax': True,
'GNN_layers': ['self', 'cross'] * 9,
}
}
superglue = SuperGlue(config.get('superglue', {}))
superglue.eval()
data = {
'keypoints0': jt.rand((batch, num, 2), dtype=jt.float),
'keypoints1': jt.rand((batch, num, 2), dtype=jt.float),
'shape0': jt.rand((batch, 2), dtype=jt.float),
'shape1': jt.rand((batch, 2), dtype=jt.float),
'descriptors0': jt.rand((batch, dim, num), dtype=jt.float),
'descriptors1': jt.rand((batch, dim, num), dtype=jt.float),
'scores0': jt.rand((batch, num), dtype=jt.float),
'scores1': jt.rand((batch, num), dtype=jt.float),
'all_matches': jt.randint(0, num, (batch, num, 2), dtype=jt.int),
'return_match': False,
# 'match_num': match_num
}
use_fp16 = int(os.environ.get("use_fp16", "0"))
if use_fp16:
jt.flags.amp_reg = 2
for k,v in data.items():
if isinstance(v, jt.Var) and v.dtype == "float32":
v.assign(v.float16())
for v in superglue.parameters():
if v.dtype == "float32":
v.assign(v.float16())
jt.sync_all(True)
import pickle
jt.sync_all(True)
for x in range(5):
print(x)
jt.gc()
x = superglue(data)['loss']
x.sync()
jt.display_memory_info()
# os.system("nvidia-smi")
# breakpoint()
# print(data)
# print(x)
# with open("/tmp/record.pkl", "wb") as f:
# pickle.dump([data, x], f, pickle.HIGHEST_PROTOCOL)
# with jt.flag_scope(trace_py_var=3, profile_memory_enable=1):
# x = superglue(data)['loss']
# x.sync()
# jt.get_max_memory_treemap()
# exit(0)
jt.sync_all(True)
time0 = time.time()
jt.flags.profiler_enable = int(os.environ.get("profiler", "0"))
for x in range(20):
print(x)
# jt.display_memory_info()
x = superglue(data)['loss']
x.sync()
# print(x)
jt.sync_all(True)
time1 = time.time()
print("avg time:", (time1 - time0) / 20)
return (time1 - time0) / 20
class TestSuperglue(unittest.TestCase):
def test(self):
if not jt.has_cuda: return
t1 = main()
os.environ["use_fp16"] = "1"
t2 = main()
os.environ["use_fp16"] = "0"
assert t1*0.55 > t2
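# i.e. the fp16 run must take less than 55% of the fp32 time, a speedup of
# roughly 1.8x or better.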
if __name__ == "__main__":
unittest.main()

View File

@ -17,7 +17,8 @@ def check(op, *args):
x = convert(x)
y = convert(y)
# str match nan and inf
assert x.dtype == y.dtype and x.shape == y.shape
assert x.dtype == y.dtype and x.shape == y.shape, \
(x.dtype, y.dtype, x.shape, y.shape)
for a,b in zip(x.flatten(), y.flatten()):
assert str(a)[:5] == str(b)[:5], (a,b)
@ -32,9 +33,10 @@ class TestUnaryOp(unittest.TestCase):
check("logical_not", a)
check("bitwise_not", a)
b = np.array([1.1, 2.2, 3.3, 4.4, -1, 0])
check("log", a.astype("float32"))
check("exp", a.astype("float32"))
check("sqrt", a.astype("float32"))
type = "float16" if (jt.flags.amp_reg & 2) else "float32"
check("log", a.astype(type))
check("exp", a.astype(type))
check("sqrt", a.astype(type))
def test_grad(self):
ops = ["abs", "negative", "log", "exp", "sqrt",
@ -60,7 +62,8 @@ class TestUnaryOp(unittest.TestCase):
ja = jt.array(b)
jb = eval(f"jt.{op}(ja)")
jda = jt.grad(jb, ja)
assert (np.allclose(jda.data, da)), (jda.data,da,op)
tol = 1e-2 if jt.flags.amp_reg & 2 else 1e-6
assert (np.allclose(jda.data, da, atol=tol, rtol=tol)), (jda.data,da,op)
def test_sigmoid(self):
a = np.arange(-150,150, 10).astype("float32")
@ -92,11 +95,26 @@ class TestUnaryOp(unittest.TestCase):
np.testing.assert_allclose(y.data, y2.data)
d = jt.grad(x2, y2)
_, (dn,) = ngrad(lambda y: special.erfinv(y).sum(), [y], 1e-8)
np.testing.assert_allclose(d.data, dn, atol=1e-6, rtol=1e-6)
tol = 1e-3 if jt.flags.amp_reg & 2 else 1e-6
np.testing.assert_allclose(d.data, dn, atol=tol, rtol=tol)
class TestUnaryOpCuda(TestUnaryOp, test_cuda(2)):
pass
class TestUnaryOpCpuFp16(TestUnaryOp):
def setUp(self):
jt.flags.amp_reg = 2 | 4 | 8 | 16
def tearDown(self):
jt.flags.amp_reg = 0
class TestUnaryOpCudaFp16(TestUnaryOp, test_cuda(2)):
def setUp(self):
jt.flags.amp_reg = 2 | 4 | 8 | 16
jt.flags.use_cuda = 1
def tearDown(self):
jt.flags.amp_reg = 0
jt.flags.use_cuda = 0
if __name__ == "__main__":
unittest.main()

View File

@ -15,8 +15,8 @@ def find_jittor_path():
return path[:-len(suffix)] + ".."
def find_cache_path():
from pathlib import Path
path = str(Path.home())
import jittor_utils as jit_utils
path = jit_utils.home()
dirs = [".cache", "jittor"]
for d in dirs:
path = os.path.join(path, d)

Binary file not shown.

Some files were not shown because too many files have changed in this diff.