diff --git a/extern/cuda/cub/ops/cub_arg_reduce_op.cc b/extern/cuda/cub/ops/cub_arg_reduce_op.cc
index 2a366c8f..1dd1fe8b 100644
--- a/extern/cuda/cub/ops/cub_arg_reduce_op.cc
+++ b/extern/cuda/cub/ops/cub_arg_reduce_op.cc
@@ -87,7 +87,7 @@ void CubArgReduceOp::jit_run() {
         num_segments *= x->shape[i];
     }
     size_t allocation_dout;
-    cub::KeyValuePair *d_out = (cub::KeyValuePair *)exe.allocator->alloc(sizeof(cub::KeyValuePair) * num_segments, allocation_dout);
+    cub::KeyValuePair *d_out = (cub::KeyValuePair *)exe.temp_allocator->alloc(sizeof(cub::KeyValuePair) * num_segments, allocation_dout);
     // Determine temporary device storage requirements
     void *d_temp_storage = NULL;
@@ -96,7 +96,7 @@ void CubArgReduceOp::jit_run() {
         xp, d_out, num_segments, offsetsp, offsetsp + 1);
     // Allocate temporary storage
     size_t allocation;
-    d_temp_storage = exe.allocator->alloc(temp_storage_bytes, allocation);
+    d_temp_storage = exe.temp_allocator->alloc(temp_storage_bytes, allocation);
     // Run sorting operation
     cub::DeviceSegmentedReduce::@FUNC@@(d_temp_storage, temp_storage_bytes,
         xp, d_out, num_segments, offsetsp, offsetsp + 1);
@@ -105,8 +105,8 @@ void CubArgReduceOp::jit_run() {
     auto* __restrict__ y_keyp = y_key->ptr();
     split<<>>(d_out, y_keyp, yp, num_segments);
-    exe.allocator->free(d_temp_storage, temp_storage_bytes, allocation);
-    exe.allocator->free(d_out, sizeof(cub::KeyValuePair) * num_segments, allocation_dout);
+    exe.temp_allocator->free(d_temp_storage, temp_storage_bytes, allocation);
+    exe.temp_allocator->free(d_out, sizeof(cub::KeyValuePair) * num_segments, allocation_dout);
 }
 #endif // JIT_cuda
 #endif // JIT
diff --git a/extern/cuda/cub/ops/cub_argsort_op.cc b/extern/cuda/cub/ops/cub_argsort_op.cc
index 4a28ae65..f1e2a0c4 100644
--- a/extern/cuda/cub/ops/cub_argsort_op.cc
+++ b/extern/cuda/cub/ops/cub_argsort_op.cc
@@ -85,12 +85,12 @@ void CubArgsortOp::jit_run() {
         num_items, num_segments, offsetsp, offsetsp + 1);
     // Allocate temporary storage
     size_t allocation;
-    d_temp_storage = exe.allocator->alloc(temp_storage_bytes, allocation);
+    d_temp_storage = exe.temp_allocator->alloc(temp_storage_bytes, allocation);
     // Run sorting operation
     cub::DeviceSegmentedRadixSort::@FUNC@@(d_temp_storage, temp_storage_bytes,
         xp, y_keyp, indexesp, yp,
         num_items, num_segments, offsetsp, offsetsp + 1);
-    exe.allocator->free(d_temp_storage, temp_storage_bytes, allocation);
+    exe.temp_allocator->free(d_temp_storage, temp_storage_bytes, allocation);
 }
 #endif // JIT_cuda
 #endif // JIT
diff --git a/extern/cuda/cub/ops/cub_where_op.cc b/extern/cuda/cub/ops/cub_where_op.cc
index 09c659eb..d31144e2 100644
--- a/extern/cuda/cub/ops/cub_where_op.cc
+++ b/extern/cuda/cub/ops/cub_where_op.cc
@@ -82,7 +82,7 @@ void CubWhereOp::jit_run(){
     int N = cond->num;
     size_t temp_storage_bytes=0;
     size_t num_nonzeros_allocation;
-    auto num_nonzeros = exe.allocator->alloc(sizeof(To), num_nonzeros_allocation);
+    auto num_nonzeros = exe.temp_allocator->alloc(sizeof(To), num_nonzeros_allocation);
     size_t temp_storage_allocation;
     void* temp_storage;
@@ -93,9 +93,9 @@ void CubWhereOp::jit_run(){
     cub::TransformInputIterator, Ti*> itr(cond->ptr(), NonZeroOp());
     temp_storage_bytes = 0;
     checkCudaErrors(cub::DeviceSelect::Flagged(nullptr, temp_storage_bytes, counting_itr, itr, out_temp, (To*)num_nonzeros, N));
-    temp_storage = exe.allocator->alloc(temp_storage_bytes, temp_storage_allocation);
+    temp_storage = exe.temp_allocator->alloc(temp_storage_bytes, temp_storage_allocation);
     checkCudaErrors(cub::DeviceSelect::Flagged(temp_storage, temp_storage_bytes, counting_itr, itr, out_temp, (To*)num_nonzeros, N));
-    exe.allocator->free(temp_storage, temp_storage_bytes, temp_storage_allocation);
+    exe.temp_allocator->free(temp_storage, temp_storage_bytes, temp_storage_allocation);
     To num_nonzeros_h;
     cudaMemcpy(&num_nonzeros_h, num_nonzeros, sizeof(To), cudaMemcpyDeviceToHost);
@@ -110,7 +110,7 @@ void CubWhereOp::jit_run(){
         @for(i, 0, NDIM, 1, , cond->shape[@i], outs[@i]->ptr())
     );
     }
-    exe.allocator->free(num_nonzeros, sizeof(int), num_nonzeros_allocation);
+    exe.temp_allocator->free(num_nonzeros, sizeof(int), num_nonzeros_allocation);
 }
 #endif
diff --git a/extern/cuda/cudnn/ops/cudnn_conv_backward_w_op.cc b/extern/cuda/cudnn/ops/cudnn_conv_backward_w_op.cc
index ccf6bc47..3e82c9bb 100644
--- a/extern/cuda/cudnn/ops/cudnn_conv_backward_w_op.cc
+++ b/extern/cuda/cudnn/ops/cudnn_conv_backward_w_op.cc
@@ -203,7 +203,7 @@ void CudnnConvBackwardWOp::jit_run() {
         if (CUDNN_STATUS_SUCCESS == ret && sz > max_ws_size) max_ws_size = sz;
     }
     size_t allocation;
-    void* ws = exe.allocator->alloc(max_ws_size, allocation);
+    void* ws = exe.temp_allocator->alloc(max_ws_size, allocation);
     checkCudaErrors(cudnnFindConvolutionBackwardFilterAlgorithmEx(
         handle_,
         cudnnIdesc, x->ptr(),
@@ -215,7 +215,7 @@ void CudnnConvBackwardWOp::jit_run() {
         perf_results,
         ws,
         max_ws_size));
-    exe.allocator->free(ws, max_ws_size, allocation);
+    exe.temp_allocator->free(ws, max_ws_size, allocation);
     } else {
     checkCudaErrors(cudnnGetConvolutionBackwardFilterAlgorithm_v7(
         handle_,
@@ -250,7 +250,7 @@ void CudnnConvBackwardWOp::jit_run() {
         cudnnFdesc, algo, &workSpaceSize));
     size_t allocation;
     if (workSpaceSize > 0) {
-        workSpace = exe.allocator->alloc(workSpaceSize, allocation);
+        workSpace = exe.temp_allocator->alloc(workSpaceSize, allocation);
     }
     float alpha=1, beta=0;
     checkCudaErrors(cudnnConvolutionBackwardFilter(
@@ -265,7 +265,7 @@ void CudnnConvBackwardWOp::jit_run() {
         cudnnFdesc, w->ptr())
     );
     if (workSpace)
-        exe.allocator->free(workSpace, workSpaceSize, allocation);
+        exe.temp_allocator->free(workSpace, workSpaceSize, allocation);
     checkCudaErrors(cudnnDestroyTensorDescriptor( cudnnIdesc ));
     checkCudaErrors(cudnnDestroyFilterDescriptor( cudnnFdesc ));
diff --git a/extern/cuda/cudnn/ops/cudnn_conv_backward_x_op.cc b/extern/cuda/cudnn/ops/cudnn_conv_backward_x_op.cc
index bb3e5eaa..a604f679 100644
--- a/extern/cuda/cudnn/ops/cudnn_conv_backward_x_op.cc
+++ b/extern/cuda/cudnn/ops/cudnn_conv_backward_x_op.cc
@@ -204,7 +204,7 @@ void CudnnConvBackwardXOp::jit_run() {
         if (CUDNN_STATUS_SUCCESS == ret && sz > max_ws_size) max_ws_size = sz;
     }
     size_t allocation;
-    void* ws = exe.allocator->alloc(max_ws_size, allocation);
+    void* ws = exe.temp_allocator->alloc(max_ws_size, allocation);
     checkCudaErrors(cudnnFindConvolutionBackwardDataAlgorithmEx(
         handle_,
         cudnnFdesc, w->ptr(),
@@ -216,7 +216,7 @@ void CudnnConvBackwardXOp::jit_run() {
         perf_results,
         ws,
         max_ws_size));
-    exe.allocator->free(ws, max_ws_size, allocation);
+    exe.temp_allocator->free(ws, max_ws_size, allocation);
     } else {
     checkCudaErrors(cudnnGetConvolutionBackwardDataAlgorithm_v7(
         handle_,
@@ -251,7 +251,7 @@ void CudnnConvBackwardXOp::jit_run() {
         cudnnIdesc, algo, &workSpaceSize));
     size_t allocation;
     if (workSpaceSize > 0) {
-        workSpace = exe.allocator->alloc(workSpaceSize, allocation);
+        workSpace = exe.temp_allocator->alloc(workSpaceSize, allocation);
     }
     float alpha=1, beta=0;
     checkCudaErrors(cudnnConvolutionBackwardData(
@@ -266,7 +266,7 @@ void CudnnConvBackwardXOp::jit_run() {
         cudnnIdesc, x->ptr())
     );
     if (workSpace)
-        exe.allocator->free(workSpace, workSpaceSize, allocation);
+        exe.temp_allocator->free(workSpace, workSpaceSize, allocation);
     checkCudaErrors(cudnnDestroyTensorDescriptor( cudnnIdesc ));
     checkCudaErrors(cudnnDestroyFilterDescriptor( cudnnFdesc ));
diff --git a/extern/cuda/cudnn/ops/cudnn_conv_op.cc b/extern/cuda/cudnn/ops/cudnn_conv_op.cc
index e84ee7e6..7f3305ed 100644
--- a/extern/cuda/cudnn/ops/cudnn_conv_op.cc
+++ b/extern/cuda/cudnn/ops/cudnn_conv_op.cc
@@ -208,7 +208,7 @@ void CudnnConvOp::jit_run() {
         if (CUDNN_STATUS_SUCCESS == ret && sz > max_ws_size) max_ws_size = sz;
     }
     size_t allocation;
-    void* ws = exe.allocator->alloc(max_ws_size, allocation);
+    void* ws = exe.temp_allocator->alloc(max_ws_size, allocation);
     checkCudaErrors(cudnnFindConvolutionForwardAlgorithmEx(
         handle_,
         cudnnIdesc, x->ptr(),
@@ -220,7 +220,7 @@ void CudnnConvOp::jit_run() {
         perf_results,
         ws,
         max_ws_size));
-    exe.allocator->free(ws, max_ws_size, allocation);
+    exe.temp_allocator->free(ws, max_ws_size, allocation);
     } else {
     checkCudaErrors(cudnnGetConvolutionForwardAlgorithm_v7(
         handle_,
@@ -255,7 +255,7 @@ void CudnnConvOp::jit_run() {
         cudnnOdesc, algo, &workSpaceSize) );
     size_t allocation;
     if (workSpaceSize > 0) {
-        workSpace = exe.allocator->alloc(workSpaceSize, allocation);
+        workSpace = exe.temp_allocator->alloc(workSpaceSize, allocation);
     }
     float alpha=1, beta=0;
     checkCudaErrors(cudnnConvolutionForward(
@@ -270,7 +270,7 @@ void CudnnConvOp::jit_run() {
         cudnnOdesc, y->ptr())
     );
     if (workSpace)
-        exe.allocator->free(workSpace, workSpaceSize, allocation);
+        exe.temp_allocator->free(workSpace, workSpaceSize, allocation);
     checkCudaErrors(cudnnDestroyTensorDescriptor( cudnnIdesc ));
     checkCudaErrors(cudnnDestroyFilterDescriptor( cudnnFdesc ));
diff --git a/python/jittor/dataset/dataset.py b/python/jittor/dataset/dataset.py
index e3a54618..e08200d2 100644
--- a/python/jittor/dataset/dataset.py
+++ b/python/jittor/dataset/dataset.py
@@ -128,7 +128,8 @@ class Dataset(object):
                 if self.stop_grad else jt.array(x)
         if isinstance(batch, np.ndarray):
             return to_jt(batch)
-        assert isinstance(batch, Sequence)
+        if not isinstance(batch, (list, tuple)):
+            return batch
         new_batch = []
         for a in batch:
             if isinstance(a, np.ndarray) or \
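
A quick sketch of the collate change above — non-sequence batches now pass through untouched instead of tripping the removed assert (the dict value here is illustrative, not from the diff):

    from jittor.dataset import Dataset

    ds = Dataset()
    batch = ds.collate_batch({"label": 1})   # previously raised AssertionError
    assert batch == {"label": 1}             # returned as-is now
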
diff --git a/python/jittor/lr_scheduler.py b/python/jittor/lr_scheduler.py
index e7ee70ef..ab771782 100644
--- a/python/jittor/lr_scheduler.py
+++ b/python/jittor/lr_scheduler.py
@@ -84,4 +84,58 @@ class ReduceLROnPlateau(object):
             save = self.threshold + 1.0
             return a > b * save
         else:
-            return a > b + self.threshold
\ No newline at end of file
+            return a > b + self.threshold
+
+class CosineAnnealingLR(object):
+    def __init__(self, optimizer, T_max, eta_min=0, last_epoch=-1):
+        self.T_max = T_max
+        self.eta_min = eta_min
+        self.optimizer = optimizer
+        self.last_epoch = last_epoch
+        self.base_lr = optimizer.lr
+        #TODO set last_epoch is not ready
+
+    def get_lr(self):
+        if self.last_epoch == 0:
+            return self.base_lr
+        now_lr = self.optimizer.lr
+        if (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0:
+            return (now_lr + (self.base_lr - self.eta_min) *
+                    (1 - math.cos(math.pi / self.T_max)) / 2)
+        return ((1 + math.cos(math.pi * self.last_epoch / self.T_max)) /
+                (1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)) *
+                (now_lr - self.eta_min) + self.eta_min)
+
+    def step(self):
+        self.last_epoch += 1
+        self.update_lr(self.get_lr())
+
+    def update_lr(self, new_lr):
+        self.optimizer.lr = new_lr
+        for i, param_group in enumerate(self.optimizer.param_groups):
+            if param_group.get("lr") != None:
+                param_group["lr"] = new_lr
+
+class MultiStepLR(object):
+    def __init__(self, optimizer, milestones=[], gamma=0.1, last_epoch=-1):
+        self.optimizer = optimizer
+        self.milestones = milestones
+        self.gamma = gamma
+        self.last_epoch = last_epoch
+        #TODO set last_epoch is not ready
+
+    def get_lr(self):
+        now_lr = self.optimizer.lr
+        if (self.last_epoch in self.milestones):
+            now_lr *= self.gamma
+        return now_lr
+
+    def step(self):
+        self.last_epoch += 1
+        self.update_lr(self.get_lr())
+
+    def update_lr(self, new_lr):
+        self.optimizer.lr = new_lr
+        for i, param_group in enumerate(self.optimizer.param_groups):
+            if param_group.get("lr") != None:
+                param_group["lr"] = new_lr
\ No newline at end of file
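
A usage sketch for the two new schedulers (the model and optimizer here are placeholders; any jittor optimizer exposing `lr` and `param_groups` works):

    import jittor as jt
    from jittor import nn
    from jittor.lr_scheduler import CosineAnnealingLR, MultiStepLR

    model = nn.Linear(10, 2)
    opt = nn.SGD(model.parameters(), 0.1)
    scheduler = CosineAnnealingLR(opt, T_max=50)
    # scheduler = MultiStepLR(opt, milestones=[30, 80], gamma=0.1)
    for epoch in range(100):
        # ... train one epoch ...
        scheduler.step()   # recomputes opt.lr and any per-group "lr"
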
diff --git a/python/jittor/misc.py b/python/jittor/misc.py
index 2f7f0d5b..2345a864 100644
--- a/python/jittor/misc.py
+++ b/python/jittor/misc.py
@@ -3,7 +3,7 @@
 # Maintainers:
 #     Dun Liang .
 #     Wenyang Zhou <576825820@qq.com>
-#
+#     Guoye Yang <498731903@qq.com>
 #
 # This file is subject to the terms and conditions defined in
 # file 'LICENSE.txt', which is part of this source code package.
@@ -77,7 +77,7 @@ def repeat(x, *shape):
         x = x.broadcast(x_shape)
     elif len_x_shape > len_shape:
         rep_shape = (len_x_shape - len_shape) * [1] + shape
-
+    #TODO if input.shape[i]=1, no add [1]
     reshape_shape = []
     broadcast_shape = []
     for x_s,r_s in zip(x_shape,rep_shape):
@@ -722,6 +722,123 @@ def triu_(x,diagonal=0):
 jt.Var.triu_ = triu_
 
+def print_tree(now, max_memory_size, prefix1, prefix2, build_by):
+    def format_size(s):
+        if (s < 1024):
+            s = str(s)
+            return s + ' B'
+
+        if (s < 1024*1024):
+            s = format(s/1024, '.2f')
+            return s + ' KB'
+
+        if (s < 1024*1024*1024):
+            s = format(s/1024/1024, '.2f')
+            return s + ' MB'
+
+        s = format(s/1024/1024/1024, '.2f')
+        return s + ' GB'
+
+    out = ''
+    tab = '   '
+    out += prefix1+now['name']+'('+now['type']+')\n'
+    out += prefix2+'['+format_size(now['size'])+'; '+format(now['size']/max_memory_size*100, '.2f')+'%]\n'
+    if (build_by == 0):
+        for p in now['path']:
+            out += prefix2+p+'\n'
+    else:
+        out += prefix2+now['path'] + '\n'
+    if (len(now['children']) > 0):
+        out += prefix2 + tab + '| ' + '\n'
+    else:
+        out += prefix2 + '\n'
+    for i in range(len(now['children'])):
+        c = now['children'][i]
+        if i < len(now['children']) - 1:
+            prefix1_ = prefix2 + tab + '├─'
+            prefix2_ = prefix2 + tab + '| '
+        else:
+            prefix1_ = prefix2 + tab + '└─'
+            prefix2_ = prefix2 + tab + '  '
+        out += print_tree(c, max_memory_size, prefix1_, prefix2_, build_by)
+    return out
+
+def get_max_memory_treemap(build_by=0, do_print=True):
+    div1 = "[!@#div1!@#]"
+    div2 = "[!@#div2!@#]"
+    div3 = "[!@#div3!@#]"
+    info = jt.get_max_memory_info()
+
+    vars = []
+    vars_ = info.split(div1)
+    max_memory_size = int(vars_[0])
+    vars_ = vars_[1:]
+    for v_ in vars_:
+        v__ = v_.split(div2)
+        var = {'size':int(v__[1]), 'stack':[]}
+        v__ = v__[2:-1]
+        for s_ in v__:
+            s__ = s_.split(div3)
+            s = {'path':s__[0], 'name':s__[1], 'type':s__[2]}
+            var['stack'].append(s)
+        vars.append(var)
+    if (build_by == 0): # build tree by name
+        tree = {'name':'root', "children":[], 'size':0, 'path':[], 'type':''}
+
+        def find_child(now, key):
+            for c in now['children']:
+                if (c['name'] == key):
+                    return c
+            return None
+        for v in vars:
+            now = tree
+            now['size'] += v['size']
+            for s in v['stack']:
+                ch = find_child(now, s['name'])
+                if (ch is not None):
+                    if (not s['path'] in ch['path']):
+                        ch['path'].append(s['path'])
+                    assert(ch['type']==s['type'])
+                    now = ch
+                    now['size'] += v['size']
+                else:
+                    now_ = {'name':s['name'], "children":[], 'size':v['size'], 'path':[s['path']], 'type':s['type']}
+                    now['children'].append(now_)
+                    now = now_
+    elif (build_by == 1): # build tree by path
+        tree = {'name':'root', "children":[], 'size':0, 'path':'_root_', 'type':''}
+
+        def find_child(now, key):
+            for c in now['children']:
+                if (c['path'] == key):
+                    return c
+            return None
+        for v in vars:
+            now = tree
+            now['size'] += v['size']
+            for s in v['stack']:
+                ch = find_child(now, s['path'])
+                if (ch is not None):
+                    now = ch
+                    now['size'] += v['size']
+                else:
+                    now_ = {'name':s['name'], "children":[], 'size':v['size'], 'path':s['path'], 'type':s['type']}
+                    now['children'].append(now_)
+                    now = now_
+    else:
+        assert(False)
+
+    def sort_tree(now):
+        def takeSize(elem):
+            return elem['size']
+        now['children'].sort(key=takeSize, reverse=True)
+        for c in now['children']:
+            sort_tree(c)
+    sort_tree(tree)
+    out = print_tree(tree, max_memory_size, '', '', build_by)
+    if (do_print):
+        print(out)
+    return tree, out
+
 def python_pass_warper(mod_func, args, kw):
     import importlib
     mod, func = mod_func.rsplit(".", 1)
diff --git a/python/jittor/models/resnet.py b/python/jittor/models/resnet.py
index bb758526..66da5232 100644
--- a/python/jittor/models/resnet.py
+++ b/python/jittor/models/resnet.py
@@ -12,8 +12,8 @@
 import jittor as jt
 from jittor import nn
 
-__all__ = ['ResNet', 'Resnet18', 'Resnet34', 'Resnet50', 'Resnet101', 'Resnet152', 'Resnext50_32x4d', 'Resnext101_32x8d', 'Wide_resnet50_2', 'Wide_resnet101_2',
-        'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d', 'wide_resnet50_2', 'wide_resnet101_2']
+__all__ = ['ResNet', 'Resnet18', 'Resnet34', 'Resnet26', 'Resnet38', 'Resnet50', 'Resnet101', 'Resnet152', 'Resnext50_32x4d', 'Resnext101_32x8d', 'Wide_resnet50_2', 'Wide_resnet101_2',
+        'resnet18', 'resnet34', 'resnet26', 'resnet38', 'resnet50', 'resnet101', 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d', 'wide_resnet50_2', 'wide_resnet101_2']
 
 def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
     conv=nn.Conv(in_planes, out_planes, kernel_size=3, stride=stride, padding=dilation, groups=groups, bias=False, dilation=dilation)
@@ -174,6 +174,14 @@ def Resnet50(pretrained=False, **kwargs):
 resnet50 = Resnet50
 
+def Resnet38(**kwargs):
+    return _resnet(Bottleneck, [2, 3, 5, 2], **kwargs)
+resnet38 = Resnet38
+
+def Resnet26(**kwargs):
+    return _resnet(Bottleneck, [1, 2, 4, 1], **kwargs)
+resnet26 = Resnet26
+
 def Resnet101(pretrained=False, **kwargs):
     """
     ResNet-101 model architecture.
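
The new treemap helpers are driven by the flags exercised in test_memory_profiler.py below; a usage sketch (model construction elided):

    import jittor as jt

    with jt.flag_scope(trace_py_var=3, profile_memory_enable=1):
        # ... build a model and run a few iterations ...
        jt.sync_all(True)
        jt.display_max_memory_info()
        tree, out = jt.get_max_memory_treemap()            # build_by=0: group by module name
        tree, out = jt.get_max_memory_treemap(build_by=1)  # group by call path
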
diff --git a/python/jittor/test/test_memory_profiler.py b/python/jittor/test/test_memory_profiler.py
new file mode 100644
index 00000000..3ad49127
--- /dev/null
+++ b/python/jittor/test/test_memory_profiler.py
@@ -0,0 +1,97 @@
+# ***************************************************************
+# Copyright (c) 2021 Jittor. All Rights Reserved.
+# Maintainers:
+#     Guoye Yang <498731903@qq.com>
+#     Dun Liang .
+#
+# This file is subject to the terms and conditions defined in
+# file 'LICENSE.txt', which is part of this source code package.
+# ***************************************************************
+import jittor as jt
+from jittor import nn, Module
+from jittor.models import resnet
+import numpy as np
+import sys, os
+import random
+import math
+import unittest
+from jittor.test.test_reorder_tuner import simple_parser
+from jittor.test.test_log import find_log_with_re
+from jittor.dataset.mnist import MNIST
+import jittor.transform as trans
+import time
+
+skip_this_test = False
+
+class MnistNet(Module):
+    def __init__(self):
+        self.model = resnet.Resnet18()
+        self.layer = nn.Linear(1000, 10)
+
+    def execute(self, x):
+        x = self.model(x)
+        x = self.layer(x)
+        return x
+
+@unittest.skipIf(skip_this_test, "skip_this_test")
+class TestMemoryProfiler(unittest.TestCase):
+    @classmethod
+    def setUpClass(self):
+        # hyper-parameters
+        self.batch_size = 100
+        self.weight_decay = 0.0001
+        self.momentum = 0.9
+        self.learning_rate = 0.1
+        # mnist dataset
+        self.train_loader = MNIST(train=True, transform=trans.Resize(224)) \
+            .set_attrs(batch_size=self.batch_size, shuffle=True)
+        self.train_loader.num_workers = 4
+
+    # setup random seed
+    def setup_seed(self, seed):
+        np.random.seed(seed)
+        random.seed(seed)
+        jt.seed(seed)
+
+    @unittest.skipIf(not jt.has_cuda, "Cuda not found")
+    @jt.flag_scope(use_cuda=1, use_stat_allocator=1, trace_py_var=3, profile_memory_enable=1)
+    def test_resnet(self):
+        self.setup_seed(1)
+        loss_list=[]
+        acc_list=[]
+        mnist_net = MnistNet()
+        global prev
+        prev = time.time()
+        SGD = nn.SGD(mnist_net.parameters(), self.learning_rate, self.momentum, self.weight_decay)
+
+        iters = 10
+        for batch_idx, (data, target) in enumerate(self.train_loader):
+            if (batch_idx > iters):
+                break
+            jt.display_memory_info()
+            output = mnist_net(data)
+            loss = nn.cross_entropy_loss(output, target)
+            SGD.step(loss)
+            def callback(batch_idx, loss, output, target):
+                global prev
+                pred = np.argmax(output, axis=1)
+                acc = np.mean(target==pred)
+                loss_list.append(loss[0])
+                acc_list.append(acc)
+                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAcc: {:.6f} \tTime:{:.3f}'
+                    .format(0, batch_idx, iters, 1. * batch_idx / 6.0, loss[0], acc, time.time()-prev))
+            jt.fetch(batch_idx, loss, output, target, callback)
+        jt.sync_all(True)
+        jt.display_max_memory_info()
+        _, out = jt.get_max_memory_treemap()
+        out_ = out.split('\n')
+        assert(out_[0] == 'root()')
+        assert(out_[3].endswith('(_run_module_as_main)'))
+        assert(out_[7].endswith('(_run_code)'))
+        _, out = jt.get_max_memory_treemap(build_by=1)
+        out_ = out.split('\n')
+        assert(out_[0] == 'root()')
+        assert(out_[4].endswith('(_run_module_as_main)'))
+        assert(out_[8].endswith('(_run_code)'))
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/src/executor.cc b/src/executor.cc
index cec8e898..3f1b87f4 100644
--- a/src/executor.cc
+++ b/src/executor.cc
@@ -1,6 +1,9 @@
 // ***************************************************************
 // Copyright (c) 2021 Jittor. All Rights Reserved.
-// Maintainers: Dun Liang .
+// Maintainers:
+//     Dun Liang .
+//     Guoye Yang <498731903@qq.com>
+//
 // This file is subject to the terms and conditions defined in
 // file 'LICENSE.txt', which is part of this source code package.
 // ***************************************************************
@@ -22,11 +25,14 @@
 #include "fuser.h"
 #include "profiler/profiler_guard.h"
 #include "parallel_compiler.h"
+#include "memory_profiler.h"
 #include "misc/nan_checker.h"
 
 namespace jittor {
 
 Executor exe;
+extern MemoryProfiler memory_profiler;
+DECLARE_FLAG(int, profile_memory_enable);
 
 // from fetch_op.cc
 extern list<VarPtr> fetcher_to_free;
@@ -90,7 +96,9 @@ void load_fused_op(FusedOp& fused_op, vector<int>& fuse_ops, vector<Op*>& ops, i
 
 void Executor::run_sync(vector<Var*> vars, bool device_sync) {
     auto allocator = get_allocator();
+    auto temp_allocator = get_allocator(true);
     this->allocator = allocator;
+    this->temp_allocator = temp_allocator;
     // bfs find all ops need to run
     int op_num = 0;
     vector<Node*> bfs_q;
@@ -420,6 +428,8 @@ void Executor::run_sync(vector<Var*> vars, bool device_sync) {
         for (auto* var : op->outputs()) {
             var->alloc(allocator);
         }
+        if (PREDICT_BRANCH_NOT_TAKEN(profile_memory_enable))
+            memory_profiler.check();
         LOGvvv << "Run" << op << "inputs:" << op->inputs() << "outputs:" << op->outputs();
         op->do_prepare(jkl);
         bool is_cuda = op->flags.get(NodeFlags::_cuda);
diff --git a/src/executor.h b/src/executor.h
index 0876f47d..5c924e98 100644
--- a/src/executor.h
+++ b/src/executor.h
@@ -1,6 +1,9 @@
 // ***************************************************************
-// Copyright (c) 2021 Jittor. All Rights Reserved.
-// Maintainers: Dun Liang .
+// Copyright (c) 2021 Jittor. All Rights Reserved.
+// Maintainers:
+//     Dun Liang .
+//     Guoye Yang <498731903@qq.com>
+//
 // This file is subject to the terms and conditions defined in
 // file 'LICENSE.txt', which is part of this source code package.
 // ***************************************************************
@@ -16,6 +19,7 @@ namespace jittor {
 
 struct Executor {
     Allocator* allocator;
+    Allocator* temp_allocator;
     bool last_is_cuda = false;
     void run_sync(vector<Var*> vars, bool device_sync);
 };
diff --git a/src/mem/allocator.cc b/src/mem/allocator.cc
index 8a9d09d8..00293da2 100644
--- a/src/mem/allocator.cc
+++ b/src/mem/allocator.cc
@@ -15,6 +15,7 @@
 #include "mem/allocator/stat_allocator.h"
 #include "mem/allocator/sfrl_allocator.h"
 #include "mem/allocator/nfef_allocator.h"
+#include "mem/allocator/temp_allocator.h"
 
 namespace jittor {
 
@@ -46,7 +47,7 @@ Allocator* setup_allocator(Allocator* underlying) {
 
 Allocator* cpu_allocator = setup_allocator(&aligned_allocator);
 
-Allocator* get_allocator() {
+Allocator* get_allocator(bool temp_allocator) {
     Allocator* allocator = nullptr;
 #ifdef HAS_CUDA
     if (use_cuda && !allocator) {
@@ -72,7 +73,10 @@ Allocator* get_allocator() {
         allocator = setup_allocator(allocator);
         return allocator;
     }
-    if (use_sfrl_allocator) {
+    if (temp_allocator && use_temp_allocator) {
+        LOGvv << "Using temp_allocator";
+        allocator = setup_allocator<TempAllocator>(allocator);
+    } else if (use_sfrl_allocator) {
         LOGvv << "Using sfrl_allocator";
         allocator = setup_allocator<SFRLAllocator>(allocator);
     }
diff --git a/src/mem/allocator.h b/src/mem/allocator.h
index 797b7970..bb96f3b9 100644
--- a/src/mem/allocator.h
+++ b/src/mem/allocator.h
@@ -49,7 +49,7 @@ struct Allocation {
 };
 
 extern Allocator* cpu_allocator;
-Allocator* get_allocator();
+Allocator* get_allocator(bool temp_allocator=false);
 
 // @pyjt(gc)
 void gc_all();
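
With the change above, `get_allocator(true)` returns the temp-allocator chain whenever the new `use_temp_allocator` flag is set (it defaults to 1 in temp_allocator.cc below). Assuming the usual automatic flag binding, it can be toggled from Python:

    import jittor as jt
    jt.flags.use_temp_allocator = 0   # temp buffers fall back to the sfrl allocator
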
diff --git a/src/mem/allocator/temp_allocator.cc b/src/mem/allocator/temp_allocator.cc
new file mode 100644
index 00000000..b6b2c5a9
--- /dev/null
+++ b/src/mem/allocator/temp_allocator.cc
@@ -0,0 +1,116 @@
+// ***************************************************************
+// Copyright (c) 2021 Jittor. All Rights Reserved.
+// Maintainers:
+//     Guoye Yang <498731903@qq.com>
+//     Dun Liang .
+//
+// This file is subject to the terms and conditions defined in
+// file 'LICENSE.txt', which is part of this source code package.
+// ***************************************************************
+
+#include "mem/allocator/temp_allocator.h"
+
+namespace jittor {
+
+DEFINE_FLAG(int, use_temp_allocator, 1, "Enable temp allocator");
+
+TempAllocator::~TempAllocator() {
+    while (!cached_blocks.empty()) {
+        auto it = cached_blocks.begin();
+        TempCachingBlock* block = it->second;
+        cached_blocks.erase(it);
+        delete block;
+    }
+}
+
+const char* TempAllocator::name() const {return "temp";}
+
+void TempAllocator::setup(Allocator* underlying) {
+    this->underlying = underlying;
+}
+
+size_t TempAllocator::align_size(size_t size) {
+    return (size + ALIGN_SIZE - 1) / ALIGN_SIZE * ALIGN_SIZE;
+}
+
+unsigned long long TempAllocator::get_key(TempCachingBlock* block) {
+    return ((unsigned long long)block->size) * ID_LIMIT + block->id;
+}
+
+void* TempAllocator::alloc(size_t size, size_t& allocation) {
+    size = align_size(size);
+
+    auto temp = TempCachingBlock(size);
+    auto it = cached_blocks.lower_bound(get_key(&temp));
+    TempCachingBlock* block = nullptr;
+    if (it != cached_blocks.end()) {
+        // reuse the smallest cached block that is large enough
+        block = it->second;
+        cached_blocks.erase(it);
+        unused_memory -= block->size;
+    } else {
+        void* ptr = underlying->alloc(size, allocation);
+        block = new TempCachingBlock(size, ptr);
+        size_t id;
+        if (!block_ids.empty()) {
+            id = block_ids.back();
+            block_ids.pop_back();
+        } else {
+            ASSERT(tot_block_id < ID_LIMIT - 1) << "block id limit exceeded.";
+            id = ++tot_block_id;
+        }
+        block->id = id;
+    }
+
+    used_memory += block->size;
+    occupied_id_mapper[block->id] = block;
+    allocation = block->id;
+    return block->memory_ptr;
+}
+
+void TempAllocator::free(void* mem_ptr, size_t size, const size_t& allocation) {
+    size = align_size(size);
+    ASSERT(occupied_id_mapper[allocation] != nullptr) << "allocation not found";
+    TempCachingBlock* block = occupied_id_mapper[allocation];
+    occupied_id_mapper[allocation] = nullptr;
+    used_memory -= block->size;
+    unused_memory += block->size;
+    bool can_add = true;
+    if (cached_blocks.size() > cache_blocks_limit-1) {
+        ASSERT(cached_blocks.size() == cache_blocks_limit);
+        auto it = cached_blocks.lower_bound(get_key(block));
+        if (it == cached_blocks.begin()) {
+            // every cached block is larger than this one:
+            // release it directly instead of caching it.
+            can_add = false;
+            underlying->free((void*)block->memory_ptr, block->size, 0);
+            unused_memory -= block->size;
+            block_ids.push_back(block->id);
+            delete block;
+        } else {
+            // evict the largest cached block smaller than this one
+            --it;
+            TempCachingBlock* evict_block = it->second;
+            underlying->free((void*)evict_block->memory_ptr, evict_block->size, 0);
+            unused_memory -= evict_block->size;
+            block_ids.push_back(evict_block->id);
+            cached_blocks.erase(it);
+            delete evict_block;
+        }
+    }
+    if (can_add) {
+        cached_blocks[get_key(block)] = block;
+    }
+}
+
+void TempAllocator::gc() {
+    while (!cached_blocks.empty()) {
+        auto it = cached_blocks.begin();
+        TempCachingBlock* block = it->second;
+        underlying->free((void*)block->memory_ptr, block->size, 0);
+        unused_memory -= block->size;
+        block_ids.push_back(block->id);
+        cached_blocks.erase(it);
+        delete block;
+    }
+}
+
+bool TempAllocator::share_with(size_t size, size_t allocation) {
+    ASSERT(false);
+    return true;
+}
+
+} // jittor
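
The cache policy above is small: blocks are keyed by (size, id), alloc() reuses the smallest cached block that fits, and free() caches the freed block, evicting the largest cached block smaller than it once the cache is full. A minimal Python model of that policy (class and names are illustrative, not part of the diff):

    import bisect

    class TempCachePolicy:
        def __init__(self, limit=2):
            self.cache = []   # sorted list of (size, id), mirrors cached_blocks
            self.limit = limit

        def alloc(self, size):
            i = bisect.bisect_left(self.cache, (size, 0))
            if i < len(self.cache):
                return self.cache.pop(i)   # best fit: smallest block >= size
            return None                    # caller allocates a fresh block

        def free(self, size, block_id):
            key = (size, block_id)
            i = bisect.bisect_left(self.cache, key)
            if len(self.cache) >= self.limit:
                if i == 0:
                    return key             # nothing smaller to evict: release this block
                evicted = self.cache.pop(i - 1)
                self.cache.insert(bisect.bisect_left(self.cache, key), key)
                return evicted             # caller releases the evicted block
            self.cache.insert(i, key)
            return None                    # cached; nothing to release
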
diff --git a/src/mem/allocator/temp_allocator.h b/src/mem/allocator/temp_allocator.h
new file mode 100644
index 00000000..50a0969b
--- /dev/null
+++ b/src/mem/allocator/temp_allocator.h
@@ -0,0 +1,57 @@
+// ***************************************************************
+// Copyright (c) 2021 Jittor. All Rights Reserved.
+// Maintainers:
+//     Guoye Yang <498731903@qq.com>
+//     Dun Liang .
+//
+// This file is subject to the terms and conditions defined in
+// file 'LICENSE.txt', which is part of this source code package.
+// ***************************************************************
+#pragma once
+#include "mem/allocator.h"
+
+namespace jittor {
+
+struct TempCachingBlock {
+    size_t size;
+    size_t id;
+    void* memory_ptr;
+
+    TempCachingBlock(size_t size):size(size),id(0) {}
+    TempCachingBlock(size_t size, void* memory_ptr):size(size),id(0),memory_ptr(memory_ptr) {}
+};
+
+struct TempAllocator : Allocator {
+    static const size_t ALIGN_SIZE = 512;
+    static const size_t ID_LIMIT = 1 << 18;
+    Allocator* underlying;
+    size_t cache_blocks_limit, used_memory, unused_memory;
+    std::map<unsigned long long, TempCachingBlock*> cached_blocks;
+    std::vector<size_t> block_ids;
+    size_t tot_block_id;
+    std::unique_ptr<TempCachingBlock*[]> occupied_id_mapper;
+
+    inline TempAllocator(size_t cache_blocks_limit=2) : cache_blocks_limit(cache_blocks_limit), used_memory(0), unused_memory(0), tot_block_id(0), occupied_id_mapper(new TempCachingBlock*[ID_LIMIT]) {
+    }
+    inline TempAllocator(Allocator* underlying, size_t cache_blocks_limit=2) : TempAllocator(cache_blocks_limit) {
+        setup(underlying);
+    }
+    ~TempAllocator();
+
+    size_t align_size(size_t size);
+    unsigned long long get_key(TempCachingBlock* block);
+    // set the underlying allocator that actually provides the memory
+    void setup(Allocator* underlying);
+    uint64 flags() const override { return underlying->flags(); }
+    const char* name() const override;
+    void* alloc(size_t size, size_t& allocation) override;
+    void free(void* mem_ptr, size_t size, const size_t& allocation) override;
+    void gc() override;
+    virtual bool share_with(size_t size, size_t allocation) override;
+};
+
+DECLARE_FLAG(int, use_temp_allocator);
+
+} // jittor
diff --git a/src/mem/mem_info.cc b/src/mem/mem_info.cc
index ced2ea4f..de37b1a1 100644
--- a/src/mem/mem_info.cc
+++ b/src/mem/mem_info.cc
@@ -15,8 +15,10 @@
 #include "misc/cuda_flags.h"
 #include "mem/allocator/sfrl_allocator.h"
 #include "mem/allocator/stat_allocator.h"
+#include "mem/allocator/temp_allocator.h"
 #include "mem/mem_info.h"
 #include "update_queue.h"
+#include "executor.h"
 
 namespace jittor {
 
@@ -101,7 +103,13 @@ void display_memory_info(const char* fileline, bool dump_var, bool red_color) {
     log << "cpu&gpu:" << FloatOutput{(double)all_total, " KMG", 1024, "B"}
         << "gpu:" << FloatOutput{(double)gpu_total, " KMG", 1024, "B"}
         << "cpu:" << FloatOutput{(double)cpu_total, " KMG", 1024, "B"} >> '\n';
-
+    if (use_temp_allocator) {
+        TempAllocator* temp_allocator = (TempAllocator*)exe.temp_allocator;
+        log << "\nname:" << temp_allocator->name() << "\n";
+        log << "used_memory:" << FloatOutput{(double)temp_allocator->used_memory, " KMG", 1024, "B"} << "\n";
+        log << "unused_memory:" << FloatOutput{(double)temp_allocator->unused_memory, " KMG", 1024, "B"} << "\n";
+
+    }
     if (dump_var) {
         vector<Node*> queue;
         unordered_set<Node*> visited;
diff --git a/src/memory_profiler.cc b/src/memory_profiler.cc
new file mode 100644
index 00000000..572ff213
--- /dev/null
+++ b/src/memory_profiler.cc
@@ -0,0 +1,163 @@
+// ***************************************************************
+// Copyright (c) 2021 Jittor. All Rights Reserved.
+// Maintainers:
+//     Guoye Yang <498731903@qq.com>
+//     Dun Liang .
+//
+// This file is subject to the terms and conditions defined in
+// file 'LICENSE.txt', which is part of this source code package.
+// ***************************************************************
+#include "memory_profiler.h"
+#include "graph.h"
+#include "var_holder.h"
+#include "var.h"
+#include "mem/allocator/sfrl_allocator.h"
+#include <iostream>
+#include <sstream>
+#include <iomanip>
+#include <algorithm>
+#include "pybind/py_var_tracer.h"
+
+namespace jittor {
+
+//TODO reuse from mem_info.cc
+struct FloatOutput_ {
+    double value;
+    string scale;
+    int base;
+    string suffix;
+    int p=4;
+};
+inline std::ostream& operator<<(std::ostream& os, const FloatOutput_& o) {
+    int w = 8;
+    os << std::setw(w-2-o.suffix.size());
+    os << std::setprecision(o.p);
+    uint i=0;
+    double k = o.value;
+    for (; i+1 < o.scale.size(); i++) {
+        if (k < o.base) break;
+        k /= o.base;
+    }
+    os << k << o.scale[i];
+    return os << o.suffix;
+}
+
+MemoryProfiler memory_profiler;
+DEFINE_FLAG(int, profile_memory_enable, 0, "Enable memory profiler.");
+
+MemoryProfiler::MemoryProfiler() {
+    clear();
+}
+
+void MemoryProfiler::clear() {
+    allocations.clear();
+    max_live_vars.clear();
+    max_used_memory_size = 0;
+    max_memory_size = 0;
+}
+
+std::pair<size_t, size_t> MemoryProfiler::get_memory_info() {
+    ASSERT(profile_memory_enable == 1);
+    size_t used = 0;
+    size_t unused = 0;
+    //TODO add mssfrl allocator
+    for (auto& a : SFRLAllocator::sfrl_allocators) {
+        used += a->used_memory;
+        unused += a->unused_memory;
+    }
+    return std::make_pair(used, unused);
+}
+
+void MemoryProfiler::check() {
+    ASSERT(profile_memory_enable == 1);
+    std::pair<size_t, size_t> mem_info = get_memory_info();
+    if (mem_info.first > max_used_memory_size) {
+        max_used_memory_size = mem_info.first;
+
+        allocations.clear();
+        size_t memory_size = 0;
+        std::vector<std::pair<std::pair<string, vector<Stack>>, size_t>> live_vars;
+        vector<Node*> queue;
+
+        auto t = ++Node::tflag_count;
+        for (auto& vh : VarHolder::hold_vars)
+            if (vh->var->tflag != t) {
+                vh->var->tflag = t;
+                queue.push_back(vh->var);
+            }
+        bfs_both(queue, [](Node*){return true;});
+        for (Node* node : queue) {
+            if (node->is_var()) {
+                Var* var = (Var*)node;
+                if (var->mem_ptr != nullptr) {
+                    vector<Stack> stacks = get_node_trace(var);
+                    std::stringstream stream;
+                    stream << var;
+                    live_vars.push_back(std::make_pair(std::make_pair(stream.str(), stacks), var->size));
+                    if (!allocations.count(var->mem_ptr)) {
+                        allocations[var->mem_ptr] = 1;
+                        memory_size += var->size;
+                    }
+                }
+            }
+        }
+        max_live_vars = live_vars;
+        max_memory_size = memory_size;
+    }
+}
+
+bool MemoryProfiler::cmp(const std::pair<std::pair<string, vector<Stack>>, size_t>& a, const std::pair<std::pair<string, vector<Stack>>, size_t>& b) {
+    return a.second > b.second;
+}
+
+void MemoryProfiler::display_max_memory_info() {
+    ASSERT(profile_memory_enable == 1);
+    Log log("", 'i', 0);
+    std::sort(max_live_vars.begin(), max_live_vars.end(), cmp);
+    log << "\n=====display_max_memory_info=====\n";
+    log << "max used memory" << FloatOutput_{(double)max_used_memory_size, " KMG", 1024, "B"} << "\n";
+    log << "max var memory" << FloatOutput_{(double)max_memory_size, " KMG", 1024, "B"} << "\n\n";
+    log << "[Size]" << "[Percent]" << "[Var Info]" << "\n";
+    for (int i = 0; i < max_live_vars.size(); ++i) {
+        log << FloatOutput_{(double)max_live_vars[i].second, " KMG", 1024, "B"}
+            << double(max_live_vars[i].second) / max_memory_size * 100 << "%"
+            << max_live_vars[i].first.first
+            << max_live_vars[i].first.second[0].file_path + ":" + std::to_string(max_live_vars[i].first.second[0].lineno)
+            << "\n\n";
+    }
+    log << "=========================\n";
+    log.end();
+}
+
+void display_max_memory_info() {
+    ASSERT(profile_memory_enable == 1);
+    memory_profiler.display_max_memory_info();
+}
+
+string MemoryProfiler::get_max_memory_info() {
+    ASSERT(profile_memory_enable == 1);
+    std::stringstream out;
+    string div1 = "[!@#div1!@#]";
+    string div2 = "[!@#div2!@#]";
+    string div3 = "[!@#div3!@#]";
+
+    std::sort(max_live_vars.begin(), max_live_vars.end(), cmp);
+    out << max_memory_size;
+    for (int i = 0; i < max_live_vars.size(); ++i) {
+        out << div1;
+        out << max_live_vars[i].first.first << div2;
+        out << max_live_vars[i].second << div2;
+        for (int j = 0; j < max_live_vars[i].first.second.size(); ++j) {
+            out << max_live_vars[i].first.second[j].file_path + ":" + std::to_string(max_live_vars[i].first.second[j].lineno) << div3
+                << max_live_vars[i].first.second[j].module_name << div3
+                << max_live_vars[i].first.second[j].module_type << div2;
+        }
+    }
+    return out.str();
+}
+
+string get_max_memory_info() {
+    ASSERT(profile_memory_enable == 1);
+    return memory_profiler.get_max_memory_info();
+}
+
+} // jittor
\ No newline at end of file
diff --git a/src/memory_profiler.h b/src/memory_profiler.h
new file mode 100644
index 00000000..be1daac2
--- /dev/null
+++ b/src/memory_profiler.h
@@ -0,0 +1,46 @@
+// ***************************************************************
+// Copyright (c) 2021 Jittor. All Rights Reserved.
+// Maintainers:
+//     Guoye Yang <498731903@qq.com>
+//     Dun Liang .
+//
+// This file is subject to the terms and conditions defined in
+// file 'LICENSE.txt', which is part of this source code package.
+// ***************************************************************
+#pragma once
+#include "common.h"
+#include "mem/allocator.h"
+#include <map>
+#include <vector>
+#include <string>
+#include "var.h"
+#include "pybind/py_var_tracer.h"
+namespace jittor {
+
+// @pyjt(display_max_memory_info)
+void display_max_memory_info();
+// @pyjt(get_max_memory_info)
+string get_max_memory_info();
+
+struct MemoryProfiler {
+    std::map<void*, size_t> allocations;
+    // Max Infos
+    vector<std::pair<std::pair<string, vector<Stack>>, size_t>> max_live_vars;
+    size_t max_used_memory_size;
+    size_t max_memory_size;
+
+    MemoryProfiler();
+    static bool cmp(const std::pair<std::pair<string, vector<Stack>>, size_t>& a, const std::pair<std::pair<string, vector<Stack>>, size_t>& b);
+    void clear();
+    void check();
+    std::pair<size_t, size_t> get_memory_info();
+    void display_max_memory_info();
+    string get_max_memory_info();
+};
+
+extern MemoryProfiler memory_profiler;
+
+DECLARE_FLAG(int, profile_memory_enable);
+
+} // jittor
\ No newline at end of file
diff --git a/src/ops/candidate_op.cc b/src/ops/candidate_op.cc
index 0dd3ff4f..5a056722 100644
--- a/src/ops/candidate_op.cc
+++ b/src/ops/candidate_op.cc
@@ -76,9 +76,9 @@ void CandidateOp::jit_run() {
     // define ys
     auto* __restrict__ yp = y->ptr();
     size_t n_allocation;
-    int* np = (int*)exe.allocator->alloc(4, n_allocation);
+    int* np = (int*)exe.temp_allocator->alloc(4, n_allocation);
     size_t mask_allocation;
-    bool* maskp = (bool*)exe.allocator->alloc(xshape0, mask_allocation);
+    bool* maskp = (bool*)exe.temp_allocator->alloc(xshape0, mask_allocation);
     checkCudaErrors(cudaMemsetAsync(maskp, 1, xshape0));
 
     candidate_kernel<<<1, std::max(1, std::min(1024, xshape0)) >>>(
@@ -93,8 +93,8 @@ void CandidateOp::jit_run() {
     // checkCudaErrors(cudaDeviceSynchronize());
     checkCudaErrors(cudaMemcpy(&n, np, 4, cudaMemcpyDefault));
     y->set_shape({n});
-    exe.allocator->free(np, 4, n_allocation);
-    exe.allocator->free(maskp, xshape0, mask_allocation);
+    exe.temp_allocator->free(np, 4, n_allocation);
+    exe.temp_allocator->free(maskp, xshape0, mask_allocation);
 }
 #else
 void CandidateOp::jit_run() {
diff --git a/src/ops/where_op.cc b/src/ops/where_op.cc
index acb7a868..8e15dd81 100644
--- a/src/ops/where_op.cc
+++ b/src/ops/where_op.cc
@@ -196,7 +196,7 @@ void WhereOp::jit_run() {
     @for(i, 0, NDIM, auto* __restrict__ outs@i@@p = outs[@i]->ptr();)
     size_t n_allocation;
-    int* np = (int*)exe.allocator->alloc(4, n_allocation);
+    int* np = (int*)exe.temp_allocator->alloc(4, n_allocation);
 
     // one block kernel, result maybe unstable
     // int tnum = condshape@{NDIM-1};
@@ -232,7 +232,7 @@ void WhereOp::jit_run() {
     // checkCudaErrors(cudaDeviceSynchronize());
     checkCudaErrors(cudaMemcpy(&n, np, 4, cudaMemcpyDefault));
     @for(i, 0, NDIM, outs[@i]->set_shape({n});)
-    exe.allocator->free(np, 4, n_allocation);
+    exe.temp_allocator->free(np, 4, n_allocation);
 }
 
 #else
diff --git a/src/opt/gopt/setitem_gopt.cc b/src/opt/gopt/setitem_gopt.cc
index 4cef6694..c777d2a0 100644
--- a/src/opt/gopt/setitem_gopt.cc
+++ b/src/opt/gopt/setitem_gopt.cc
@@ -48,7 +48,6 @@ static void setitem_inplace(SetitemOp* op) {
     }
     auto output = op->outputs().front();
     output->share_with(input);
-    // return;
 
     auto data = op->input(1);
     // if setitem requires type conversion, don't inplace
diff --git a/src/pybind/py_var_tracer.cc b/src/pybind/py_var_tracer.cc
index 769a3f96..e46b2afe 100644
--- a/src/pybind/py_var_tracer.cc
+++ b/src/pybind/py_var_tracer.cc
@@ -1,6 +1,9 @@
 // ***************************************************************
-// Copyright (c) 2021 Jittor. All Rights Reserved.
-// Maintainers: Dun Liang .
+// Copyright (c) 2021 Jittor. All Rights Reserved.
+// Maintainers:
+//     Dun Liang .
+//     Guoye Yang <498731903@qq.com>
+//
 // This file is subject to the terms and conditions defined in
 // file 'LICENSE.txt', which is part of this source code package.
 // ***************************************************************
@@ -97,7 +100,8 @@ static vector<Stack> get_stack_info() {
     PyObject* prev_obj = nullptr;
     if (trace_py_var >= 3) {
         // trace raw stack
-        auto start = std::max(0, n-5);
+        // auto start = std::max(0, n-5);
+        auto start = 0;
         for (int i=start; i<n; i++) {
             auto f = frames[i];
             auto filename = to_string(f->f_code->co_filename);
@@ -363,4 +367,16 @@ void print_node_trace(const Node* node, std::ostream& os) {
     os << _get_stack_info((Node*)node);
 }
 
+vector<Stack> get_node_trace(Node* node) {
+    auto iter = trace_data.id_map.find(node);
+    if (iter == trace_data.id_map.end())
+        return vector<Stack>();
+    auto node_id = iter->second;
+    auto iter2 = trace_data.node_data.find(node_id);
+    if (iter2 == trace_data.node_data.end())
+        return vector<Stack>();
+    return iter2->second.stacks;
+}
+
+
 } // jittor
diff --git a/src/pybind/py_var_tracer.h b/src/pybind/py_var_tracer.h
index d797b847..669d570f 100644
--- a/src/pybind/py_var_tracer.h
+++ b/src/pybind/py_var_tracer.h
@@ -67,5 +67,5 @@ struct TraceData {
 extern TraceData trace_data;
 
 void print_node_trace(const Node* node, std::ostream& os);
-
+vector<Stack> get_node_trace(Node* node);
 } // jittor