mirror of https://github.com/Jittor/Jittor
temp_allocator
commit a96ccab4bb
parent a3a09a4837
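Summary: this change introduces a TempAllocator for short-lived scratch memory, exposes it on the executor as exe.temp_allocator, and switches the temporary buffers of the CUB, cuDNN convolution, candidate, and where ops from exe.allocator to exe.temp_allocator.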
@@ -87,7 +87,7 @@ void CubArgReduceOp::jit_run() {
 num_segments *= x->shape[i];
 }
 size_t allocation_dout;
-cub::KeyValuePair<int, Tx> *d_out = (cub::KeyValuePair<int, Tx> *)exe.allocator->alloc(sizeof(cub::KeyValuePair<int, Tx>) * num_segments, allocation_dout);
+cub::KeyValuePair<int, Tx> *d_out = (cub::KeyValuePair<int, Tx> *)exe.temp_allocator->alloc(sizeof(cub::KeyValuePair<int, Tx>) * num_segments, allocation_dout);

 // Determine temporary device storage requirements
 void *d_temp_storage = NULL;
@@ -96,7 +96,7 @@ void CubArgReduceOp::jit_run() {
 xp, d_out, num_segments, offsetsp, offsetsp + 1);
 // Allocate temporary storage
 size_t allocation;
-d_temp_storage = exe.allocator->alloc(temp_storage_bytes, allocation);
+d_temp_storage = exe.temp_allocator->alloc(temp_storage_bytes, allocation);
 // Run reduce operation
 cub::DeviceSegmentedReduce::@FUNC@@(d_temp_storage, temp_storage_bytes,
 xp, d_out, num_segments, offsetsp, offsetsp + 1);
@@ -105,8 +105,8 @@ void CubArgReduceOp::jit_run() {
 auto* __restrict__ y_keyp = y_key->ptr<Tx>();
 split<<<max(1,num_segments/1024),1024>>>(d_out, y_keyp, yp, num_segments);

-exe.allocator->free(d_temp_storage, temp_storage_bytes, allocation);
-exe.allocator->free(d_out, sizeof(cub::KeyValuePair<int, Tx>) * num_segments, allocation_dout);
+exe.temp_allocator->free(d_temp_storage, temp_storage_bytes, allocation);
+exe.temp_allocator->free(d_out, sizeof(cub::KeyValuePair<int, Tx>) * num_segments, allocation_dout);
 }
 #endif // JIT_cuda
 #endif // JIT
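Every hunk below follows one idiom: allocate a scratch block from the executor's temp allocator, use it in a kernel or library call, then return it with the same size and the allocation token that alloc filled in. A minimal sketch of the idiom (nbytes and buf are illustrative names, not from the diff):

    size_t allocation;                                 // receives the block id from alloc
    void* buf = exe.temp_allocator->alloc(nbytes, allocation);
    // ... launch kernels or CUB/cuDNN calls that use buf ...
    exe.temp_allocator->free(buf, nbytes, allocation); // same size and token go back

The argsort, where, cuDNN convolution, and candidate hunks that follow all repeat this pattern.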
@@ -85,12 +85,12 @@ void CubArgsortOp::jit_run() {
 num_items, num_segments, offsetsp, offsetsp + 1);
 // Allocate temporary storage
 size_t allocation;
-d_temp_storage = exe.allocator->alloc(temp_storage_bytes, allocation);
+d_temp_storage = exe.temp_allocator->alloc(temp_storage_bytes, allocation);
 // Run sorting operation
 cub::DeviceSegmentedRadixSort::@FUNC@@(d_temp_storage, temp_storage_bytes,
 xp, y_keyp, indexesp, yp,
 num_items, num_segments, offsetsp, offsetsp + 1);
-exe.allocator->free(d_temp_storage, temp_storage_bytes, allocation);
+exe.temp_allocator->free(d_temp_storage, temp_storage_bytes, allocation);
 }
 #endif // JIT_cuda
 #endif // JIT
@@ -82,7 +82,7 @@ void CubWhereOp::jit_run(){
 int N = cond->num;
 size_t temp_storage_bytes=0;
 size_t num_nonzeros_allocation;
-auto num_nonzeros = exe.allocator->alloc(sizeof(To), num_nonzeros_allocation);
+auto num_nonzeros = exe.temp_allocator->alloc(sizeof(To), num_nonzeros_allocation);

 size_t temp_storage_allocation;
 void* temp_storage;
@@ -93,9 +93,9 @@ void CubWhereOp::jit_run(){
 cub::TransformInputIterator<bool, NonZeroOp<Ti>, Ti*> itr(cond->ptr<Ti>(), NonZeroOp<Ti>());
 temp_storage_bytes = 0;
 checkCudaErrors(cub::DeviceSelect::Flagged(nullptr, temp_storage_bytes, counting_itr, itr, out_temp, (To*)num_nonzeros, N));
-temp_storage = exe.allocator->alloc(temp_storage_bytes, temp_storage_allocation);
+temp_storage = exe.temp_allocator->alloc(temp_storage_bytes, temp_storage_allocation);
 checkCudaErrors(cub::DeviceSelect::Flagged(temp_storage, temp_storage_bytes, counting_itr, itr, out_temp, (To*)num_nonzeros, N));
-exe.allocator->free(temp_storage, temp_storage_bytes, temp_storage_allocation);
+exe.temp_allocator->free(temp_storage, temp_storage_bytes, temp_storage_allocation);

 To num_nonzeros_h;
 cudaMemcpy(&num_nonzeros_h, num_nonzeros, sizeof(To), cudaMemcpyDeviceToHost);
@@ -110,7 +110,7 @@ void CubWhereOp::jit_run(){
 @for(i, 0, NDIM, 1, , cond->shape[@i], outs[@i]->ptr<To>())
 );
 }
-exe.allocator->free(num_nonzeros, sizeof(int), num_nonzeros_allocation);
+exe.temp_allocator->free(num_nonzeros, sizeof(int), num_nonzeros_allocation);

 }
 #endif
@@ -203,7 +203,7 @@ void CudnnConvBackwardWOp::jit_run() {
 if (CUDNN_STATUS_SUCCESS == ret && sz > max_ws_size) max_ws_size = sz;
 }
 size_t allocation;
-void* ws = exe.allocator->alloc(max_ws_size, allocation);
+void* ws = exe.temp_allocator->alloc(max_ws_size, allocation);
 checkCudaErrors(cudnnFindConvolutionBackwardFilterAlgorithmEx(
 handle_,
 cudnnIdesc, x->ptr<Tx>(),
@@ -215,7 +215,7 @@ void CudnnConvBackwardWOp::jit_run() {
 perf_results,
 ws,
 max_ws_size));
-exe.allocator->free(ws, max_ws_size, allocation);
+exe.temp_allocator->free(ws, max_ws_size, allocation);
 } else {
 checkCudaErrors(cudnnGetConvolutionBackwardFilterAlgorithm_v7(
 handle_,
@@ -250,7 +250,7 @@ void CudnnConvBackwardWOp::jit_run() {
 cudnnFdesc, algo, &workSpaceSize));
 size_t allocation;
 if (workSpaceSize > 0) {
-workSpace = exe.allocator->alloc(workSpaceSize, allocation);
+workSpace = exe.temp_allocator->alloc(workSpaceSize, allocation);
 }
 float alpha=1, beta=0;
 checkCudaErrors(cudnnConvolutionBackwardFilter(
@@ -265,7 +265,7 @@ void CudnnConvBackwardWOp::jit_run() {
 cudnnFdesc, w->ptr<Tw>())
 );
 if (workSpace)
-exe.allocator->free(workSpace, workSpaceSize, allocation);
+exe.temp_allocator->free(workSpace, workSpaceSize, allocation);

 checkCudaErrors(cudnnDestroyTensorDescriptor( cudnnIdesc ));
 checkCudaErrors(cudnnDestroyFilterDescriptor( cudnnFdesc ));
@@ -204,7 +204,7 @@ void CudnnConvBackwardXOp::jit_run() {
 if (CUDNN_STATUS_SUCCESS == ret && sz > max_ws_size) max_ws_size = sz;
 }
 size_t allocation;
-void* ws = exe.allocator->alloc(max_ws_size, allocation);
+void* ws = exe.temp_allocator->alloc(max_ws_size, allocation);
 checkCudaErrors(cudnnFindConvolutionBackwardDataAlgorithmEx(
 handle_,
 cudnnFdesc, w->ptr<Tw>(),
@@ -216,7 +216,7 @@ void CudnnConvBackwardXOp::jit_run() {
 perf_results,
 ws,
 max_ws_size));
-exe.allocator->free(ws, max_ws_size, allocation);
+exe.temp_allocator->free(ws, max_ws_size, allocation);
 } else {
 checkCudaErrors(cudnnGetConvolutionBackwardDataAlgorithm_v7(
 handle_,
@@ -251,7 +251,7 @@ void CudnnConvBackwardXOp::jit_run() {
 cudnnIdesc, algo, &workSpaceSize));
 size_t allocation;
 if (workSpaceSize > 0) {
-workSpace = exe.allocator->alloc(workSpaceSize, allocation);
+workSpace = exe.temp_allocator->alloc(workSpaceSize, allocation);
 }
 float alpha=1, beta=0;
 checkCudaErrors(cudnnConvolutionBackwardData(
@@ -266,7 +266,7 @@ void CudnnConvBackwardXOp::jit_run() {
 cudnnIdesc, x->ptr<Tx>())
 );
 if (workSpace)
-exe.allocator->free(workSpace, workSpaceSize, allocation);
+exe.temp_allocator->free(workSpace, workSpaceSize, allocation);

 checkCudaErrors(cudnnDestroyTensorDescriptor( cudnnIdesc ));
 checkCudaErrors(cudnnDestroyFilterDescriptor( cudnnFdesc ));
@@ -208,7 +208,7 @@ void CudnnConvOp::jit_run() {
 if (CUDNN_STATUS_SUCCESS == ret && sz > max_ws_size) max_ws_size = sz;
 }
 size_t allocation;
-void* ws = exe.allocator->alloc(max_ws_size, allocation);
+void* ws = exe.temp_allocator->alloc(max_ws_size, allocation);
 checkCudaErrors(cudnnFindConvolutionForwardAlgorithmEx(
 handle_,
 cudnnIdesc, x->ptr<Tx>(),
@@ -220,7 +220,7 @@ void CudnnConvOp::jit_run() {
 perf_results,
 ws,
 max_ws_size));
-exe.allocator->free(ws, max_ws_size, allocation);
+exe.temp_allocator->free(ws, max_ws_size, allocation);
 } else {
 checkCudaErrors(cudnnGetConvolutionForwardAlgorithm_v7(
 handle_,
@@ -255,7 +255,7 @@ void CudnnConvOp::jit_run() {
 cudnnOdesc, algo, &workSpaceSize) );
 size_t allocation;
 if (workSpaceSize > 0) {
-workSpace = exe.allocator->alloc(workSpaceSize, allocation);
+workSpace = exe.temp_allocator->alloc(workSpaceSize, allocation);
 }
 float alpha=1, beta=0;
 checkCudaErrors(cudnnConvolutionForward(
@@ -270,7 +270,7 @@ void CudnnConvOp::jit_run() {
 cudnnOdesc, y->ptr<Ty>())
 );
 if (workSpace)
-exe.allocator->free(workSpace, workSpaceSize, allocation);
+exe.temp_allocator->free(workSpace, workSpaceSize, allocation);

 checkCudaErrors(cudnnDestroyTensorDescriptor( cudnnIdesc ));
 checkCudaErrors(cudnnDestroyFilterDescriptor( cudnnFdesc ));
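All three cuDNN convolution ops (forward, backward-data, backward-filter) get the identical treatment: both the algorithm-search workspace and the execution workspace now come from exe.temp_allocator and are released as soon as the cuDNN call returns, keeping these large, short-lived buffers out of the main allocator's cache.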
@@ -92,7 +92,9 @@ void load_fused_op(FusedOp& fused_op, vector<int>& fuse_ops, vector<Op*>& ops, i

 void Executor::run_sync(vector<Var*> vars, bool device_sync) {
 auto allocator = get_allocator();
+auto temp_allocator = get_allocator(true);
 this->allocator = allocator;
+this->temp_allocator = temp_allocator;
 // bfs find all ops need to run
 int op_num = 0;
 vector<Node*> bfs_q;
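run_sync now resolves two allocators up front: get_allocator() for ordinary Var storage and get_allocator(true) for per-op scratch memory, and stashes both on the executor so ops can reach them as exe.allocator and exe.temp_allocator.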
@@ -16,6 +16,7 @@ namespace jittor {

 struct Executor {
 Allocator* allocator;
+Allocator* temp_allocator;
 bool last_is_cuda = false;
 void run_sync(vector<Var*> vars, bool device_sync);
 };
@@ -15,6 +15,7 @@
 #include "mem/allocator/stat_allocator.h"
 #include "mem/allocator/sfrl_allocator.h"
 #include "mem/allocator/nfef_allocator.h"
+#include "mem/allocator/temp_allocator.h"

 namespace jittor {

@@ -46,7 +47,7 @@ Allocator* setup_allocator(Allocator* underlying) {

 Allocator* cpu_allocator = setup_allocator<SFRLAllocator>(&aligned_allocator);

-Allocator* get_allocator() {
+Allocator* get_allocator(bool temp_allocator) {
 Allocator* allocator = nullptr;
 #ifdef HAS_CUDA
 if (use_cuda && !allocator) {
@@ -72,7 +73,10 @@ Allocator* get_allocator() {
 allocator = setup_allocator<NFEFAllocator>(allocator);
 return allocator;
 }
-if (use_sfrl_allocator) {
+if (temp_allocator && use_temp_allocator) {
+LOGvv << "Using temp_allocator";
+allocator = setup_allocator<TempAllocator>(allocator);
+} else if (use_sfrl_allocator) {
 LOGvv << "Using sfrl_allocator";
 allocator = setup_allocator<SFRLAllocator>(allocator);
 }
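get_allocator(true) builds the same underlying chain but tops it with a TempAllocator (when the use_temp_allocator flag is on) instead of the SFRL cache; plain get_allocator() keeps its old behavior through the defaulted parameter declared in allocator.h below. In sketch form:

    Allocator* main_alloc = get_allocator();      // SFRL-cached chain, as before
    Allocator* temp_alloc = get_allocator(true);  // same chain topped with TempAllocator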
@@ -49,7 +49,7 @@ struct Allocation {
 };

 extern Allocator* cpu_allocator;
-Allocator* get_allocator();
+Allocator* get_allocator(bool temp_allocator=false);
 // @pyjt(gc)
 void gc_all();

@@ -0,0 +1,116 @@
+// ***************************************************************
+// Copyright (c) 2020 Jittor. All Rights Reserved.
+// Maintainers:
+//     Guoye Yang <498731903@qq.com>
+//     Dun Liang <randonlang@gmail.com>.
+//
+// This file is subject to the terms and conditions defined in
+// file 'LICENSE.txt', which is part of this source code package.
+// ***************************************************************
+
+#include "mem/allocator/temp_allocator.h"
+
+namespace jittor {
+
+DEFINE_FLAG(int, use_temp_allocator, 1, "Enable temp allocator");
+
+TempAllocator::~TempAllocator() {
+    while (!cached_blocks.empty()) {
+        auto it = cached_blocks.begin();
+        TempCachingBlock* block = it->second;
+        cached_blocks.erase(it);
+        delete block;
+    }
+}
+
+const char* TempAllocator::name() const {return "temp";}
+
+void TempAllocator::setup(Allocator* underlying) {
+    this->underlying = underlying;
+}
+
+size_t TempAllocator::align_size(size_t size) {
+    return (size + ALIGN_SIZE - 1) / ALIGN_SIZE * ALIGN_SIZE;
+}
+
+unsigned long long TempAllocator::get_key(TempCachingBlock* block) {
+    return ((unsigned long long)block->size) * ID_LIMIT + block->id;
+}
+
+void* TempAllocator::alloc(size_t size, size_t& allocation) {
+    size = align_size(size);
+
+    auto temp = TempCachingBlock(size);
+    auto it = cached_blocks.lower_bound(get_key(&temp));
+    TempCachingBlock* block = nullptr;
+    if (it != cached_blocks.end()) {
+        block = it->second;
+        cached_blocks.erase(it);
+        unused_memory -= block->size;
+    } else {
+        void* ptr = underlying->alloc(size, allocation);
+        block = new TempCachingBlock(size, ptr);
+        size_t id;
+        if (!block_ids.empty()) {
+            id = block_ids.back();
+            block_ids.pop_back();
+        } else {
+            ASSERT(tot_block_id < ID_LIMIT - 1) << "block id limit exceeded.";
+            id = ++tot_block_id;
+        }
+        block->id = id;
+    }
+
+    used_memory += block->size;
+    occupied_id_mapper[block->id] = block;
+    allocation = block->id;
+    return block->memory_ptr;
+}
+
+void TempAllocator::free(void* mem_ptr, size_t size, const size_t& allocation) {
+    size = align_size(size);
+    ASSERT(occupied_id_mapper[allocation] != nullptr) << "allocation not found";
+    TempCachingBlock* block = occupied_id_mapper[allocation];
+    occupied_id_mapper[allocation] = nullptr;
+    used_memory -= block->size;
+    unused_memory += block->size;
+    bool can_add = true;
+    if (cached_blocks.size() > cache_blocks_limit-1) {
+        ASSERT(cached_blocks.size() == cache_blocks_limit);
+        auto it = cached_blocks.lower_bound(get_key(block));
+        if (it == cached_blocks.begin()) {
+            can_add = false;
+        } else {
+            --it;
+            TempCachingBlock* block = it->second;
+            underlying->free((void*)block->memory_ptr, block->size, 0);
+            unused_memory -= block->size;
+            block_ids.push_back(block->id);
+            cached_blocks.erase(it);
+            delete block;
+        }
+    }
+    if (can_add) {
+        cached_blocks[get_key(block)] = block;
+    }
+}
+
+void TempAllocator::gc() {
+    while (!cached_blocks.empty()) {
+        auto it = cached_blocks.begin();
+        TempCachingBlock* block = it->second;
+        underlying->free((void*)block->memory_ptr, block->size, 0);
+        unused_memory -= block->size;
+        block_ids.push_back(block->id);
+        cached_blocks.erase(it);
+        delete block;
+    }
+}
+
+bool TempAllocator::share_with(size_t size, size_t allocation) {
+    ASSERT(false);
+    return true;
+}
+
+} // jittor
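cached_blocks is a std::map keyed by get_key, which packs (size, id) into a single integer with size as the major component. Probing lower_bound with (requested_size, 0) is therefore a best-fit lookup: it returns the smallest cached block at least as large as the request. free keeps at most cache_blocks_limit blocks cached; when full, it evicts the incoming block's nearest smaller neighbor in key order back to the underlying allocator. A standalone sketch of the lookup (block sizes and ids are made up for illustration):

    #include <cstdio>
    #include <map>

    int main() {
        // Same packing as TempAllocator::get_key: size is the major key,
        // block id the minor key, so map order is "by size, then by id".
        const unsigned long long ID_LIMIT = 1 << 18;
        auto key = [&](unsigned long long size, unsigned long long id) {
            return size * ID_LIMIT + id;
        };
        std::map<unsigned long long, unsigned long long> cached; // key -> block size
        cached[key(512, 1)]  = 512;   // one cached 512B block (id 1)
        cached[key(2048, 2)] = 2048;  // one cached 2KB block (id 2)
        // A 1024B request probes with id 0: lower_bound skips the 512B block
        // and lands on the smallest cached block large enough to reuse.
        auto it = cached.lower_bound(key(1024, 0));
        printf("reuse a %llu-byte block\n", it->second); // prints 2048
        return 0;
    }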
@@ -0,0 +1,57 @@
+// ***************************************************************
+// Copyright (c) 2020 Jittor. All Rights Reserved.
+// Maintainers:
+//     Guoye Yang <498731903@qq.com>
+//     Dun Liang <randonlang@gmail.com>.
+//
+// This file is subject to the terms and conditions defined in
+// file 'LICENSE.txt', which is part of this source code package.
+// ***************************************************************
+#pragma once
+#include "mem/allocator.h"
+
+namespace jittor {
+
+struct TempCachingBlock {
+    size_t size;
+    size_t id;
+    void* memory_ptr;
+
+    TempCachingBlock(size_t size) : size(size), id(0) {}
+    TempCachingBlock(size_t size, void* memory_ptr) : size(size), id(0), memory_ptr(memory_ptr) {}
+};
+
+struct TempAllocator : Allocator {
+    static const size_t ALIGN_SIZE = 512;
+    static const size_t ID_LIMIT = 1 << 18;
+    Allocator* underlying;
+    size_t cache_blocks_limit, used_memory, unused_memory;
+    std::map<unsigned long long, TempCachingBlock*> cached_blocks;
+    std::vector<size_t> block_ids;
+    size_t tot_block_id;
+    std::unique_ptr<TempCachingBlock*[]> occupied_id_mapper;
+
+    inline TempAllocator(size_t cache_blocks_limit=2) : cache_blocks_limit(cache_blocks_limit), used_memory(0), unused_memory(0), tot_block_id(0), occupied_id_mapper(new TempCachingBlock*[ID_LIMIT]) {}
+    inline TempAllocator(Allocator* underlying, size_t cache_blocks_limit=2) : TempAllocator(cache_blocks_limit) {
+        setup(underlying);
+    }
+    ~TempAllocator();
+
+    size_t align_size(size_t size);
+    unsigned long long get_key(TempCachingBlock* block);
+    void setup(Allocator* underlying);
+    uint64 flags() const override { return underlying->flags(); }
+    const char* name() const override;
+    void* alloc(size_t size, size_t& allocation) override;
+    void free(void* mem_ptr, size_t size, const size_t& allocation) override;
+    // gc frees all cached blocks back to the underlying allocator.
+    void gc() override;
+    virtual bool share_with(size_t size, size_t allocation) override;
+};
+
+DECLARE_FLAG(int, use_temp_allocator);
+
+} // jittor
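All requests are rounded up to ALIGN_SIZE before keying, so blocks differing only by a few bytes fall into the same size class and can be reused for one another. A quick standalone check of that rounding (the 512-byte constant is copied from the header above):

    #include <cassert>
    #include <cstddef>

    // Same rounding as TempAllocator::align_size, with ALIGN_SIZE = 512.
    size_t align_size(size_t size) {
        const size_t ALIGN_SIZE = 512;
        return (size + ALIGN_SIZE - 1) / ALIGN_SIZE * ALIGN_SIZE;
    }

    int main() {
        assert(align_size(1) == 512);     // tiny requests round up to one unit
        assert(align_size(512) == 512);   // exact multiples are unchanged
        assert(align_size(513) == 1024);  // anything over rounds to the next unit
        return 0;
    }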
@@ -15,8 +15,10 @@
 #include "misc/cuda_flags.h"
 #include "mem/allocator/sfrl_allocator.h"
 #include "mem/allocator/stat_allocator.h"
+#include "mem/allocator/temp_allocator.h"
 #include "mem/mem_info.h"
 #include "update_queue.h"
+#include "executor.h"

 namespace jittor {

@@ -101,7 +103,13 @@ void display_memory_info(const char* fileline, bool dump_var, bool red_color) {
 log << "cpu&gpu:" << FloatOutput{(double)all_total, " KMG", 1024, "B"}
 << "gpu:" << FloatOutput{(double)gpu_total, " KMG", 1024, "B"}
 << "cpu:" << FloatOutput{(double)cpu_total, " KMG", 1024, "B"} >> '\n';
+if (use_temp_allocator) {
+TempAllocator* temp_allocator = (TempAllocator*)exe.temp_allocator;
+log << "\nname:" << temp_allocator->name() << "\n";
+log << "used_memory:" << FloatOutput{(double)temp_allocator->used_memory, " KMG", 1024, "B"} << "\n";
+log << "unused_memory:" << FloatOutput{(double)temp_allocator->unused_memory, " KMG", 1024, "B"} << "\n";

+}
 if (dump_var) {
 vector<Node*> queue;
 unordered_set<Node*> visited;
@@ -76,9 +76,9 @@ void CandidateOp::jit_run() {
 // define ys
 auto* __restrict__ yp = y->ptr<Ty>();
 size_t n_allocation;
-int* np = (int*)exe.allocator->alloc(4, n_allocation);
+int* np = (int*)exe.temp_allocator->alloc(4, n_allocation);
 size_t mask_allocation;
-bool* maskp = (bool*)exe.allocator->alloc(xshape0, mask_allocation);
+bool* maskp = (bool*)exe.temp_allocator->alloc(xshape0, mask_allocation);
 checkCudaErrors(cudaMemsetAsync(maskp, 1, xshape0));

 candidate_kernel<<<1, std::max(1, std::min(1024, xshape0)) >>>(
@@ -93,8 +93,8 @@ void CandidateOp::jit_run() {
 // checkCudaErrors(cudaDeviceSynchronize());
 checkCudaErrors(cudaMemcpy(&n, np, 4, cudaMemcpyDefault));
 y->set_shape({n});
-exe.allocator->free(np, 4, n_allocation);
-exe.allocator->free(maskp, xshape0, mask_allocation);
+exe.temp_allocator->free(np, 4, n_allocation);
+exe.temp_allocator->free(maskp, xshape0, mask_allocation);
 }
 #else
 void CandidateOp::jit_run() {
@@ -196,7 +196,7 @@ void WhereOp::jit_run() {
 @for(i, 0, NDIM, auto* __restrict__ outs@i@@p = outs[@i]->ptr<To>();)

 size_t n_allocation;
-int* np = (int*)exe.allocator->alloc(4, n_allocation);
+int* np = (int*)exe.temp_allocator->alloc(4, n_allocation);

 // one block kernel, result maybe unstable
 // int tnum = condshape@{NDIM-1};
@@ -232,7 +232,7 @@ void WhereOp::jit_run() {
 // checkCudaErrors(cudaDeviceSynchronize());
 checkCudaErrors(cudaMemcpy(&n, np, 4, cudaMemcpyDefault));
 @for(i, 0, NDIM, outs[@i]->set_shape({n});)
-exe.allocator->free(np, 4, n_allocation);
+exe.temp_allocator->free(np, 4, n_allocation);
 }
 #else

@@ -48,7 +48,7 @@ static void setitem_inplace(SetitemOp* op) {
 }
 auto output = op->outputs().front();
 output->share_with(input);
-// return;
+return;

 // LOGir << "pass setitem optim one";

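This final hunk is unrelated to the allocator work: it re-enables the previously commented-out early return in setitem_inplace, so the pass now stops right after sharing the output with its input.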