Support disabling CUDA managed memory

This commit is contained in:
Dun Liang 2021-02-22 15:39:23 +08:00
parent aefe719770
commit bebd218bdd
6 changed files with 76 additions and 11 deletions

View File

@ -8,7 +8,7 @@
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
__version__ = '1.2.2.32'
__version__ = '1.2.2.33'
from . import lock
with lock.lock_scope():
ori_int = int

View File

@ -23,5 +23,16 @@ class TestAllocator(unittest.TestCase):
assert jt.flags.stat_allocator_total_free_call == 2
assert jt.flags.stat_allocator_total_free_byte == 800
@unittest.skipIf(not jt.has_cuda, "Cuda not found")
@jt.flag_scope(use_cuda=1, use_cuda_managed_allocator=0)
def test_device_allocator(self):
    # Exercise the non-managed (plain device) allocator path: build data on
    # GPU, then force a CPU op so the executor must migrate it back to host.
    src = jt.array([1,2,3,4,5])
    shifted = src + 1
    result = jt.code(src.shape, src.dtype, [shifted], cpu_src="""
for (int i=0; i<in0_shape0; i++)
@out(i) = @in0(i)*@in0(i)*2;
""")
    # (x+1)^2 * 2 for x in 1..5
    expected = [8,18,32,50,72]
    assert (result.data == expected).all()
if __name__ == "__main__":
unittest.main()

View File

@ -37,6 +37,10 @@ DECLARE_FLAG(int, profile_memory_enable);
// from fetch_op.cc
extern list<VarPtr> fetcher_to_free;
// from cuda_managed_allocator
#ifdef HAS_CUDA
DECLARE_FLAG(int, use_cuda_managed_allocator);
#endif
void load_fused_op(FusedOp& fused_op, vector<int>& fuse_ops, vector<Op*>& ops, int ll, int rr, int64 tt) {
fused_op.ops.clear();
@ -445,6 +449,14 @@ void Executor::run_sync(vector<Var*> vars, bool device_sync) {
for (Var* v : op->inputs()) {
migrate_to_cpu(v, allocator);
}
if (!use_cuda_managed_allocator) {
for (auto* var : op->outputs()) {
var->allocator->free(var->mem_ptr, var->size, var->allocation);
var->mem_ptr = var->allocator = nullptr;
var->allocation = 0;
var->alloc(cpu_allocator);
}
}
}
#endif
#ifdef NODE_MEMCHECK
@ -459,6 +471,14 @@ void Executor::run_sync(vector<Var*> vars, bool device_sync) {
#endif
last_is_cuda = is_cuda;
op->do_run_after_prepare(jkl);
#ifdef HAS_CUDA
// migrate to gpu
if (PREDICT_BRANCH_NOT_TAKEN((!is_cuda && use_cuda && !use_cuda_managed_allocator))) {
for (Var* v : op->outputs()) {
migrate_to_gpu(v, allocator);
}
}
#endif
// record trace data
if (PREDICT_BRANCH_NOT_TAKEN(trace_py_var>=2)) {
trace_data.record_execution(op, is_fused_op, jkl);

View File

@ -11,6 +11,7 @@
#ifdef HAS_CUDA
#include "mem/allocator/cuda_managed_allocator.h"
#include "mem/allocator/cuda_device_allocator.h"
#include "mem/allocator/cuda_dual_allocator.h"
#endif
#include "mem/allocator/stat_allocator.h"
#include "mem/allocator/sfrl_allocator.h"
@ -91,4 +92,42 @@ void gc_all() {
for (auto& kv : allocators) kv.second->gc();
}
#ifdef HAS_CUDA
// Migrate a Var's backing storage so it is accessible from the CPU.
// With use_cuda_managed_allocator=1 the memory is already host-visible,
// so only delay_free bookkeeping may be needed; with it disabled, the
// data is copied device->host and the device buffer is released.
// NOTE(review): assumes var->mem_ptr is a valid device (or managed)
// allocation of var->size bytes — verify at call sites.
void migrate_to_cpu(Var* var, Allocator* allocator) {
// When managed memory is disabled, the destination must be plain host memory.
if (!use_cuda_managed_allocator)
allocator = cpu_allocator;
if (var->allocator == &delay_free) {
// Block is parked in the delay-free staging allocator; reassign
// ownership and let delay_free perform the migration itself.
var->allocator = allocator;
delay_free.migrate_to_cpu(
var->mem_ptr, var->allocation, var->size, var->allocator
);
} else
if (!use_cuda_managed_allocator) {
// must be a device allocator
// Allocate a fresh host block, copy the data across, then free the
// device-side block and hand ownership of the host block to var.
Allocation a(allocator, var->size);
checkCudaErrors(cudaMemcpy(a.ptr, var->mem_ptr, var->size, cudaMemcpyDeviceToHost));
var->allocator->free(var->mem_ptr, var->size, var->allocation);
var->mem_ptr = a.ptr;
var->allocation = a.allocation;
var->allocator = a.allocator;
// Null a.ptr so Allocation's destructor does not release the buffer
// now owned by var — presumably RAII on ptr; TODO confirm.
a.ptr = nullptr;
}
}
// Migrate a Var's backing storage from host memory to device memory.
// Only happens when use_cuda_managed_allocator is disabled (with managed
// memory the same pointer is valid on both sides, so no copy is needed).
// NOTE(review): `allocator` is expected to be a device allocator here —
// confirm against the call site in Executor::run_sync.
void migrate_to_gpu(Var* var, Allocator* allocator) {
// Allocate a device block, copy host->device, free the old host block,
// and transfer ownership of the new block to var.
Allocation a(allocator, var->size);
checkCudaErrors(cudaMemcpy(a.ptr, var->mem_ptr, var->size, cudaMemcpyHostToDevice));
var->allocator->free(var->mem_ptr, var->size, var->allocation);
var->mem_ptr = a.ptr;
var->allocation = a.allocation;
var->allocator = a.allocator;
// Prevent Allocation's destructor from freeing the buffer var now owns —
// presumably RAII on ptr; TODO confirm.
a.ptr = nullptr;
}
#endif
} // jittor

View File

@ -53,4 +53,9 @@ Allocator* get_allocator(bool temp_allocator=false);
// @pyjt(gc)
void gc_all();
#ifdef HAS_CUDA
void migrate_to_cpu(Var* var, Allocator* allocator);
void migrate_to_gpu(Var* var, Allocator* allocator);
#endif
} // jittor

View File

@ -115,16 +115,6 @@ struct DelayFree final : Allocator {
extern DelayFree delay_free;
// (Old inline variant, superseded by the out-of-line version in allocator.cc.)
// If the Var's block is parked in the DelayFree staging allocator, reassign
// ownership to `allocator` and let delay_free move the data back to CPU.
// Otherwise nothing is done — valid only under managed memory, where the
// existing pointer is already host-accessible.
inline void migrate_to_cpu(Var* var, Allocator* allocator) {
if (var->allocator == &delay_free) {
var->allocator = allocator;
delay_free.migrate_to_cpu(
var->mem_ptr, var->allocation, var->size, var->allocator
);
}
}
}
#endif