Add __launch_bounds__ to generated CUDA kernels

This commit is contained in:
Dun Liang 2020-07-23 13:28:01 +08:00
parent 8bb698c225
commit fb873bc50e
2 changed files with 2 additions and 2 deletions

View File

@@ -7,7 +7,7 @@
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
__version__ = '1.1.6.4'
__version__ = '1.1.6.5'
from . import lock
with lock.lock_scope():
from . import compiler

View File

@@ -292,7 +292,7 @@ void ParallelPass::run() {
&new_func_call->before
);
} else {
new_func_def->get_attr("dtype") = "__global__ void";
new_func_def->get_attr("dtype") = "__launch_bounds__("+S(cuda_thread_num)+") __global__ void";
new_tid_def.push_front("int thread_id = blockIdx.x * blockDim.x + threadIdx.x;");
// cuda kernel launch
auto& code = func_call_code;