From ec2eef1fd9d4f400dae6288652f8dffe106f3fce Mon Sep 17 00:00:00 2001 From: Dun Liang Date: Wed, 16 Nov 2022 16:05:27 +0800 Subject: [PATCH] add profiler mark for easy range profiling --- python/jittor/__init__.py | 48 +++++++++++++++++- python/jittor/src/fused_op.cc | 6 ++- python/jittor/src/profiler/profiler.cc | 68 ++++++++++++++++++++++++-- python/jittor/src/profiler/profiler.h | 5 ++ python/jittor/test/test_profiler.py | 18 +++++++ 5 files changed, 138 insertions(+), 7 deletions(-) diff --git a/python/jittor/__init__.py b/python/jittor/__init__.py index a8772cdc..27bb1e50 100644 --- a/python/jittor/__init__.py +++ b/python/jittor/__init__.py @@ -9,7 +9,7 @@ # file 'LICENSE.txt', which is part of this source code package. # *************************************************************** -__version__ = '1.3.5.33' +__version__ = '1.3.5.34' from jittor_utils import lock with lock.lock_scope(): ori_int = int @@ -234,6 +234,52 @@ class profile_scope(_call_no_record_scope): self.report.extend(profiler.report()) self.fs.__exit__(*exc) + +class profile_mark(_call_no_record_scope): + def __init__(self, mark_name: str): + ''' profiler mark is used for profiling part of code, + + Example:: + + a = jt.rand(1000,1000) + b = jt.rand(1000,1000) + jt.sync_all() + results = [] + with jt.profile_scope() as rep: + results.append(jt.matmul(a, b)) + with jt.profile_mark("mark1"): + results.append(jt.matmul(a, b)) + with jt.profile_mark("mark2"): + results.append(jt.matmul(a, b)) + with jt.profile_mark("mark3"): + results.append(jt.matmul(a, b)) + results.append(jt.matmul(a, b)) + + Output:: + + Total time: 46.8ms + Total Memory Access: 57.2MB + [Mark mark3] time: 9ms + [Mark mark2] time: 8.28ms + [Mark mark1] time: 17.7ms + + ''' + self.mark_name = mark_name + def __enter__(self): + self.options = flags.compile_options + new_options = flags.compile_options + prev_marks = "_marks:" + for x in self.options: + if x.startswith(prev_marks): + prev_marks = x + del new_options[x] + new_marks = prev_marks + self.mark_name + ',' + new_options[new_marks] = 1 + flags.compile_options = new_options + + def __exit__(self, *exc): + flags.compile_options = self.options + class __single_process_scope: def __init__(self, rank=0): self.rank = rank diff --git a/python/jittor/src/fused_op.cc b/python/jittor/src/fused_op.cc index 1ab49167..49e52a8c 100644 --- a/python/jittor/src/fused_op.cc +++ b/python/jittor/src/fused_op.cc @@ -202,8 +202,10 @@ void FusedOp::do_jit_prepare(JK& jk) { } } jk << "«choices:"; - for (auto& kv : *loop_options) - jk << kv.first << ':' << kv.second << ','; + for (auto& kv : *loop_options) { + if (kv.first.size() && kv.first[0] != '_') + jk << kv.first << ':' << kv.second << ','; + } } jk.finilize(); } diff --git a/python/jittor/src/profiler/profiler.cc b/python/jittor/src/profiler/profiler.cc index fbf879cd..2181edc8 100644 --- a/python/jittor/src/profiler/profiler.cc +++ b/python/jittor/src/profiler/profiler.cc @@ -24,6 +24,7 @@ #include "profiler/memory_checker.h" #include "misc/deleter.h" #include "executor.h" +#include "utils/str_utils.h" namespace jittor { @@ -55,6 +56,7 @@ void Profiler::start(int64 warmup, int64 rerun) { if (rerun==0) rerun = profiler_rerun; profiler_enable = 1; profiler.records.clear(); + profiler.marks.clear(); profiler.warmup = warmup; profiler.rerun = rerun; profiler.relay_extra_cost = 0; @@ -197,6 +199,34 @@ struct RecordExtraCost { } }; +static string get_marks(Op* op, bool is_fused) { + loop_options_t* options = nullptr; + if (is_fused) { + auto* fop = (FusedOp*)op; + options = fop->loop_options; + } else { + if (op->outputs().size()) { + if (op->outputs().front()->loop_options) + options = &op->outputs().front()->loop_options.data(); + } + } + if (!options) return string(); + for (auto& kv : *options) { + if (startswith(kv.first, "_marks:")) { + return kv.first.substr(7); + } + } + return string(); +} + +static string origin_key(const string& s) { + if (s.size() && s[0]=='[') { + auto vs = split(s, "]"); + return vs[vs.size()-1]; + } + return s; +} + void Profiler::record_and_run( jit_op_entry_t jit_entry, Op* op, @@ -208,6 +238,18 @@ void Profiler::record_and_run( auto ikey=jit_key_mapper.find(jit_key); const char* key = ikey==jit_key_mapper.end() ? jit_key : ikey->second.c_str(); + bool is_fused = op->name() == string("fused"); + string marks = get_marks(op, is_fused); + string new_key; + if (marks.size()) { + // add marks into key, for better report + new_key = string("[marks:"); + new_key += marks; + new_key += "]"; + new_key += key; + key = new_key.c_str(); + } + auto iter = profiler.records.find(key); uint64_t in, out, compute; if (profiler.relay_fop) @@ -224,7 +266,6 @@ void Profiler::record_and_run( iter->second.stack_info = get_stack_info(op); } } - bool is_fused = op->name() == string("fused"); uint64* shape_time = nullptr; if (trace_py_var || profiler_record_shape) { @@ -306,6 +347,20 @@ void Profiler::record_and_run( total_ns = std::max((int64_t)1, total_ns-24); iter->second.update(rerun, total_ns, in, out, compute); if (shape_time) shape_time[0] += total_ns; + + // add markers record + if ((profiler.relay_fop == op || profiler.relay_fop == nullptr) + && marks.size()) { + // only record not relay op + auto vs = split(marks, ","); + for (auto& mark : vs) { + if (mark.size()) { + auto& mark_info = profiler.marks[mark]; + mark_info.count += 1; + mark_info.time_total += total_ns; + } + } + } RecordExtraCost rec(profiler.relay_fop && profiler.relay_fop != op); if (profiler_record_peek) @@ -313,7 +368,7 @@ void Profiler::record_and_run( LOGvvvv << "Duration" << total_ns >> "ns running" << op; if (is_fused && ((FusedOp*)op)->get_loop_option("check_cache")) { - auto fname = Op::get_filename_from_jit_key(key, ".so"); + auto fname = Op::get_filename_from_jit_key(origin_key(key), ".so"); unique_ptr* mc = load_memory_checker(fname); iter->second.cache_info.reset(new CacheInfo(mc)); } @@ -337,7 +392,7 @@ vector> Profiler::report(const string& sort_key) { for (auto& kv : profiler.records) { auto& kinfo = kv.second; names.push_back(kv.first); - fnames.push_back(Op::get_filename_from_jit_key(kv.first, ".cc")); + fnames.push_back(Op::get_filename_from_jit_key(origin_key(kv.first), ".cc")); if (kv.second.stack_info.size()) { fnames.back() += '\n'; fnames.back() += kv.second.stack_info.c_str(); @@ -420,6 +475,11 @@ vector> Profiler::report(const string& sort_key) { ss << "Total Memory Access:"; output_float(" KMG", 1024, "B", total_mem_access); ss << '\n'; + for (auto& mark : profiler.marks) { + ss << "[Mark " << mark.first << "] time:"; + output_float("num ", 1000, "s", mark.second.time_total); + ss << '\n'; + } double cum_time = 0; for (auto i : order) { auto& name = names[i]; @@ -491,7 +551,7 @@ vector> Profiler::report_cache(const string& sort_key) { if (!kv.second.cache_info) continue; names.push_back(kv.first); - fnames.push_back(Op::get_filename_from_jit_key(kv.first, ".cc")); + fnames.push_back(Op::get_filename_from_jit_key(origin_key(kv.first), ".cc")); CacheInfo& kinfo = *kv.second.cache_info; order.push_back(order.size()); vector one_info = {(double)kinfo.check_times, ((double)kinfo.tlb_miss_times) / kinfo.check_times}; diff --git a/python/jittor/src/profiler/profiler.h b/python/jittor/src/profiler/profiler.h index 99fc5fb2..a03a5308 100644 --- a/python/jittor/src/profiler/profiler.h +++ b/python/jittor/src/profiler/profiler.h @@ -61,6 +61,11 @@ struct Profiler { int64 relay_extra_cost; FusedOp* relay_fop; + struct MarkInfo { + uint64_t count, time_total; + }; + unordered_map marks; + ~Profiler(); }; diff --git a/python/jittor/test/test_profiler.py b/python/jittor/test/test_profiler.py index 6a09e97a..192c8d2b 100644 --- a/python/jittor/test/test_profiler.py +++ b/python/jittor/test/test_profiler.py @@ -20,5 +20,23 @@ class TestProfiler(unittest.TestCase): y = float(rep[-2][4]) assert abs(x-y)/x < 1e-3 + def test_marks(self): + a = jt.rand(1000,1000) + b = jt.rand(1000,1000) + jt.sync_all() + results = [] + with jt.profile_scope() as rep: + results.append(jt.matmul(a, b)) + with jt.profile_mark("mark1"): + results.append(jt.matmul(a, b)) + with jt.profile_mark("mark2"): + results.append(jt.matmul(a, b)) + with jt.profile_mark("mark3"): + results.append(jt.matmul(a, b)) + results.append(jt.matmul(a, b)) + jt.sync_all() + assert len(rep) == 6 + + if __name__ == "__main__": unittest.main() \ No newline at end of file