profiler polish

This commit is contained in:
Dun Liang 2022-02-26 16:14:52 +08:00
parent 7c7b856dcf
commit 7cf6165a10
8 changed files with 184 additions and 34 deletions

View File

@ -231,19 +231,19 @@ class profile_scope(_call_no_record_scope):
def __enter__(self):
assert not flags.profiler_enable
profiler.start(self.warmup, self.rerun)
self.report = []
try:
self.fs.__enter__()
profiler.start(self.warmup, self.rerun)
return self.report
except:
profiler.stop()
raise
def __exit__(self, *exc):
self.fs.__exit__(*exc)
profiler.stop()
self.report.extend(profiler.report())
self.fs.__exit__(*exc)
class __single_process_scope:
def __init__(self, rank=0):

View File

@ -22,6 +22,10 @@ struct Executor {
Allocator* temp_allocator;
bool last_is_cuda = false;
void run_sync(vector<Var*> vars, bool device_sync);
inline Allocation alloc_temp(size_t size) {
return Allocation(temp_allocator, size);
}
};
EXTERN_LIB Executor exe;

View File

@ -1064,13 +1064,22 @@ jit_op_entry_t OpCompiler::compile(const string& jit_key, const string& src) {
// add extra flags for custom ops
bool is_cuda = _op->flags.get(NodeFlags::_cuda);
auto op_info = get_op_info(_op->name());
return jit_compiler::compile(jit_key, src, is_cuda, op_info.extra_flags);
string extra_flags = op_info.extra_flags;
for (auto v : _op->outputs())
if (v->loop_options)
for (auto& kv : v->loop_options.data()) {
if (kv.second && startswith(kv.first, "FLAGS:"))
extra_flags += " " + kv.first.substr(6) + " ";
}
return jit_compiler::compile(jit_key, src, is_cuda, extra_flags);
}
jit_op_entry_t OpCompiler::do_compile(Op* op) {
jittor::lock_guard lg;
OpCompiler oc(op);
string* src = &oc.src;
for (auto op_type : op_types)
op_type->post_pass(&oc);
string src_after_passes;
// if is fused op
if (oc.op) {

View File

@ -23,6 +23,7 @@
#include "fused_op.h"
#include "profiler/memory_checker.h"
#include "misc/deleter.h"
#include "executor.h"
namespace jittor {
@ -30,6 +31,8 @@ Profiler profiler;
DEFINE_FLAG(int, profiler_warmup, 0, "Profiler warmup.");
DEFINE_FLAG(int, profiler_rerun, 0, "Profiler rerun.");
DEFINE_FLAG(int, profiler_record_peek, 0, "Profiler record peek mem bandwidth.");
DEFINE_FLAG(int, profiler_record_shape, 0, "Profiler record shape for op.");
DEFINE_FLAG(int, profiler_hide_relay, 0, "Profiler hide relayed op.");
DEFINE_FLAG_WITH_SETTER(int, profiler_enable, 0, "Enable profiler.");
@ -54,6 +57,8 @@ void Profiler::start(int64 warmup, int64 rerun) {
profiler.records.clear();
profiler.warmup = warmup;
profiler.rerun = rerun;
profiler.relay_extra_cost = 0;
profiler.relay_fop = 0;
}
void Profiler::stop() {
@ -138,6 +143,60 @@ static string get_stack_info(Op* op) {
}
}
static void stat_peek_bandwidth(uint64 in, uint64 out, uint64 loop, uint64& peek_time_total) {
auto size = (in+out) / 2;
// memcpy in some not aligned case will drop performance
size &= ~((1 << 12)-1);
// size = 7680000*4;
auto temp1 = exe.alloc_temp(size);
auto temp2 = exe.alloc_temp(size);
loop = 1 << loop;
int warmup = std::max(loop/8, (uint64)1);
for (int i=0; i<warmup; i++)
#ifdef HAS_CUDA
if (use_cuda)
cudaMemcpyAsync(temp1.ptr, temp2.ptr, size, cudaMemcpyDefault, 0);
else
#endif
std::memcpy(temp1.ptr, temp2.ptr, size);
#ifdef HAS_CUDA
if (use_cuda)
checkCudaErrors(cudaDeviceSynchronize());
#endif
auto start = std::chrono::high_resolution_clock::now();
for (int i=0; i<loop; i++)
#ifdef HAS_CUDA
if (use_cuda)
cudaMemcpyAsync(temp1.ptr, temp2.ptr, size, cudaMemcpyDefault, 0);
else
#endif
std::memcpy(temp1.ptr, temp2.ptr, size);
#ifdef HAS_CUDA
if (use_cuda)
checkCudaErrors(cudaDeviceSynchronize());
#endif
auto finish = std::chrono::high_resolution_clock::now();
auto total_ns = (int64_t)std::chrono::duration_cast<std::chrono::nanoseconds>(finish-start).count();
peek_time_total += total_ns;
}
struct RecordExtraCost {
int ck;
std::chrono::high_resolution_clock::time_point start;
RecordExtraCost(int ck) : ck(ck) {
if (!ck) return;
start = std::chrono::high_resolution_clock::now();
}
~RecordExtraCost() {
if (!ck) return;
auto finish = std::chrono::high_resolution_clock::now();
auto total_ns = (int64_t)std::chrono::duration_cast<std::chrono::nanoseconds>(finish-start).count();
profiler.relay_extra_cost += total_ns;
}
};
void Profiler::record_and_run(
jit_op_entry_t jit_entry,
Op* op,
@ -151,7 +210,10 @@ void Profiler::record_and_run(
jit_key : ikey->second.c_str();
auto iter = profiler.records.find(key);
uint64_t in, out, compute;
op->statistics(in, out, compute);
if (profiler.relay_fop)
profiler.relay_fop->statistics(in, out, compute);
else
op->statistics(in, out, compute);
if (iter == profiler.records.end()) {
profiler.records[key] = Info{
0, 0, -1ull, 0,
@ -165,7 +227,7 @@ void Profiler::record_and_run(
bool is_fused = op->name() == string("fused");
uint64* shape_time = nullptr;
if (trace_py_var) {
if (trace_py_var || profiler_record_shape) {
// record shape
NanoVector shape;
int64 num = 0;
@ -193,41 +255,62 @@ void Profiler::record_and_run(
iter->second.shapes[shape].second += 1;
shape_time = &iter->second.shapes[shape].first;
}
int loop = (is_fused &&
((FusedOp*)op)->get_loop_option("insert_profile_loop")) ? 10 : 0;
int64_t warmup = profiler.warmup ? std::max(profiler.warmup>>loop, (int64_t)1) : 0;
int64_t rerun = std::max((profiler.rerun+1)>>loop, (int64_t)1);
// prevent relayed op being rerun
auto warmup_bk = profiler.warmup;
auto rerun_bk = profiler.rerun;
profiler.warmup = profiler.rerun = 0;
Deleter del([&]() {
profiler.warmup = warmup_bk;
profiler.rerun = rerun_bk;
});
for (int64_t i=0; i<warmup; i++) {
jit_entry(op);
int64_t warmup = profiler.warmup;
int64_t rerun = profiler.rerun + 1;
rerun = std::max(NanoVector::get_nbits(rerun) - 2, 0);
int loop = 0;
Deleter _d;
if (is_fused) {
auto fop = ((FusedOp*)op);
if (fop->context && fop->context->entry) {
// relay op
loop = rerun;
profiler.relay_extra_cost = 0;
profiler.relay_fop = fop;
_d.del = [&]() {
profiler.relay_extra_cost = 0;
profiler.relay_fop = 0;
};
} else
loop = fop->get_loop_option("insert_profile_loop") ? 10 : 0;
}
int64 num = 1<<(rerun - loop);
{
profiler_enable = 0;
Deleter del([&]() { profiler_enable = 1;});
RecordExtraCost rec(profiler.relay_fop && profiler.relay_fop != op);
for (int64_t i=0; i<warmup; i++) {
jit_entry(op);
}
#ifdef HAS_CUDA
if (use_cuda)
checkCudaErrors(cudaDeviceSynchronize());
#endif
}
for (int64_t i=0; i<rerun; i++) {
auto start = std::chrono::high_resolution_clock::now();
auto start = std::chrono::high_resolution_clock::now();
for (int64_t i=0; i<num; i++) {
jit_entry(op);
#ifdef HAS_CUDA
if (use_cuda)
checkCudaErrors(cudaDeviceSynchronize());
#endif
auto finish = std::chrono::high_resolution_clock::now();
auto total_ns = (int64_t)std::chrono::duration_cast<std::chrono::nanoseconds>(finish-start).count();
// 24ns function call overhead
total_ns = std::max((int64_t)1, total_ns-24);
iter->second.update(loop, total_ns, in, out, compute);
if (shape_time) shape_time[0] += total_ns;
LOGvvvv << "Duration" << total_ns >> "ns running" << op;
}
#ifdef HAS_CUDA
if (use_cuda)
checkCudaErrors(cudaDeviceSynchronize());
#endif
auto finish = std::chrono::high_resolution_clock::now();
auto total_ns = (int64_t)std::chrono::duration_cast<std::chrono::nanoseconds>(finish-start).count();
if (profiler.relay_fop == op) {
total_ns -= profiler.relay_extra_cost;
}
// 24ns function call overhead
total_ns = std::max((int64_t)1, total_ns-24);
iter->second.update(rerun, total_ns, in, out, compute);
if (shape_time) shape_time[0] += total_ns;
RecordExtraCost rec(profiler.relay_fop && profiler.relay_fop != op);
if (profiler_record_peek)
stat_peek_bandwidth(in, out, rerun, iter->second.peek_time_total);
LOGvvvv << "Duration" << total_ns >> "ns running" << op;
if (is_fused &&
((FusedOp*)op)->get_loop_option("check_cache")) {
auto fname = Op::get_filename_from_jit_key(key, ".so");
@ -239,6 +322,8 @@ void Profiler::record_and_run(
vector<vector<string>> Profiler::report(const string& sort_key) {
vector<vector<string>> rep = {{"Name", "FileName", "Count", "TotalTime", "AvgTime", "MinTime", "MaxTime", "Input", "Output", "InOut", "Compute"}};
if (profiler_record_peek)
rep[0].push_back("Peek");
vector<string> names, fnames;
vector<vector<double>> info;
vector<int> order;
@ -295,6 +380,10 @@ vector<vector<string>> Profiler::report(const string& sort_key) {
(double)(kinfo.in_total+kinfo.out_total)*1e9 / kinfo.time_total, // InOut
(double)kinfo.compute_total*1e9 / kinfo.time_total, // Compute
});
if (profiler_record_peek)
info.back().push_back(
(double)(kinfo.in_total+kinfo.out_total)*1e9 / kinfo.peek_time_total // Peek
);
}
if (sort_key_id>=2)
std::sort(order.begin(), order.end(), [&](int i, int j) {
@ -363,7 +452,7 @@ vector<vector<string>> Profiler::report(const string& sort_key) {
<< std::setw(3)
<< std::setprecision(p) << cum_time / total_time * 100 << "%)";
}
} else if (j<=7) {
} else if (j<=7 || j==9) {
// output thoughtput
output_float(" KMG", 1024, "B/s", k);
} else {

View File

@ -24,6 +24,8 @@ struct Profiler {
uint64_t in_total, out_total;
// compute thoughtput in ops
uint64_t compute_total;
// peek time use memcopy
uint64_t peek_time_total;
// cache test info
unique_ptr<CacheInfo> cache_info;
cstr stack_info;
@ -56,6 +58,9 @@ struct Profiler {
int64_t warmup=0, rerun=0;
unordered_map<string, Info> records;
int64 relay_extra_cost;
FusedOp* relay_fop;
~Profiler();
};

View File

@ -61,4 +61,19 @@ string format(const string& s, const vector<string>& v) {
return ss;
}
string join(const vector<string>& vs, const string& x) {
string s;
for (int i=0; i<vs.size(); i++) {
s += vs[i];
if (i!=vs.size()-1)
s += x;
}
return s;
}
string replace(const string& a, const string& b, const string& c) {
auto vs = split(a, b);
return join(vs, c);
}
} // jittor

View File

@ -29,4 +29,8 @@ string strip(const string& s);
string format(const string& s, const vector<string>& v);
string replace(const string& a, const string& b, const string& c);
string join(const vector<string>& vs, const string& x);
} // jittor

View File

@ -0,0 +1,24 @@
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Maintainers: Dun Liang <randonlang@gmail.com>.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import unittest
import jittor as jt
import numpy as np
import os
class TestProfiler(unittest.TestCase):
def test_profiler(self):
a = jt.rand(1000,1000)
b = jt.rand(1000,1000)
jt.sync_all()
with jt.profile_scope(10, 100, profiler_record_peek=1) as rep:
jt.matmul(a, b).sync()
x = float(rep[-1][4])
y = float(rep[-2][4])
assert abs(x-y)/x < 1e-3
if __name__ == "__main__":
unittest.main()