JittorMirror/python/jittor/src/profiler/profiler.cc

554 lines
19 KiB
C++

// ***************************************************************
// Copyright (c) 2021 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#include <chrono>
#include <algorithm>
#include <sstream>
#include <iomanip>
#ifdef _WIN32
#include <windows.h>
#else
#include <dlfcn.h>
#endif
#ifdef HAS_CUDA
#include <cuda_runtime.h>
#include "helper_cuda.h"
#endif
#include "misc/cuda_flags.h"
#include "profiler/profiler.h"
#include "op.h"
#include "fused_op.h"
#include "profiler/memory_checker.h"
#include "misc/deleter.h"
#include "executor.h"
namespace jittor {
Profiler profiler;
DEFINE_FLAG(int, profiler_warmup, 0, "Profiler warmup.");
DEFINE_FLAG(int, profiler_rerun, 0, "Profiler rerun.");
DEFINE_FLAG(int, profiler_record_peek, 0, "Profiler record peek mem bandwidth.");
DEFINE_FLAG(int, profiler_record_shape, 0, "Profiler record shape for op.");
DEFINE_FLAG(int, profiler_hide_relay, 0, "Profiler hide relayed op.");
DEFINE_FLAG_WITH_SETTER(int, profiler_enable, 0, "Enable profiler.");
void setter_profiler_enable(int value) {
if (value)
Profiler::start();
else
Profiler::stop();
}
Profiler::~Profiler() {
if (profiler_enable) {
Profiler::stop();
Profiler::report();
}
}
void Profiler::start(int64 warmup, int64 rerun) {
if (warmup==0) warmup = profiler_warmup;
if (rerun==0) rerun = profiler_rerun;
profiler_enable = 1;
profiler.records.clear();
profiler.warmup = warmup;
profiler.rerun = rerun;
profiler.relay_extra_cost = 0;
profiler.relay_fop = 0;
}
void Profiler::stop() {
profiler_enable = 0;
}
unique_ptr<MemoryChecker>* load_memory_checker(string name) {
const char* msg = "";
LOGvv << "Opening jit lib:" << name;
#ifdef _WIN32
void* handle = (void*)LoadLibrary(name.c_str());
#elif defined(__linux__)
void* handle = dlopen(name.c_str(), RTLD_LAZY | RTLD_DEEPBIND | RTLD_LOCAL);
msg = dlerror();
#else
void* handle = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL);
msg = dlerror();
#endif
CHECK(handle) << "Cannot open library" << name << ":" << msg;
#ifdef _WIN32
auto mm = (unique_ptr<MemoryChecker>*)GetProcAddress((HINSTANCE)handle, "memory_checker");
#else
//dlerror();
auto mm = (unique_ptr<MemoryChecker>*)dlsym(handle, "memory_checker");
msg = dlerror();
#endif
CHECK(!msg) << "Loading symbol memory_checker from" << name << "failed:" << msg;
return mm;
}
EXTERN_LIB string _get_stack_info(Node* node);
static string get_stack_info(Op* op) {
string stack_info = "stack info:\n";
if (string("fused") == op->name()) {
auto fop = (FusedOp*)op;
map<string, int> stacks;
for (Op* op : fop->ops) {
stacks[_get_stack_info(op)] = 1;
}
for (auto& kv : stacks) {
stack_info += kv.first;
stack_info += '\n';
}
if (trace_py_var >= 3) {
std::stringstream ss;
ss << "input from:\n";
for (auto& vi : fop->vars) {
if (vi.type == 0) {
auto v = vi.var;
ss << v->shape << ',' << v->dtype() << ',' << v->name << ',';
if (v->input())
ss << v->input()->name_ex() << ',' << _get_stack_info(v->input());
else
ss << _get_stack_info(v);
ss << '\n';
}
}
stack_info += ss.str();
}
return stack_info;
} else {
stack_info += _get_stack_info(op);
stack_info += '\n';
if (trace_py_var >= 3) {
std::stringstream ss;
ss << "input from:\n";
for (auto v : op->inputs()) {
ss << v->shape << ',' << v->dtype() << ',' << v->name << ',';
if (v->input())
ss << v->input()->name_ex() << ',' << _get_stack_info(v->input());
else
ss << _get_stack_info(v);
ss << '\n';
}
stack_info += ss.str();
}
return stack_info;
}
}
static void stat_peek_bandwidth(uint64 in, uint64 out, uint64 loop, uint64& peek_time_total) {
auto size = (in+out) / 2;
// memcpy in some not aligned case will drop performance
size &= ~((1 << 12)-1);
// size = 7680000*4;
auto temp1 = exe.alloc_temp(size);
auto temp2 = exe.alloc_temp(size);
loop = 1 << loop;
int warmup = std::max(loop/8, (uint64)1);
for (int i=0; i<warmup; i++)
#ifdef HAS_CUDA
if (use_cuda)
cudaMemcpyAsync(temp1.ptr, temp2.ptr, size, cudaMemcpyDefault, 0);
else
#endif
std::memcpy(temp1.ptr, temp2.ptr, size);
#ifdef HAS_CUDA
if (use_cuda)
checkCudaErrors(cudaDeviceSynchronize());
#endif
auto start = std::chrono::high_resolution_clock::now();
for (int i=0; i<loop; i++)
#ifdef HAS_CUDA
if (use_cuda)
cudaMemcpyAsync(temp1.ptr, temp2.ptr, size, cudaMemcpyDefault, 0);
else
#endif
std::memcpy(temp1.ptr, temp2.ptr, size);
#ifdef HAS_CUDA
if (use_cuda)
checkCudaErrors(cudaDeviceSynchronize());
#endif
auto finish = std::chrono::high_resolution_clock::now();
auto total_ns = (int64_t)std::chrono::duration_cast<std::chrono::nanoseconds>(finish-start).count();
peek_time_total += total_ns;
}
struct RecordExtraCost {
int ck;
std::chrono::high_resolution_clock::time_point start;
RecordExtraCost(int ck) : ck(ck) {
if (!ck) return;
start = std::chrono::high_resolution_clock::now();
}
~RecordExtraCost() {
if (!ck) return;
auto finish = std::chrono::high_resolution_clock::now();
auto total_ns = (int64_t)std::chrono::duration_cast<std::chrono::nanoseconds>(finish-start).count();
profiler.relay_extra_cost += total_ns;
}
};
void Profiler::record_and_run(
jit_op_entry_t jit_entry,
Op* op,
const char* jit_key
) {
if (!profiler_enable)
jit_entry(op);
else {
auto ikey=jit_key_mapper.find(jit_key);
const char* key = ikey==jit_key_mapper.end() ?
jit_key : ikey->second.c_str();
auto iter = profiler.records.find(key);
uint64_t in, out, compute;
if (profiler.relay_fop)
profiler.relay_fop->statistics(in, out, compute);
else
op->statistics(in, out, compute);
if (iter == profiler.records.end()) {
profiler.records[key] = Info{
0, 0, -1ull, 0,
0, 0, 0
};
iter = profiler.records.find(key);
if (trace_py_var) {
iter->second.stack_info = get_stack_info(op);
}
}
bool is_fused = op->name() == string("fused");
uint64* shape_time = nullptr;
if (trace_py_var || profiler_record_shape) {
// record shape
NanoVector shape;
int64 num = 0;
Op** ops = &op;
int op_num = 1;
if (is_fused) {
ops = &(((FusedOp*)op)->ops[0]);
op_num = ((FusedOp*)op)->ops.size();
}
for (int i=0; i<op_num; i++) {
auto o = ops[i];
for (auto v : o->inputs()) {
if (v->num > num) {
num = v->num;
shape = v->shape;
}
}
for (auto v : o->outputs()) {
if (v->num > num) {
num = v->num;
shape = v->shape;
}
}
}
iter->second.shapes[shape].second += 1;
shape_time = &iter->second.shapes[shape].first;
}
int64_t warmup = profiler.warmup;
int64_t rerun = profiler.rerun + 1;
rerun = std::max(NanoVector::get_nbits(rerun) - 2, 0);
int loop = 0;
Deleter _d;
if (is_fused) {
auto fop = ((FusedOp*)op);
if (fop->context && fop->context->vrm.relay_groups.size()) {
// relay op
loop = rerun;
profiler.relay_extra_cost = 0;
profiler.relay_fop = fop;
_d.del = [&]() {
profiler.relay_extra_cost = 0;
profiler.relay_fop = 0;
};
} else
loop = fop->get_loop_option("insert_profile_loop") ? 10 : 0;
}
int64 num = 1<<(rerun - loop);
{
profiler_enable = 0;
Deleter del([&]() { profiler_enable = 1;});
RecordExtraCost rec(profiler.relay_fop && profiler.relay_fop != op);
for (int64_t i=0; i<warmup; i++) {
jit_entry(op);
}
#ifdef HAS_CUDA
if (use_cuda)
checkCudaErrors(cudaDeviceSynchronize());
#endif
}
auto start = std::chrono::high_resolution_clock::now();
for (int64_t i=0; i<num; i++) {
jit_entry(op);
}
#ifdef HAS_CUDA
if (use_cuda)
checkCudaErrors(cudaDeviceSynchronize());
#endif
auto finish = std::chrono::high_resolution_clock::now();
auto total_ns = (int64_t)std::chrono::duration_cast<std::chrono::nanoseconds>(finish-start).count();
if (profiler.relay_fop == op) {
total_ns -= profiler.relay_extra_cost;
}
// 24ns function call overhead
total_ns = std::max((int64_t)1, total_ns-24);
iter->second.update(rerun, total_ns, in, out, compute);
if (shape_time) shape_time[0] += total_ns;
RecordExtraCost rec(profiler.relay_fop && profiler.relay_fop != op);
if (profiler_record_peek)
stat_peek_bandwidth(in, out, rerun, iter->second.peek_time_total);
LOGvvvv << "Duration" << total_ns >> "ns running" << op;
if (is_fused &&
((FusedOp*)op)->get_loop_option("check_cache")) {
auto fname = Op::get_filename_from_jit_key(key, ".so");
unique_ptr<MemoryChecker>* mc = load_memory_checker(fname);
iter->second.cache_info.reset(new CacheInfo(mc));
}
}
}
vector<vector<string>> Profiler::report(const string& sort_key) {
vector<vector<string>> rep = {{"Name", "FileName", "Count", "TotalTime", "AvgTime", "MinTime", "MaxTime", "Input", "Output", "InOut", "Compute"}};
if (profiler_record_peek)
rep[0].push_back("Peek");
vector<string> names, fnames;
vector<vector<double>> info;
vector<int> order;
int sort_key_id = 0;
for (; sort_key_id<(int)rep[0].size(); sort_key_id++)
if (rep[0][sort_key_id] == sort_key)
break;
ASSERT(sort_key_id<(int)rep[0].size()) << "Key not supported:" << sort_key;
double total_time = 0;
double total_mem_access = 0;
for (auto& kv : profiler.records) {
auto& kinfo = kv.second;
names.push_back(kv.first);
fnames.push_back(Op::get_filename_from_jit_key(kv.first, ".cc"));
if (kv.second.stack_info.size()) {
fnames.back() += '\n';
fnames.back() += kv.second.stack_info.c_str();
}
if (kv.second.shapes.size()) {
// show shapes
vector<pair<pair<uint64,uint64>,NanoVector>> shapes;
shapes.reserve(kv.second.shapes.size());
for (auto& kv2 : kv.second.shapes) {
shapes.push_back(std::make_pair(kv2.second, kv2.first));
}
std::sort(shapes.begin(), shapes.end());
std::stringstream ss;
ss << "shapes:\n";
for (int i=0; i<10; i++) {
if (i>=shapes.size()) break;
auto& sp = shapes[shapes.size() - i - 1];
auto rate = sp.first.first * 100.0 / kinfo.time_total;
ss << sp.second << ':' << sp.first.second <<
"("<< std::setprecision(3) << rate << "%), ";
}
if (shapes.size()>10)
ss << "... total " << shapes.size() << '\n';
fnames.back() += ss.str();
}
order.push_back(order.size());
// do not count relay op time
if (kv.first.find("relay") == string::npos) {
total_time += kinfo.time_total;
total_mem_access += kinfo.in_total + kinfo.out_total;
}
info.push_back({
(double)kinfo.count, // Count
(double)kinfo.time_total, // TotalTime
(double)kinfo.time_total*1.0 / kinfo.count, // AvgTime
(double)kinfo.time_min, // MinTime
(double)kinfo.time_max, // MaxTime
(double)kinfo.in_total*1e9 / kinfo.time_total, // Input
(double)kinfo.out_total*1e9 / kinfo.time_total, // Output
(double)(kinfo.in_total+kinfo.out_total)*1e9 / kinfo.time_total, // InOut
(double)kinfo.compute_total*1e9 / kinfo.time_total, // Compute
});
if (profiler_record_peek)
info.back().push_back(
(double)(kinfo.in_total+kinfo.out_total)*1e9 / kinfo.peek_time_total // Peek
);
}
if (sort_key_id>=2)
std::sort(order.begin(), order.end(), [&](int i, int j) {
return info[i][sort_key_id-2] > info[j][sort_key_id-2];
});
else
std::sort(order.begin(), order.end(), [&](int i, int j) {
return names[i] > names[j];
});
std::stringstream ss;
ss << "Profile result, sorted by " << sort_key << "\n"
<< "('it/s' represent number of iterations per sec)\n";
uint w = 10, p=3;
for (auto& s : rep[0]) {
ss << std::setw(w) << s;
if (s == "TotalTime")
ss << std::setw(w) << "%,cum%";
}
ss << '\n';
auto output_float = [&](const string& scale, int base, const string& suffix, double k) {
ss << ' ' << std::setw(w-2-suffix.size());
ss << std::setprecision(p);
uint i=0;
for (; i+1<scale.size(); i++) {
if (k<base) break;
k /= base;
}
ss << k << scale[i];
ss << suffix;
};
ss << "Total time:";
output_float("num ", 1000, "s", total_time);
ss << '\n';
ss << "Total Memory Access:";
output_float(" KMG", 1024, "B", total_mem_access);
ss << '\n';
double cum_time = 0;
for (auto i : order) {
auto& name = names[i];
auto is_relay = name.find("relay") != string::npos;
if (is_relay && profiler_hide_relay)
continue;
auto& fname = fnames[i];
rep.push_back({name, fname});
ss << std::setw(w) << name;
if (name.size() >= w-1)
ss << "\n" << std::setw(w) << " ";
ss << std::setw(w) << fname;
if (fname.size() >= w-1)
ss << "\n" << std::setw(w*2) << " ";
for (uint j=0; j<info[i].size(); j++) {
auto k = info[i][j];
if (j==0)
ss << ' ' << std::setw(w-1) << k;
else if (j<=4) {
// output time
output_float("num ", 1000, "s", k);
// output total ratio
if (j == 1) {
// do not count relay op time
if (is_relay)
k = 0;
cum_time += k;
ss << '(' << std::setw(3)
<< std::setprecision(p) << k / total_time * 100 << "%,"
<< std::setw(3)
<< std::setprecision(p) << cum_time / total_time * 100 << "%)";
}
} else if (j<=7 || j==9) {
// output thoughtput
output_float(" KMG", 1024, "B/s", k);
} else {
// computation thoughtput
output_float(" KMG", 1000, "it/s", k);
}
std::stringstream s2;
s2 << k;
rep.back().push_back(s2.str());
}
ss << '\n';
}
LOGi >> '\n' >> ss.str() >> '\n';
//cache rep
// TODO: report_cache sort_key
vector<vector<string>> rep_cache = report_cache("CheckTimes");
if (rep_cache.size() > 1)
rep.insert(rep.end(), rep_cache.begin(), rep_cache.end());
return rep;
}
vector<vector<string>> Profiler::report_cache(const string& sort_key) {
vector<vector<string>> rep = {{"Name", "FileName", "CheckTimes", "TLBMissRate"}};
vector<string> names, fnames;
vector<vector<double>> info;
vector<vector<int>> int_info;
vector<int> order;
int sort_key_id = 0;
for (; sort_key_id<(int)rep[0].size(); sort_key_id++)
if (rep[0][sort_key_id] == sort_key)
break;
ASSERT(sort_key_id<(int)rep[0].size()) << "Key not supported:" << sort_key;
sort_key_id--;
for (auto& kv : profiler.records) {
if (!kv.second.cache_info)
continue;
names.push_back(kv.first);
fnames.push_back(Op::get_filename_from_jit_key(kv.first, ".cc"));
CacheInfo& kinfo = *kv.second.cache_info;
order.push_back(order.size());
vector<double> one_info = {(double)kinfo.check_times, ((double)kinfo.tlb_miss_times) / kinfo.check_times};
vector<int> one_int_info = {(int)kinfo.check_times, (int)kinfo.tlb_miss_times};
for (int i = 0; i < (int)kinfo.cache_miss_times.size(); ++i) {
if ((int)rep[0].size() < 4 + i + 1) {
std::stringstream ss;
ss << "L" << i + 1 << "MissRate";
rep[0].push_back(ss.str());
}
one_info.push_back(((double)kinfo.cache_miss_times[i]) / kinfo.check_times);
one_int_info.push_back((int)kinfo.cache_miss_times[i]);
}
info.push_back(one_info);
int_info.push_back(one_int_info);
}
if (sort_key_id>0)
std::sort(order.begin(), order.end(), [&](int i, int j) {
return info[i][sort_key_id-1] > info[j][sort_key_id-1];
});
else
std::sort(order.begin(), order.end(), [&](int i, int j) {
return names[i] > names[j];
});
std::stringstream ss;
ss << "Memory profile result, sorted by " << sort_key << "\n";
uint w = 15;
for (auto& s : rep[0])
ss << std::setw(w) << s;
ss << '\n';
for (auto i : order) {
auto& name = names[i];
auto& fname = fnames[i];
rep.push_back({name, fname});
ss << std::setw(w) << name;
if (name.size() >= w-1)
ss << "\n" << std::setw(w) << " ";
ss << std::setw(w) << fname;
if (fname.size() >= w-1)
ss << "\n" << std::setw(w*2) << " ";
for (uint j=0; j<info[i].size(); j++) {
auto k = info[i][j];
auto int_k = int_info[i][j];
if (j == 0)
ss << ' ' << std::setw(w-1) << (int)k;
else
ss << ' ' << std::setw(w-2) << k * 100 << "%";
std::stringstream s2;
s2 << int_k;
rep.back().push_back(s2.str());
}
ss << '\n';
}
LOGi >> '\n' >> ss.str() >> '\n';
return rep;
}
} // jittor