Add visualization of multi-threaded waiting time with verilator_gantt (#5929)

Bartłomiej Chmiel 2025-04-10 16:03:58 +02:00 committed by GitHub
parent fc8e1b5a2e
commit 84f26ed3d7
12 changed files with 156 additions and 69 deletions

View File

@@ -26,7 +26,9 @@ Global = {
}
ElapsedTime = None  # total elapsed time
ExecGraphTime = 0  # total elapsed time executing an exec graph
WaitingTime = 0  # total elapsed time waiting for mtasks
ExecGraphIntervals = []  # list of (start, end) pairs
ThreadScheduleWaitIntervals = []  # list of (start, end, cpu) tuples
######################################################################
@@ -37,6 +39,7 @@ def read_data(filename):
    re_record = re.compile(r'^VLPROFEXEC (\S+) (\d+)(.*)$')
    re_payload_mtaskBegin = re.compile(r'id (\d+) predictStart (\d+) cpu (\d+)')
    re_payload_mtaskEnd = re.compile(r'id (\d+) predictCost (\d+)')
    re_payload_wait = re.compile(r'cpu (\d+)')

    re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.]*)\s*')
    re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.]*)\s*$')
@@ -49,9 +52,11 @@ def read_data(filename):
    global LongestVcdStrValueLength
    global ExecGraphTime
    global WaitingTime

    ExecGraphStack = []
    SectionStack = []
    ThreadScheduleWait = collections.defaultdict(list)
    mTaskThread = {}

    for line in fh:
@@ -96,6 +101,14 @@ def read_data(filename):
                Mtasks[mtask]['elapsed'] += tick - begin
                Mtasks[mtask]['predict_cost'] = predict_cost
                Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], tick)
            elif kind == "THREAD_SCHEDULE_WAIT_BEGIN":
                ecpu = int(re_payload_wait.match(payload).groups()[0])
                ThreadScheduleWait[ecpu].append(tick)
            elif kind == "THREAD_SCHEDULE_WAIT_END":
                ecpu = int(re_payload_wait.match(payload).groups()[0])
                start = ThreadScheduleWait[ecpu].pop()
                WaitingTime += tick - start
                ThreadScheduleWaitIntervals.append((start, tick, ecpu))
            elif kind == "EXEC_GRAPH_BEGIN":
                ExecGraphStack.append(tick)
            elif kind == "EXEC_GRAPH_END":
@@ -169,6 +182,7 @@ def report():
    print("\nSummary:")
    print(" Total elapsed time = {} rdtsc ticks".format(ElapsedTime))
    print(" Parallelized code = {:.2%} of elapsed time".format(ExecGraphTime / ElapsedTime))
    print(" Waiting time = {:.2%} of elapsed time".format(WaitingTime / ElapsedTime))
    print(" Total threads = %d" % nthreads)
    print(" Total CPUs used = %d" % ncpus)
    print(" Total mtasks = %d" % len(Mtasks))
@@ -422,6 +436,9 @@ def write_vcd(filename):
            val = buf.ljust(LongestVcdStrValueLength * 8, "0")
            values[time][code] = val

    def addXValue(code, time):
        values[time][code] = 'x'

    # Measured graph
    for thread in sorted(Threads):
        mcode = getCode(32, 'measured', 't%d_mtask' % thread)
@@ -446,6 +463,12 @@ def write_vcd(filename):
    tStart = sorted(_['start'] for records in Threads.values() for _ in records)
    tEnd = sorted(_['end'] for records in Threads.values() for _ in records)

    # Measured waiting time
    for (start, end, cpu) in ThreadScheduleWaitIntervals:
        mcode = getCode(32, 'measured', 'cpu%d_waiting' % cpu)
        addXValue(mcode, start)
        addValue(mcode, end, None)

    # Predicted graph
    for start, end in ExecGraphIntervals:
        # Find the earliest MTask start after the start point, and the
@@ -532,6 +555,8 @@ def write_vcd(filename):
                value = values[time][code]
                if value is None:
                    fh.write("bz v%x\n" % code)
                elif value == 'x':
                    fh.write("bx v%x\n" % code)
                elif isinstance(value, str):
                    fh.write("b%s v%x\n" % (value, code))
                else:
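Taken together, the write_vcd() changes render each waiting interval as an all-X value from its begin tick to its end tick, and high-Z otherwise. A minimal sketch of that encoding, simplified from the hunks above (the interval and the 0x6 code match the cpu19_waiting signal in the expected VCD later in this commit):

import collections

values = collections.defaultdict(dict)  # time -> {code: value}

def addValue(code, time, value):
    values[time][code] = value

def addXValue(code, time):
    values[time][code] = 'x'

# One measured waiting interval: X from its start tick, released at its end tick
start, end, code = 20000, 21000, 0x6
addXValue(code, start)
addValue(code, end, None)

for time in sorted(values):
    print("#%d" % time)
    for c, value in values[time].items():
        if value is None:
            print("bz v%x" % c)  # released: no longer waiting
        elif value == 'x':
            print("bx v%x" % c)  # waiting on an mtask dependency
# Output:
# #20000
# bx v6
# #21000
# bz v6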

View File

@@ -51,6 +51,10 @@ predicted_parallelism
cpu#_thread
  For the given CPU number, the thread number measured to be executing.

cpu#_waiting
  For the given CPU number, the aggregated time spent waiting for mtask
  dependencies, visualized as `X` values.

mtask#_cpu
  For the given mtask id, the CPU it was measured to execute on.
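If the aggregate is needed outside the Summary report, it can also be recovered from the generated VCD by summing the spans where a cpu#_waiting signal sits at X. A hypothetical helper, not part of verilator_gantt, assuming the simple "#<time>" / "b<value> v<code>" layout the script writes:

def waiting_ticks(vcd_path, code):
    """Sum the ticks a given cpu#_waiting signal (e.g. code 'v6') spends at X."""
    total, time, wait_start = 0, 0, None
    with open(vcd_path) as fh:
        for line in fh:
            line = line.strip()
            if line.startswith("#"):
                time = int(line[1:])            # timestamp in rdtsc ticks
            elif line == "bx " + code:
                wait_start = time               # waiting interval begins
            elif line.endswith(" " + code) and wait_start is not None:
                total += time - wait_start      # any other value ends the interval
                wait_start = None
    return total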

View File

@@ -199,6 +199,12 @@ void VlExecutionProfiler::dump(const char* filenamep, uint64_t tickEnd)
            fprintf(fp, " id %u predictCost %u\n", payload.m_id, payload.m_predictCost);
            break;
        }
        case VlExecutionRecord::Type::THREAD_SCHEDULE_WAIT_BEGIN:
        case VlExecutionRecord::Type::THREAD_SCHEDULE_WAIT_END: {
            const auto& payload = er.m_payload.threadScheduleWait;
            fprintf(fp, " cpu %u\n", payload.m_cpu);
            break;
        }
        case VlExecutionRecord::Type::SECTION_PUSH: {
            const auto& payload = er.m_payload.sectionPush;
            fprintf(fp, " %s\n", payload.m_name);

View File

@@ -62,6 +62,8 @@ VL_ATTR_ALWINLINE QData VL_CPU_TICK() {
    _VL_FOREACH_APPLY(macro, SECTION_POP) \
    _VL_FOREACH_APPLY(macro, MTASK_BEGIN) \
    _VL_FOREACH_APPLY(macro, MTASK_END) \
    _VL_FOREACH_APPLY(macro, THREAD_SCHEDULE_WAIT_BEGIN) \
    _VL_FOREACH_APPLY(macro, THREAD_SCHEDULE_WAIT_END) \
    _VL_FOREACH_APPLY(macro, EXEC_GRAPH_BEGIN) \
    _VL_FOREACH_APPLY(macro, EXEC_GRAPH_END)
// clang-format on
@@ -95,6 +97,9 @@ class VlExecutionRecord final {
        struct {
            uint32_t m_id;  // MTask id
            uint32_t m_predictCost;  // How long scheduler predicted would take
        } mtaskEnd;
        struct {
            uint32_t m_cpu;  // Executing CPU id
        } threadScheduleWait;
    };

    // STATE
@@ -126,6 +131,14 @@ public:
        m_payload.mtaskEnd.m_predictCost = predictCost;
        m_type = Type::MTASK_END;
    }
    void threadScheduleWaitBegin() {
        m_payload.threadScheduleWait.m_cpu = VlOs::getcpu();
        m_type = Type::THREAD_SCHEDULE_WAIT_BEGIN;
    }
    void threadScheduleWaitEnd() {
        m_payload.threadScheduleWait.m_cpu = VlOs::getcpu();
        m_type = Type::THREAD_SCHEDULE_WAIT_END;
    }
    void execGraphBegin() { m_type = Type::EXEC_GRAPH_BEGIN; }
    void execGraphEnd() { m_type = Type::EXEC_GRAPH_END; }
};

View File

@@ -726,7 +726,13 @@ void addMTaskToFunction(const ThreadSchedule& schedule, const uint32_t threadId,
        varp->protect(false);  // Do not protect as we still have references in AstText
        modp->addStmtsp(varp);
        // For now, reference is still via text bashing
        if (v3Global.opt.profExec()) {
            addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).threadScheduleWaitBegin();\n");
        }
        addStrStmt("vlSelf->" + name + ".waitUntilUpstreamDone(even_cycle);\n");
        if (v3Global.opt.profExec()) {
            addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).threadScheduleWaitEnd();\n");
        }
    }

    if (v3Global.opt.profPgo()) {
@@ -878,8 +884,14 @@ void addThreadStartToExecGraph(AstExecGraph* const execGraphp,
    }
    V3Stats::addStatSum("Optimizations, Thread schedule total tasks", i);

    if (v3Global.opt.profExec()) {
        addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).threadScheduleWaitBegin();\n");
    }
    addStrStmt("vlSelf->__Vm_mtaskstate_final__" + std::to_string(scheduleId) + tag
               + ".waitUntilUpstreamDone(vlSymsp->__Vm_even_cycle__" + tag + ");\n");
    if (v3Global.opt.profExec()) {
        addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).threadScheduleWaitEnd();\n");
    }
    // Free all assigned worker indices in this section
    if (!v3Global.opt.hierBlocks().empty() && last > 0) {
        addStrStmt("vlSymsp->__Vm_threadPoolp->freeWorkerIndexes(indexes);\n");

View File

@@ -910,6 +910,8 @@ VLPROFEXEC EXEC_GRAPH_END 12180
VLPROFEXEC EXEC_GRAPH_BEGIN 14000
VLPROFEXEC MTASK_BEGIN 15610 id 6 predictStart 0 cpu 19
VLPROFEXEC MTASK_END 15820 id 6 predictCost 30
VLPROFEXEC THREAD_SCHEDULE_WAIT_BEGIN 20000 cpu 19
VLPROFEXEC THREAD_SCHEDULE_WAIT_END 21000 cpu 19
VLPROFEXEC MTASK_BEGIN 21700 id 10 predictStart 196 cpu 19
VLPROFEXEC MTASK_END 21875 id 10 predictCost 30
VLPROFEXEC EXEC_GRAPH_END 22085
@@ -934,4 +936,6 @@ VLPROFEXEC MTASK_BEGIN 20650 id 9 predictStart 167 cpu 10
VLPROFEXEC MTASK_END 20720 id 9 predictCost 30
VLPROFEXEC MTASK_BEGIN 21140 id 11 predictStart 197 cpu 10
VLPROFEXEC MTASK_END 21245 id 11 predictCost 30
VLPROFEXEC THREAD_SCHEDULE_WAIT_BEGIN 22000 cpu 10
VLPROFEXEC THREAD_SCHEDULE_WAIT_END 23000 cpu 10
VLPROF stat ticks 23415

View File

@@ -7,6 +7,7 @@ Argument settings:
Summary:
Total elapsed time = 23415 rdtsc ticks
Parallelized code = 82.51% of elapsed time
Waiting time = 8.54% of elapsed time
Total threads = 2
Total CPUs used = 2
Total mtasks = 7

View File

@@ -5,18 +5,20 @@ $timescale 1ns $end
$scope module measured $end
$var wire 32 v5 cpu10_mtask [31:0] $end
$var wire 32 v4 cpu10_thread [31:0] $end
$var wire 32 v7 cpu10_waiting [31:0] $end
$var wire 32 v2 cpu19_mtask [31:0] $end
$var wire 32 v1 cpu19_thread [31:0] $end
$var wire 32 v6 cpu19_waiting [31:0] $end
$var wire 32 v0 t0_mtask [31:0] $end
$var wire 32 v3 t1_mtask [31:0] $end
$upscope $end
$scope module predicted $end
$var wire 32 v6 t0_mtask [31:0] $end
$var wire 32 v7 t1_mtask [31:0] $end
$var wire 32 v8 t0_mtask [31:0] $end
$var wire 32 v9 t1_mtask [31:0] $end
$upscope $end
$scope module stats $end
$var wire 32 v8 measured_parallelism [31:0] $end
$var wire 32 v9 predicted_parallelism [31:0] $end
$var wire 32 va measured_parallelism [31:0] $end
$var wire 32 vb predicted_parallelism [31:0] $end
$upscope $end
$upscope $end
$enddefinitions $end
@@ -30,193 +32,203 @@ bz v4
bz v5
bz v6
bz v7
b0 v8
b0 v9
bz v8
bz v9
b0 va
b0 vb
#2695
b110 v0
b0 v1
b110 v2
b110 v6
b101 v7
b1 v8
b10 v9
b110 v8
b101 v9
b1 va
b10 vb
#2905
bz v0
bz v1
bz v2
b0 v8
b0 va
#3800
bz v6
b111 v7
b1 v9
bz v8
b111 v9
b1 vb
#4906
b1000 v7
b1 v9
b1000 v9
b1 vb
#5495
b101 v3
b1 v4
b101 v5
b1 v8
b1 va
#6090
bz v3
bz v4
bz v5
b0 v8
b0 va
#6300
b111 v3
b1 v4
b111 v5
b1 v8
b1 va
#6895
bz v3
bz v4
bz v5
b0 v8
b0 va
#7490
b1000 v3
b1 v4
b1000 v5
b1 v8
b1 va
#8540
bz v3
bz v4
bz v5
b0 v8
b0 va
#8848
b1001 v7
b1 v9
b1001 v9
b1 vb
#9135
b1001 v3
b1 v4
b1001 v5
b1 v8
b1 va
#9695
b1010 v0
b0 v1
b1010 v2
b10 v8
b10 va
#9730
bz v3
bz v4
bz v5
b1 v8
b1 va
#9870
bz v0
bz v1
bz v2
b0 v8
b0 va
#9917
b1010 v6
b10 v9
b1010 v8
b10 vb
#9954
b1011 v7
b10 v9
b1011 v9
b10 vb
#10255
b1011 v3
b1 v4
b1011 v5
b1 v8
b1 va
#11023
bz v6
b1 v9
bz v8
b1 vb
#11060
bz v3
bz v4
bz v5
bz v7
b0 v8
b0 v9
bz v9
b0 va
b0 vb
#15610
b110 v0
b0 v1
b110 v2
b110 v6
b101 v7
b1 v8
b10 v9
b110 v8
b101 v9
b1 va
b10 vb
#15820
bz v0
bz v1
bz v2
b0 v8
b0 va
#16437
bz v6
b111 v7
b1 v9
bz v8
b111 v9
b1 vb
#17265
b1000 v7
b1 v9
b1000 v9
b1 vb
#18375
b101 v3
b1 v4
b101 v5
b1 v8
b1 va
#18970
bz v3
bz v4
bz v5
b0 v8
b0 va
#19145
b111 v3
b1 v4
b111 v5
b1 v8
b1 va
#19320
bz v3
bz v4
bz v5
b0 v8
b0 va
#19670
b1000 v3
b1 v4
b1000 v5
b1 v8
b1 va
#19810
bz v3
bz v4
bz v5
b0 v8
b0 va
#20000
bx v6
#20219
b1001 v7
b1 v9
b1001 v9
b1 vb
#20650
b1001 v3
b1 v4
b1001 v5
b1 v8
b1 va
#20720
bz v3
bz v4
bz v5
b0 v8
b0 va
#21000
bz v6
#21019
b1010 v6
b10 v9
b1010 v8
b10 vb
#21047
b1011 v7
b10 v9
b1011 v9
b10 vb
#21140
b1011 v3
b1 v4
b1011 v5
b1 v8
b1 va
#21245
bz v3
bz v4
bz v5
b0 v8
b0 va
#21700
b1010 v0
b0 v1
b1010 v2
b1 v8
b1 va
#21847
bz v6
b1 v9
bz v8
b1 vb
#21875
bz v0
bz v1
bz v2
bz v9
b0 va
b0 vb
#22000
bx v7
#23000
bz v7
b0 v8
b0 v9

View File

@@ -50,6 +50,8 @@ VLPROFEXEC MTASK_BEGIN 90465 id 85 predictStart 14315 cpu 2
VLPROFEXEC MTASK_END 155034 id 85 predictCost 30533
VLPROFEXEC MTASK_BEGIN 156555 id 79 predictStart 44848 cpu 2
VLPROFEXEC MTASK_END 294309 id 79 predictCost 48001
VLPROFEXEC THREAD_SCHEDULE_WAIT_BEGIN 295000 cpu 2
VLPROFEXEC THREAD_SCHEDULE_WAIT_END 296000 cpu 2
VLPROFEXEC EXEC_GRAPH_END 300000
VLPROFTHREAD 1
VLPROFEXEC MTASK_BEGIN 77352 id 90 predictStart 14315 cpu 3
@@ -58,4 +60,6 @@ VLPROFEXEC MTASK_BEGIN 79799 id 81 predictStart 35907 cpu 3
VLPROFEXEC MTASK_END 80667 id 81 predictCost 29215
VLPROFEXEC MTASK_BEGIN 81746 id 87 predictStart 65147 cpu 3
VLPROFEXEC MTASK_END 82633 id 87 predictCost 33809
VLPROFEXEC THREAD_SCHEDULE_WAIT_BEGIN 83000 cpu 3
VLPROFEXEC THREAD_SCHEDULE_WAIT_END 84000 cpu 3
VLPROF stat ticks 300000

View File

@@ -7,6 +7,7 @@ Argument settings:
Summary:
Total elapsed time = 300000 rdtsc ticks
Parallelized code = 80.49% of elapsed time
Waiting time = 0.67% of elapsed time
Total threads = 2
Total CPUs used = 2
Total mtasks = 5

View File

@@ -13,6 +13,8 @@ VLPROFEXEC EXEC_GRAPH_END 12180
VLPROFEXEC EXEC_GRAPH_BEGIN 14000
VLPROFEXEC MTASK_BEGIN 15610 id 6 predictStart 0 cpu 19
VLPROFEXEC MTASK_END 15820 id 6 predictCost 30
VLPROFEXEC THREAD_SCHEDULE_WAIT_BEGIN 16000 cpu 19
VLPROFEXEC THREAD_SCHEDULE_WAIT_END 17000 cpu 19
VLPROFEXEC MTASK_BEGIN 21700 id 10 predictStart 196 cpu 19
VLPROFEXEC MTASK_END 21875 id 10 predictCost 30
VLPROFEXEC EXEC_GRAPH_END 22085
@@ -27,6 +29,8 @@ VLPROFEXEC MTASK_BEGIN 9135 id 9 predictStart 167 cpu 10
VLPROFEXEC MTASK_END 9730 id 9 predictCost 30
VLPROFEXEC MTASK_BEGIN 10255 id 11 predictStart 197 cpu 10
VLPROFEXEC MTASK_END 11060 id 11 predictCost 30
VLPROFEXEC THREAD_SCHEDULE_WAIT_BEGIN 17000 cpu 10
VLPROFEXEC THREAD_SCHEDULE_WAIT_END 18000 cpu 10
VLPROFEXEC MTASK_BEGIN 18375 id 5 predictStart 0 cpu 10
VLPROFEXEC MTASK_END 18970 id 5 predictCost 30
VLPROFEXEC MTASK_BEGIN 19145 id 7 predictStart 30 cpu 10

View File

@@ -7,6 +7,7 @@ Argument settings:
Summary:
Total elapsed time = 23415 rdtsc ticks
Parallelized code = 82.51% of elapsed time
Waiting time = 8.54% of elapsed time
Total threads = 2
Total CPUs used = 2
Total mtasks = 7