Add numactl-like automatic assignment of processor affinity (#5911)
This commit is contained in:
parent
cd5997a2e6
commit
6d1e82b908
1
Changes
1
Changes
|
@ -27,6 +27,7 @@ Verilator 5.035 devel
|
|||
* Add `--make json` to enable integration with non-make/cmake build systems (#5799). [Andrew Voznytsa]
|
||||
* Add empty veriuser.h for legacy compatibility.
|
||||
* Add DEPRECATED warning on `--xml-only` and `--xml-output`.
|
||||
* Add numactl-like automatic assignment of processor affinity.
|
||||
* Remove unused gtkwave/wavealloca.h. [Geza Lore]
|
||||
* Optimize automatic splitting of some packed variables (#5843). [Geza Lore]
|
||||
* Optimize trigger vector in whole words (#5857). [Geza Lore]
|
||||
|
|
|
@ -16,7 +16,14 @@ LongestVcdStrValueLength = 0
|
|||
Threads = collections.defaultdict(lambda: []) # List of records per thread id
|
||||
Mtasks = collections.defaultdict(lambda: {'elapsed': 0, 'end': 0})
|
||||
Cpus = collections.defaultdict(lambda: {'mtask_time': 0})
|
||||
Global = {'args': {}, 'cpuinfo': collections.defaultdict(lambda: {}), 'stats': {}}
|
||||
Global = {
|
||||
'args': {},
|
||||
'cpuinfo': collections.defaultdict(lambda: {}),
|
||||
'info': {
|
||||
'numa': 'no data'
|
||||
},
|
||||
'stats': {}
|
||||
}
|
||||
ElapsedTime = None # total elapsed time
|
||||
ExecGraphTime = 0 # total elapsed time executing an exec graph
|
||||
ExecGraphIntervals = [] # list of (start, end) pairs
|
||||
|
@ -33,7 +40,8 @@ def read_data(filename):
|
|||
|
||||
re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.]*)\s*')
|
||||
re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.]*)\s*$')
|
||||
re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
|
||||
re_info = re.compile(r'VLPROF info\s+(\S+)\s+(.*)$')
|
||||
re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+(\S+)')
|
||||
re_proc_cpu = re.compile(r'VLPROFPROC processor\s*:\s*(\d+)\s*$')
|
||||
re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$')
|
||||
cpu = None
|
||||
|
@ -108,6 +116,9 @@ def read_data(filename):
|
|||
elif re_arg2.match(line):
|
||||
match = re_arg2.match(line)
|
||||
Global['args'][match.group(1)] = match.group(2)
|
||||
elif re_info.match(line):
|
||||
match = re_info.match(line)
|
||||
Global['info'][match.group(1)] = match.group(2)
|
||||
elif re_stat.match(line):
|
||||
match = re_stat.match(line)
|
||||
Global['stats'][match.group(1)] = match.group(2)
|
||||
|
@ -163,6 +174,7 @@ def report():
|
|||
print(" Total mtasks = %d" % len(Mtasks))
|
||||
print(" Total yields = %d" % int(Global['stats'].get('yields', 0)))
|
||||
|
||||
report_numa()
|
||||
report_mtasks()
|
||||
report_cpus()
|
||||
report_sections()
|
||||
|
@ -183,6 +195,11 @@ def report():
|
|||
print()
|
||||
|
||||
|
||||
def report_numa():
    """Print the NUMA assignment section of the report.

    Shows the NUMA status string stored under ``Global['info']['numa']``.
    """
    print("\nNUMA assignment:")
    # Looked up after the header is printed, matching the original ordering
    numa_status = Global['info']['numa']
    status_line = " NUMA status = %s" % numa_status
    print(status_line)
|
||||
|
||||
|
||||
def report_mtasks():
|
||||
if not Mtasks:
|
||||
return
|
||||
|
|
|
@ -83,9 +83,10 @@ option will require a longer time to run Verilator, and
|
|||
may increase the risk of reset bugs in trade for performance; see the above
|
||||
documentation for these options.
|
||||
|
||||
If using Verilated multithreaded, use ``numactl`` to ensure you use
|
||||
non-conflicting hardware resources. See :ref:`Multithreading`. Also,
|
||||
consider using profile-guided optimization; see :ref:`Thread PGO`.
|
||||
If using Verilated multithreaded, consider overriding Verilator's default
|
||||
thread-to-processor assignment by using ``numactl``; see
|
||||
:ref:`Multithreading`. Also, consider using profile-guided optimization;
|
||||
see :ref:`Thread PGO`.
|
||||
|
||||
Minor Verilog code changes can also give big wins. You should not have any
|
||||
:option:`UNOPTFLAT` warnings from Verilator. Fixing these warnings can
|
||||
|
|
|
@ -243,11 +243,14 @@ trace. FST tracing can utilize up to 2 offload threads, so there is no use
|
|||
of setting :vlopt:`--trace-threads` higher than 2 at the moment.
|
||||
|
||||
When running a multithreaded model, the default Linux task scheduler often
|
||||
works against the model by assuming short-lived threads and thus
|
||||
it often schedules threads using multiple hyperthreads within the same
|
||||
physical core. For best performance, use the :command:`numactl` program to
|
||||
(when the threading count fits) select unique physical cores on the same
|
||||
socket. The same applies for :vlopt:`--trace-threads` as well.
|
||||
works against the model by assuming short-lived threads and thus it often
|
||||
schedules threads using multiple hyperthreads within the same physical
|
||||
core. On Linux only, if no processor affinity has already been set,
|
||||
Verilator attempts to set thread-to-processor affinity in a reasonable way.
|
||||
|
||||
For best performance, use the :command:`numactl` program to (when the
|
||||
threading count fits) select unique physical cores on the same socket. The
|
||||
same applies for :vlopt:`--trace-threads` as well.
|
||||
|
||||
As an example, if a model was Verilated with
|
||||
:vlopt:`--threads 4 <--threads>`, we consult:
|
||||
|
|
|
@ -34,28 +34,6 @@ thread_local VlExecutionProfiler::ExecutionTrace VlExecutionProfiler::t_trace;
|
|||
|
||||
constexpr const char* const VlExecutionRecord::s_ascii[];
|
||||
|
||||
//=============================================================================
|
||||
// VlExecutionRecord implementation
|
||||
|
||||
uint16_t VlExecutionRecord::getcpu() {
|
||||
#if defined(__linux)
|
||||
return sched_getcpu(); // TODO: this is a system call. Not exactly cheap.
|
||||
#elif defined(__APPLE__) && !defined(__arm64__)
|
||||
uint32_t info[4];
|
||||
__cpuid_count(1, 0, info[0], info[1], info[2], info[3]);
|
||||
// info[1] is EBX, bits 24-31 are APIC ID
|
||||
if ((info[3] & (1 << 9)) == 0) {
|
||||
return -1; // no APIC on chip
|
||||
} else {
|
||||
return (unsigned)info[1] >> 24;
|
||||
}
|
||||
#elif defined(_WIN32)
|
||||
return GetCurrentProcessorNumber();
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
//=============================================================================
|
||||
// VlExecutionProfiler implementation
|
||||
|
||||
|
@ -161,11 +139,17 @@ void VlExecutionProfiler::dump(const char* filenamep, uint64_t tickEnd)
|
|||
|
||||
// TODO Perhaps merge with verilated_coverage output format, so can
|
||||
// have a common merging and reporting tool, etc.
|
||||
fprintf(fp, "VLPROFVERSION 2.1 # Verilator execution profile version 2.1\n");
|
||||
fprintf(fp, "VLPROFVERSION 2.2 # Verilator execution profile version 2.2\n");
|
||||
fprintf(fp, "VLPROF arg +verilator+prof+exec+start+%" PRIu64 "\n",
|
||||
Verilated::threadContextp()->profExecStart());
|
||||
fprintf(fp, "VLPROF arg +verilator+prof+exec+window+%u\n",
|
||||
Verilated::threadContextp()->profExecWindow());
|
||||
std::string numa = "no threads";
|
||||
if (VlThreadPool* const threadPoolp
|
||||
= static_cast<VlThreadPool*>(Verilated::threadContextp()->threadPoolp())) {
|
||||
numa = threadPoolp->numaStatus();
|
||||
}
|
||||
fprintf(fp, "VLPROF info numa %s\n", numa.c_str());
|
||||
// Note that VerilatedContext will by default create as many threads as there are hardware
|
||||
// processors, but not all of them might be utilized. Report the actual number that has trace
|
||||
// entries to avoid over-counting.
|
||||
|
|
|
@ -105,8 +105,6 @@ class VlExecutionRecord final {
|
|||
static_assert(alignof(uint64_t) >= alignof(Payload), "Padding not allowed");
|
||||
static_assert(alignof(Payload) >= alignof(Type), "Padding not allowed");
|
||||
|
||||
static uint16_t getcpu(); // Return currently executing CPU id
|
||||
|
||||
public:
|
||||
// CONSTRUCTOR
|
||||
VlExecutionRecord() = default;
|
||||
|
@ -120,7 +118,7 @@ public:
|
|||
void mtaskBegin(uint32_t id, uint32_t predictStart) {
|
||||
m_payload.mtaskBegin.m_id = id;
|
||||
m_payload.mtaskBegin.m_predictStart = predictStart;
|
||||
m_payload.mtaskBegin.m_cpu = getcpu();
|
||||
m_payload.mtaskBegin.m_cpu = VlOs::getcpu();
|
||||
m_type = Type::MTASK_BEGIN;
|
||||
}
|
||||
void mtaskEnd(uint32_t id, uint32_t predictCost) {
|
||||
|
|
|
@ -26,6 +26,8 @@
|
|||
#include "verilated_threads.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
|
@ -104,9 +106,149 @@ VlThreadPool::VlThreadPool(VerilatedContext* contextp, unsigned nThreads) {
|
|||
m_workers.push_back(new VlWorkerThread{contextp});
|
||||
m_unassignedWorkers.push(i);
|
||||
}
|
||||
m_numaStatus = numaAssign();
|
||||
}
|
||||
|
||||
VlThreadPool::~VlThreadPool() {
    // Destroy every worker owned by the pool.
    // Each ~WorkerThread will wait for its thread to exit.
    for (VlWorkerThread* const workerp : m_workers) {
        delete workerp;
    }
}
|
||||
|
||||
bool VlThreadPool::isNumactlRunning() {
    // Return true if the current thread appears to be restricted to a subset of
    // processors (as numactl would do), false otherwise.
    //
    // The check is made through the thread's affinity mask: if any processor
    // below the hardware concurrency count is missing from the mask, we assume
    // the user (or numactl) has already chosen an affinity.
#if defined(__linux) || defined(CPU_ZERO)  // Linux-like; assume we have pthreads etc
    cpu_set_t mask;
    CPU_ZERO(&mask);
    if (pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &mask) != 0) {
        // Error; assuming returning true is the least-damage option
        return true;
    }
    // Never index past what a cpu_set_t can represent
    const unsigned limit
        = std::min(std::thread::hardware_concurrency(), static_cast<unsigned>(CPU_SETSIZE));
    for (unsigned cpu = 0; cpu < limit; ++cpu) {
        if (!CPU_ISSET(cpu, &mask)) return true;
    }
#endif
    return false;
}
|
||||
|
||||
std::string VlThreadPool::numaAssign() {
|
||||
#if defined(__linux) || defined(CPU_ZERO) // Linux-like; assume we have pthreads etc
|
||||
// If not under numactl, make a reasonable processor affinity selection
|
||||
if (isNumactlRunning()) return "running under numactl"; // User presumably set affinity
|
||||
const int num_threads = static_cast<int>(m_workers.size());
|
||||
const int num_proc = static_cast<int>(std::thread::hardware_concurrency());
|
||||
if (num_threads < 2) return "too few threads";
|
||||
if (num_threads > num_proc) return "too many threads";
|
||||
|
||||
// Read CPU info.
|
||||
// Uncertain if any modern system has gaps in the processor id (Solaris
|
||||
// did), but just in case use vectors instead of processor number math.
|
||||
//
|
||||
// Currently ignoring socket number "physical id".
|
||||
// If processor numbers are sequential on sockets, algorithm works out ok.
|
||||
// If processor numbers are strided on sockets, algorithm also works out ok.
|
||||
std::ifstream is{"/proc/cpuinfo"};
|
||||
if (VL_UNLIKELY(!is)) return "%Warning: no /proc/cpuinfo";
|
||||
|
||||
std::vector<int> unassigned_processors; // Processors to assign in sorted order
|
||||
std::map<int, int> processor_core;
|
||||
std::multimap<int, int> core_processors;
|
||||
std::set<int> cores;
|
||||
int processor = -1;
|
||||
int core = -1;
|
||||
while (!is.eof()) {
|
||||
std::string line;
|
||||
std::getline(is, line);
|
||||
static std::string::size_type pos = line.find(":");
|
||||
int number = -1;
|
||||
if (pos != std::string::npos) number = atoi(line.c_str() + pos + 1);
|
||||
if (line.compare(0, std::strlen("processor"), "processor") == 0) {
|
||||
processor = number;
|
||||
core = -1;
|
||||
} else if (line.compare(0, std::strlen("core id"), "core id") == 0) {
|
||||
core = number;
|
||||
// std::cout << "p" << processor << " socket " << socket << " c" << core << std::endl;
|
||||
cores.emplace(core);
|
||||
processor_core[processor] = core;
|
||||
core_processors.emplace(core, processor);
|
||||
unassigned_processors.push_back(processor);
|
||||
}
|
||||
}
|
||||
|
||||
// Start scheduling on the current CPU + 1.
|
||||
// This will help to land on the same socket as current CPU, and also
|
||||
// help make sure that different processes have different masks (when
|
||||
// num_threads is not a common-factor of the processor count).
|
||||
std::sort(unassigned_processors.begin(), unassigned_processors.end());
|
||||
{
|
||||
const int on_cpu = sched_getcpu(); // TODO: this is a system call. Not exactly cheap.
|
||||
bool hit = false;
|
||||
std::vector<int> new_front;
|
||||
std::vector<int> new_back;
|
||||
for (const int processor : unassigned_processors) {
|
||||
if (hit) {
|
||||
new_front.push_back(processor);
|
||||
} else {
|
||||
new_back.push_back(processor);
|
||||
}
|
||||
if (processor == on_cpu) hit = true;
|
||||
}
|
||||
unassigned_processors = new_front;
|
||||
unassigned_processors.insert(unassigned_processors.end(), new_back.begin(),
|
||||
new_back.end());
|
||||
}
|
||||
|
||||
// If less threads than cores, we can schedule per-core
|
||||
const bool core_per_thread = num_threads <= cores.size();
|
||||
|
||||
// Compute core mapping
|
||||
std::multimap<int, int> thread_processors;
|
||||
{
|
||||
std::set<int> assigned_processors;
|
||||
int thread = 0;
|
||||
for (const int processor : unassigned_processors) {
|
||||
// Find free processor, the current thread can use that
|
||||
if (assigned_processors.find(processor) != assigned_processors.end()) continue;
|
||||
assigned_processors.emplace(processor);
|
||||
thread_processors.emplace(thread, processor);
|
||||
if (core_per_thread) {
|
||||
// Also include all other processors same core,
|
||||
// so that another thread doesn't land on different processor in same core
|
||||
const int core = processor_core[processor];
|
||||
const auto bounds = core_processors.equal_range(core);
|
||||
for (auto it{bounds.first}; it != bounds.second; ++it) {
|
||||
if (assigned_processors.find(it->second) != assigned_processors.end())
|
||||
continue;
|
||||
if (it->second == processor) continue;
|
||||
thread_processors.emplace(thread, it->second);
|
||||
assigned_processors.emplace(it->second);
|
||||
}
|
||||
}
|
||||
// Prepare for next loop
|
||||
thread = (thread + 1) % num_threads;
|
||||
}
|
||||
}
|
||||
|
||||
// Set affinity
|
||||
std::string status = "assigned ";
|
||||
for (int thread = 0; thread < num_threads; ++thread) {
|
||||
cpu_set_t cpuset;
|
||||
CPU_ZERO(&cpuset);
|
||||
|
||||
const auto bounds = thread_processors.equal_range(thread);
|
||||
for (auto it{bounds.first}; it != bounds.second; ++it) {
|
||||
if (it != bounds.first) status += ',';
|
||||
status += std::to_string(it->second);
|
||||
CPU_SET(it->second, &cpuset);
|
||||
}
|
||||
status += ";";
|
||||
|
||||
const int rc = pthread_setaffinity_np(m_workers[thread]->m_cthread.native_handle(),
|
||||
sizeof(cpu_set_t), &cpuset);
|
||||
if (rc != 0) return "%Warning: pthread_setaffinity_np failed";
|
||||
}
|
||||
// std::cout << "Status: " << status << std::endl;
|
||||
return status;
|
||||
#else
|
||||
return "non-supported host OS";
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -34,15 +34,6 @@
|
|||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
// clang-format off
|
||||
#if defined(__linux)
|
||||
# include <sched.h> // For sched_getcpu()
|
||||
#endif
|
||||
#if defined(__APPLE__) && !defined(__arm64__)
|
||||
# include <cpuid.h> // For __cpuid_count()
|
||||
#endif
|
||||
// clang-format on
|
||||
|
||||
class VlExecutionProfiler;
|
||||
class VlThreadPool;
|
||||
|
||||
|
@ -156,6 +147,10 @@ private:
|
|||
|
||||
VL_UNCOPYABLE(VlWorkerThread);
|
||||
|
||||
protected:
|
||||
friend class VlThreadPool;
|
||||
const std::thread& cthread() const { return m_cthread; }
|
||||
|
||||
public:
|
||||
// CONSTRUCTORS
|
||||
explicit VlWorkerThread(VerilatedContext* contextp);
|
||||
|
@ -206,12 +201,12 @@ class VlThreadPool final : public VerilatedVirtualBase {
|
|||
// MEMBERS
|
||||
std::vector<VlWorkerThread*> m_workers; // our workers
|
||||
|
||||
// Guards indexes of unassigned workers
|
||||
mutable VerilatedMutex m_mutex;
|
||||
mutable VerilatedMutex m_mutex; // Guards indexes of unassigned workers
|
||||
// Indexes of unassigned workers
|
||||
std::stack<size_t> m_unassignedWorkers VL_GUARDED_BY(m_mutex);
|
||||
// Used for sequentially generating task IDs to avoid shadowing
|
||||
// For sequentially generating task IDs to avoid shadowing
|
||||
std::atomic<unsigned> m_assignedTasks{0};
|
||||
std::string m_numaStatus; // Status of NUMA assignment
|
||||
|
||||
public:
|
||||
// CONSTRUCTORS
|
||||
|
@ -236,6 +231,7 @@ public:
|
|||
}
|
||||
unsigned assignTaskIndex() { return m_assignedTasks++; }
|
||||
int numThreads() const { return static_cast<int>(m_workers.size()); }
|
||||
std::string numaStatus() const { return m_numaStatus; }
|
||||
VlWorkerThread* workerp(int index) {
|
||||
assert(index >= 0);
|
||||
assert(index < static_cast<int>(m_workers.size()));
|
||||
|
@ -244,6 +240,9 @@ public:
|
|||
|
||||
private:
|
||||
VL_UNCOPYABLE(VlThreadPool);
|
||||
|
||||
static bool isNumactlRunning();
|
||||
std::string numaAssign();
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
|
@ -632,7 +632,12 @@ namespace VlOs {
|
|||
/// Get environment variable
|
||||
extern std::string getenvStr(const std::string& envvar,
|
||||
const std::string& defaultValue) VL_MT_SAFE;
|
||||
extern uint64_t memUsageBytes() VL_MT_SAFE; ///< Return memory usage in bytes, or 0 if unknown
|
||||
|
||||
/// Return currently executing processor number; may do an OS call underneath so slow
|
||||
extern uint16_t getcpu() VL_MT_SAFE;
|
||||
|
||||
/// Return memory usage in bytes, or 0 if unknown
|
||||
extern uint64_t memUsageBytes() VL_MT_SAFE;
|
||||
|
||||
// Internal: Record CPU time, starting point on construction, and current delta from that
|
||||
class DeltaCpuTime final {
|
||||
|
|
|
@ -31,6 +31,13 @@
|
|||
# include <processthreadsapi.h> // GetProcessTimes
|
||||
# include <psapi.h> // GetProcessMemoryInfo
|
||||
#endif
|
||||
|
||||
#if defined(__linux)
|
||||
# include <sched.h> // For sched_getcpu()
|
||||
#endif
|
||||
#if defined(__APPLE__) && !defined(__arm64__)
|
||||
# include <cpuid.h> // For __cpuid_count()
|
||||
#endif
|
||||
// clang-format on
|
||||
|
||||
namespace VlOs {
|
||||
|
@ -72,6 +79,28 @@ double DeltaWallTime::gettime() VL_MT_SAFE {
|
|||
#endif
|
||||
}
|
||||
|
||||
//=============================================================================
|
||||
// VlOs::getcpu implementation
|
||||
|
||||
uint16_t getcpu() VL_MT_SAFE {
    // Return the number of the processor currently executing this thread,
    // or 0 when it cannot be determined on this host.
#if defined(__linux)
    const int cpu = sched_getcpu();  // TODO: this is a system call. Not exactly cheap.
    // sched_getcpu() reports failure as -1; don't let that wrap to 65535
    if (cpu < 0) return 0;
    return static_cast<uint16_t>(cpu);
#elif defined(__APPLE__) && !defined(__arm64__)
    uint32_t info[4];
    __cpuid_count(1, 0, info[0], info[1], info[2], info[3]);
    if ((info[3] & (1 << 9)) == 0) return 0;  // no APIC on chip
    // info[1] is EBX, bits 24-31 are APIC ID
    return static_cast<uint16_t>(info[1] >> 24);
#elif defined(_WIN32)
    return static_cast<uint16_t>(GetCurrentProcessorNumber());
#else
    return 0;
#endif
}
|
||||
|
||||
//=========================================================================
|
||||
// VlOs::memUsageBytes implementation
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
VLPROFVERSION 2.0
|
||||
VLPROF arg +verilator+prof+exec+start+2
|
||||
VLPROF arg +verilator+prof+exec+window+2
|
||||
VLPROF info numa 0,1,4,5;2,3,6,7
|
||||
VLPROF stat yields 0
|
||||
VLPROF stat threads 2
|
||||
VLPROFPROC processor : 0
|
||||
|
|
|
@ -12,6 +12,9 @@ Summary:
|
|||
Total mtasks = 7
|
||||
Total yields = 0
|
||||
|
||||
NUMA assignment:
|
||||
NUMA status = 0,1,4,5;2,3,6,7
|
||||
|
||||
Parallelized code, measured:
|
||||
Thread utilization = 14.22%
|
||||
Speedup = 0.284x
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
VLPROFVERSION 2.0
|
||||
VLPROF arg +verilator+prof+exec+start+1
|
||||
VLPROF arg +verilator+prof+exec+window+2
|
||||
VLPROF info numa 0,2;1,3
|
||||
VLPROF stat threads 2
|
||||
VLPROF stat yields 51
|
||||
VLPROFPROC processor : 0
|
||||
|
|
|
@ -12,6 +12,9 @@ Summary:
|
|||
Total mtasks = 5
|
||||
Total yields = 51
|
||||
|
||||
NUMA assignment:
|
||||
NUMA status = 0,2;1,3
|
||||
|
||||
Parallelized code, measured:
|
||||
Thread utilization = 42.50%
|
||||
Speedup = 0.85x
|
||||
|
|
|
@ -12,6 +12,9 @@ Summary:
|
|||
Total mtasks = 7
|
||||
Total yields = 0
|
||||
|
||||
NUMA assignment:
|
||||
NUMA status = no data
|
||||
|
||||
Parallelized code, measured:
|
||||
Thread utilization = 14.22%
|
||||
Speedup = 0.284x
|
||||
|
|
|
@ -0,0 +1,44 @@
|
|||
#!/usr/bin/env python3
|
||||
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
|
||||
#
|
||||
# Copyright 2024 by Wilson Snyder. This program is free software; you
|
||||
# can redistribute it and/or modify it under the terms of either the GNU
|
||||
# Lesser General Public License Version 3 or the Perl Artistic License
|
||||
# Version 2.0.
|
||||
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
|
||||
|
||||
# Test for automatic NUMA processor-affinity assignment, as reported by bin/verilator_gantt
|
||||
|
||||
import vltest_bootstrap
|
||||
|
||||
test.scenarios('vltmt')
|
||||
test.top_filename = "t/t_gen_alw.v" # Any, as long as runs a few cycles
|
||||
|
||||
test.compile(
|
||||
v_flags2=["--prof-exec"],
|
||||
# Checks below care about thread count
|
||||
threads=4)
|
||||
|
||||
# We need several experiments to make sure that the algorithm is working
|
||||
trials = 4
|
||||
for trial in range(0, trials):
|
||||
print("--------- Trial %d" % trial)
|
||||
|
||||
test.execute( # Test fail: run_env='numactl -m 0 -C 0,0,0,0',
|
||||
all_run_flags=[
|
||||
"+verilator+prof+exec+start+2", " +verilator+prof+exec+window+2",
|
||||
" +verilator+prof+exec+file+" + test.obj_dir + "/profile_exec.dat"
|
||||
])
|
||||
|
||||
gantt_log = test.obj_dir + "/gantt.log"
|
||||
|
||||
test.run(cmd=[
|
||||
os.environ["VERILATOR_ROOT"] + "/bin/verilator_gantt", "--no-vcd", test.obj_dir +
|
||||
"/profile_exec.dat", "| tee " + gantt_log
|
||||
])
|
||||
|
||||
test.file_grep(gantt_log, r'CPU info:')
|
||||
test.file_grep(gantt_log, r'NUMA status += assigned')
|
||||
test.file_grep_not(gantt_log, r'%Warning:') # e.g. There were fewer CPUs (1) than threads (3).
|
||||
|
||||
test.passes()
|
Loading…
Reference in New Issue