Add numactl-like automatic assignment of processor affinity (#5911)

2025-04-02 08:27:23 -04:00 · 2025-04-02 08:27:23 -04:00 · 6d1e82b908
parent cd5997a2e6
commit 6d1e82b908
16 changed files with 283 additions and 49 deletions
--- a/1
+++ b/1
@ -27,6 +27,7 @@ Verilator 5.035 devel
 * Add `--make json` to enable integration with non-make/cmake build systems (#5799). [Andrew Voznytsa]
 * Add empty veriuser.h for legacy compatibility.
 * Add DEPRECATED warning on `--xml-only` and `--xml-output`.
+* Add numactl-like automatic assignment of processor affinity.
 * Remove unused gtkwave/wavealloca.h. [Geza Lore]
 * Optimize automatic splitting of some packed variables (#5843). [Geza Lore]
 * Optimize trigger vector in whole words (#5857). [Geza Lore]
--- a/bin/verilator_gantt
+++ b/bin/verilator_gantt
@ -16,7 +16,14 @@ LongestVcdStrValueLength = 0
 Threads = collections.defaultdict(lambda: [])  # List of records per thread id
 Mtasks = collections.defaultdict(lambda: {'elapsed': 0, 'end': 0})
 Cpus = collections.defaultdict(lambda: {'mtask_time': 0})
-Global = {'args': {}, 'cpuinfo': collections.defaultdict(lambda: {}), 'stats': {}}
+Global = {
+    'args': {},
+    'cpuinfo': collections.defaultdict(lambda: {}),
+    'info': {
+        'numa': 'no data'
+    },
+    'stats': {}
+}
 ElapsedTime = None  # total elapsed time
 ExecGraphTime = 0  # total elapsed time executing an exec graph
 ExecGraphIntervals = []  # list of (start, end) pairs
@ -33,7 +40,8 @@ def read_data(filename):

        re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.]*)\s*')
        re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.]*)\s*$')
-        re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
+        re_info = re.compile(r'VLPROF info\s+(\S+)\s+(.*)$')
+        re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+(\S+)')
        re_proc_cpu = re.compile(r'VLPROFPROC processor\s*:\s*(\d+)\s*$')
        re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$')
        cpu = None
@ -108,6 +116,9 @@ def read_data(filename):
            elif re_arg2.match(line):
                match = re_arg2.match(line)
                Global['args'][match.group(1)] = match.group(2)
+            elif re_info.match(line):
+                match = re_info.match(line)
+                Global['info'][match.group(1)] = match.group(2)
            elif re_stat.match(line):
                match = re_stat.match(line)
                Global['stats'][match.group(1)] = match.group(2)
@ -163,6 +174,7 @@ def report():
    print("  Total mtasks       = %d" % len(Mtasks))
    print("  Total yields       = %d" % int(Global['stats'].get('yields', 0)))

+    report_numa()
    report_mtasks()
    report_cpus()
    report_sections()
@ -183,6 +195,11 @@ def report():
    print()


+def report_numa():
+    """Print the NUMA (processor affinity) status recorded in the profile data."""
+    print("\nNUMA assignment:")
+    # 'numa' is always present; Global['info'] is initialized with a default value
+    numa_status = Global['info']['numa']
+    print("  NUMA status        = %s" % numa_status)
+
+
 def report_mtasks():
    if not Mtasks:
        return
--- a/docs/guide/simulating.rst
+++ b/docs/guide/simulating.rst
@ -83,9 +83,10 @@ option will require a longer time to run Verilator, and
 may increase the risk of reset bugs in trade for performance; see the above
 documentation for these options.

-If using Verilated multithreaded, use ``numactl`` to ensure you use
-non-conflicting hardware resources. See :ref:`Multithreading`. Also,
-consider using profile-guided optimization; see :ref:`Thread PGO`.
+If using Verilated multithreaded, consider overriding Verilator's default
+thread-to-processor assignment by using ``numactl``; see
+:ref:`Multithreading`. Also, consider using profile-guided optimization;
+see :ref:`Thread PGO`.

 Minor Verilog code changes can also give big wins.  You should not have any
 :option:`UNOPTFLAT` warnings from Verilator.  Fixing these warnings can
--- a/docs/guide/verilating.rst
+++ b/docs/guide/verilating.rst
@ -243,11 +243,14 @@ trace. FST tracing can utilize up to 2 offload threads, so there is no use
 of setting :vlopt:`--trace-threads` higher than 2 at the moment.

 When running a multithreaded model, the default Linux task scheduler often
-works against the model by assuming short-lived threads and thus
-it often schedules threads using multiple hyperthreads within the same
-physical core. For best performance, use the :command:`numactl` program to
-(when the threading count fits) select unique physical cores on the same
-socket. The same applies for :vlopt:`--trace-threads` as well.
+works against the model by assuming short-lived threads and thus it often
+schedules threads using multiple hyperthreads within the same physical
+core. On Linux only, if no processor affinity is already set, Verilator
+attempts to assign thread-to-processor affinity in a reasonable way.
+
+For best performance, use the :command:`numactl` program to select unique
+physical cores on the same socket (when the thread count fits). The same
+applies for :vlopt:`--trace-threads` as well.

 As an example, if a model was Verilated with
 :vlopt:`--threads 4 <--threads>`, we consult:
--- a/include/verilated_profiler.cpp
+++ b/include/verilated_profiler.cpp
@ -34,28 +34,6 @@ thread_local VlExecutionProfiler::ExecutionTrace VlExecutionProfiler::t_trace;

 constexpr const char* const VlExecutionRecord::s_ascii[];

-//=============================================================================
-// VlPgoProfiler implementation
-
-uint16_t VlExecutionRecord::getcpu() {
-#if defined(__linux)
-    return sched_getcpu();  // TODO: this is a system call. Not exactly cheap.
-#elif defined(__APPLE__) && !defined(__arm64__)
-    uint32_t info[4];
-    __cpuid_count(1, 0, info[0], info[1], info[2], info[3]);
-    // info[1] is EBX, bits 24-31 are APIC ID
-    if ((info[3] & (1 << 9)) == 0) {
-        return -1;  // no APIC on chip
-    } else {
-        return (unsigned)info[1] >> 24;
-    }
-#elif defined(_WIN32)
-    return GetCurrentProcessorNumber();
-#else
-    return 0;
-#endif
-}
-
 //=============================================================================
 // VlExecutionProfiler implementation

@ -161,11 +139,17 @@ void VlExecutionProfiler::dump(const char* filenamep, uint64_t tickEnd)

    // TODO Perhaps merge with verilated_coverage output format, so can
    // have a common merging and reporting tool, etc.
-    fprintf(fp, "VLPROFVERSION 2.1 # Verilator execution profile version 2.1\n");
+    fprintf(fp, "VLPROFVERSION 2.2 # Verilator execution profile version 2.2\n");
    fprintf(fp, "VLPROF arg +verilator+prof+exec+start+%" PRIu64 "\n",
            Verilated::threadContextp()->profExecStart());
    fprintf(fp, "VLPROF arg +verilator+prof+exec+window+%u\n",
            Verilated::threadContextp()->profExecWindow());
+    std::string numa = "no threads";
+    if (VlThreadPool* const threadPoolp
+        = static_cast<VlThreadPool*>(Verilated::threadContextp()->threadPoolp())) {
+        numa = threadPoolp->numaStatus();
+    }
+    fprintf(fp, "VLPROF info numa %s\n", numa.c_str());
    // Note that VerilatedContext will by default create as many threads as there are hardware
    // processors, but not all of them might be utilized. Report the actual number that has trace
    // entries to avoid over-counting.
--- a/include/verilated_profiler.h
+++ b/include/verilated_profiler.h
@ -105,8 +105,6 @@ class VlExecutionRecord final {
    static_assert(alignof(uint64_t) >= alignof(Payload), "Padding not allowed");
    static_assert(alignof(Payload) >= alignof(Type), "Padding not allowed");

-    static uint16_t getcpu();  // Return currently executing CPU id
-
 public:
    // CONSTRUCTOR
    VlExecutionRecord() = default;
@ -120,7 +118,7 @@ public:
    void mtaskBegin(uint32_t id, uint32_t predictStart) {
        m_payload.mtaskBegin.m_id = id;
        m_payload.mtaskBegin.m_predictStart = predictStart;
-        m_payload.mtaskBegin.m_cpu = getcpu();
+        m_payload.mtaskBegin.m_cpu = VlOs::getcpu();
        m_type = Type::MTASK_BEGIN;
    }
    void mtaskEnd(uint32_t id, uint32_t predictCost) {
--- a/include/verilated_threads.cpp
+++ b/include/verilated_threads.cpp
@ -26,6 +26,8 @@
 #include "verilated_threads.h"

 #include <cstdio>
+#include <fstream>
+#include <iostream>
 #include <memory>
 #include <string>

@ -104,9 +106,149 @@ VlThreadPool::VlThreadPool(VerilatedContext* contextp, unsigned nThreads) {
        m_workers.push_back(new VlWorkerThread{contextp});
        m_unassignedWorkers.push(i);
    }
+    m_numaStatus = numaAssign();
 }

 VlThreadPool::~VlThreadPool() {
    // Each ~WorkerThread will wait for its thread to exit.
    for (auto& i : m_workers) delete i;
 }
+
+bool VlThreadPool::isNumactlRunning() {
+    // Heuristic: if the current thread's affinity mask excludes any processor,
+    // assume the user restricted it (e.g. under numactl); otherwise assume not.
+    // Returns false on hosts where the affinity mask cannot be queried.
+#if defined(__linux) || defined(CPU_ZERO)  // Linux-like; assume we have pthreads etc
+    cpu_set_t affinity;
+    CPU_ZERO(&affinity);
+    if (pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &affinity) != 0) {
+        // Error; assuming returning true is the least-damage option
+        return true;
+    }
+    // Only examine processors that exist and that fit in a cpu_set_t
+    const unsigned limit
+        = std::min(std::thread::hardware_concurrency(), static_cast<unsigned>(CPU_SETSIZE));
+    for (unsigned cpu = 0; cpu < limit; ++cpu) {
+        if (!CPU_ISSET(cpu, &affinity)) return true;  // Masked off, so restricted
+    }
+#endif
+    return false;
+}
+
+std::string VlThreadPool::numaAssign() {
+    // Give each worker thread its own set of processors, unless the user
+    // appears to have already restricted affinity (see isNumactlRunning()).
+    //
+    // Returns a human-readable status string:
+    //   "assigned <p>,<p>;<p>,<p>;..."  processors given to each worker, in worker order
+    //   "%Warning: ..."                 processor data or the affinity call failed
+    //   anything else                   the reason no assignment was attempted
+#if defined(__linux) || defined(CPU_ZERO)  // Linux-like; assume we have pthreads etc
+    // If not under numactl, make a reasonable processor affinity selection
+    if (isNumactlRunning()) return "running under numactl";  // User presumably set affinity
+    const int num_threads = static_cast<int>(m_workers.size());
+    const int num_proc = static_cast<int>(std::thread::hardware_concurrency());
+    if (num_threads < 2) return "too few threads";
+    if (num_threads > num_proc) return "too many threads";
+
+    // Read CPU info.
+    // Uncertain if any modern system has gaps in the processor id (Solaris
+    // did), but just in case use vectors instead of processor number math.
+    //
+    // Currently ignoring socket number "physical id".
+    // If processor numbers are sequential on sockets, algorithm works out ok.
+    // If processor numbers are strided on sockets, algorithm also works out ok.
+    std::ifstream is{"/proc/cpuinfo"};
+    if (VL_UNLIKELY(!is)) return "%Warning: no /proc/cpuinfo";
+
+    std::vector<int> unassigned_processors;  // Processors to assign in sorted order
+    std::map<int, int> processor_core;  // Processor number -> its core id
+    std::multimap<int, int> core_processors;  // Core id -> processors on that core
+    std::set<int> cores;  // Distinct core ids seen
+    int processor = -1;  // Number from the most recent "processor" line
+    std::string line;
+    // Loop on getline() itself so a failed read never gets processed as a line
+    while (std::getline(is, line)) {
+        // The colon must be located on every line; its column differs between
+        // fields, and blank lines separate the per-processor records
+        const std::string::size_type pos = line.find(':');
+        int number = -1;
+        if (pos != std::string::npos) number = atoi(line.c_str() + pos + 1);
+        if (line.compare(0, std::strlen("processor"), "processor") == 0) {
+            processor = number;
+        } else if (line.compare(0, std::strlen("core id"), "core id") == 0) {
+            // A processor is only recorded once its "core id" line is seen
+            cores.emplace(number);
+            processor_core[processor] = number;
+            core_processors.emplace(number, processor);
+            unassigned_processors.push_back(processor);
+        }
+    }
+
+    // Every worker needs at least one processor, otherwise an empty affinity
+    // mask would be requested below (e.g. if /proc/cpuinfo lacks "core id" lines)
+    if (static_cast<int>(unassigned_processors.size()) < num_threads) {
+        return "%Warning: too few processors found in /proc/cpuinfo";
+    }
+
+    // Start scheduling on the current CPU + 1.
+    // This will help to land on the same socket as current CPU, and also
+    // help make sure that different processes have different masks (when
+    // num_threads is not a common-factor of the processor count).
+    std::sort(unassigned_processors.begin(), unassigned_processors.end());
+    {
+        const int on_cpu = sched_getcpu();
+        const auto hit_it
+            = std::find(unassigned_processors.begin(), unassigned_processors.end(), on_cpu);
+        // Rotate so the entry following on_cpu comes first and on_cpu itself
+        // comes last; if on_cpu is not in the list the order is left unchanged
+        if (hit_it != unassigned_processors.end()) {
+            std::rotate(unassigned_processors.begin(), hit_it + 1, unassigned_processors.end());
+        }
+    }
+
+    // If no more threads than cores, we can schedule per-core
+    const bool core_per_thread = num_threads <= static_cast<int>(cores.size());
+
+    // Compute core mapping
+    std::multimap<int, int> thread_processors;  // Worker index -> processors it may run on
+    {
+        std::set<int> assigned_processors;
+        int thread = 0;
+        for (const int proc : unassigned_processors) {
+            // Find free processor, the current thread can use that
+            if (assigned_processors.find(proc) != assigned_processors.end()) continue;
+            assigned_processors.emplace(proc);
+            thread_processors.emplace(thread, proc);
+            if (core_per_thread) {
+                // Also include all other processors same core,
+                // so that another thread doesn't land on different processor in same core
+                const int proc_core = processor_core[proc];
+                const auto bounds = core_processors.equal_range(proc_core);
+                for (auto it{bounds.first}; it != bounds.second; ++it) {
+                    const int sibling = it->second;
+                    if (assigned_processors.find(sibling) != assigned_processors.end())
+                        continue;
+                    thread_processors.emplace(thread, sibling);
+                    assigned_processors.emplace(sibling);
+                }
+            }
+            // Prepare for next loop; wrap so leftover processors are shared out round-robin
+            thread = (thread + 1) % num_threads;
+        }
+    }
+
+    // Set affinity
+    std::string status = "assigned ";
+    for (int thread = 0; thread < num_threads; ++thread) {
+        cpu_set_t cpuset;
+        CPU_ZERO(&cpuset);
+
+        // Build this worker's mask, and the matching "p,p,...;" status fragment
+        const auto bounds = thread_processors.equal_range(thread);
+        for (auto it{bounds.first}; it != bounds.second; ++it) {
+            if (it != bounds.first) status += ',';
+            status += std::to_string(it->second);
+            CPU_SET(it->second, &cpuset);
+        }
+        status += ";";
+
+        const int rc = pthread_setaffinity_np(m_workers[thread]->m_cthread.native_handle(),
+                                              sizeof(cpu_set_t), &cpuset);
+        if (rc != 0) return "%Warning: pthread_setaffinity_np failed";
+    }
+    return status;
+#else
+    return "non-supported host OS";
+#endif
+}
--- a/include/verilated_threads.h
+++ b/include/verilated_threads.h
@ -34,15 +34,6 @@
 #include <thread>
 #include <vector>

-// clang-format off
-#if defined(__linux)
-# include <sched.h>  // For sched_getcpu()
-#endif
-#if defined(__APPLE__) && !defined(__arm64__)
-# include <cpuid.h>  // For __cpuid_count()
-#endif
-// clang-format on
-
 class VlExecutionProfiler;
 class VlThreadPool;

@ -156,6 +147,10 @@ private:

    VL_UNCOPYABLE(VlWorkerThread);

+protected:
+    friend class VlThreadPool;
+    const std::thread& cthread() const { return m_cthread; }
+
 public:
    // CONSTRUCTORS
    explicit VlWorkerThread(VerilatedContext* contextp);
@ -206,12 +201,12 @@ class VlThreadPool final : public VerilatedVirtualBase {
    // MEMBERS
    std::vector<VlWorkerThread*> m_workers;  // our workers

-    // Guards indexes of unassigned workers
-    mutable VerilatedMutex m_mutex;
+    mutable VerilatedMutex m_mutex;  // Guards indexes of unassigned workers
    // Indexes of unassigned workers
    std::stack<size_t> m_unassignedWorkers VL_GUARDED_BY(m_mutex);
-    // Used for sequentially generating task IDs to avoid shadowing
+    // For sequentially generating task IDs to avoid shadowing
    std::atomic<unsigned> m_assignedTasks{0};
+    std::string m_numaStatus;  // Status of NUMA assignment

 public:
    // CONSTRUCTORS
@ -236,6 +231,7 @@ public:
    }
    unsigned assignTaskIndex() { return m_assignedTasks++; }
    int numThreads() const { return static_cast<int>(m_workers.size()); }
+    std::string numaStatus() const { return m_numaStatus; }
    VlWorkerThread* workerp(int index) {
        assert(index >= 0);
        assert(index < static_cast<int>(m_workers.size()));
@ -244,6 +240,9 @@ public:

 private:
    VL_UNCOPYABLE(VlThreadPool);
+
+    static bool isNumactlRunning();
+    std::string numaAssign();
 };

 #endif
--- a/include/verilatedos.h
+++ b/include/verilatedos.h
@ -632,7 +632,12 @@ namespace VlOs {
 /// Get environment variable
 extern std::string getenvStr(const std::string& envvar,
                             const std::string& defaultValue) VL_MT_SAFE;
-extern uint64_t memUsageBytes() VL_MT_SAFE;  ///< Return memory usage in bytes, or 0 if unknown
+
+/// Return currently executing processor number; may make an OS call underneath, so may be slow
+extern uint16_t getcpu() VL_MT_SAFE;
+
+/// Return memory usage in bytes, or 0 if unknown
+extern uint64_t memUsageBytes() VL_MT_SAFE;

 // Internal: Record CPU time, starting point on construction, and current delta from that
 class DeltaCpuTime final {
--- a/include/verilatedos_c.h
+++ b/include/verilatedos_c.h
@ -31,6 +31,13 @@
 # include <processthreadsapi.h>  // GetProcessTimes
 # include <psapi.h>   // GetProcessMemoryInfo
 #endif
+
+#if defined(__linux)
+# include <sched.h>  // For sched_getcpu()
+#endif
+#if defined(__APPLE__) && !defined(__arm64__)
+# include <cpuid.h>  // For __cpuid_count()
+#endif
 // clang-format on

 namespace VlOs {
@ -72,6 +79,28 @@ double DeltaWallTime::gettime() VL_MT_SAFE {
 #endif
 }

+//=============================================================================
+// VlOs::getcpu implementation
+
+uint16_t getcpu() VL_MT_SAFE {
+    // Report the processor the calling thread is currently executing on.
+    // Hosts without a supported query always report 0.
+#if defined(__linux)
+    // TODO: sched_getcpu() is a system call. Not exactly cheap.
+    return sched_getcpu();
+#elif defined(__APPLE__) && !defined(__arm64__)
+    uint32_t regs[4];
+    __cpuid_count(1, 0, regs[0], regs[1], regs[2], regs[3]);
+    const bool haveApic = (regs[3] & (1 << 9)) != 0;
+    if (!haveApic) return 0;  // no APIC on chip
+    // regs[1] is EBX, bits 24-31 are APIC ID
+    return static_cast<unsigned>(regs[1]) >> 24;
+#elif defined(_WIN32)
+    return GetCurrentProcessorNumber();
+#else
+    return 0;
+#endif
+}
+
 //=========================================================================
 // VlOs::memUsageBytes implementation

--- a/test_regress/t/t_gantt_io.dat
+++ b/test_regress/t/t_gantt_io.dat
@ -1,6 +1,7 @@
 VLPROFVERSION 2.0
 VLPROF arg +verilator+prof+exec+start+2
 VLPROF arg +verilator+prof+exec+window+2
+VLPROF info numa 0,1,4,5;2,3,6,7
 VLPROF stat yields 0
 VLPROF stat threads 2
 VLPROFPROC processor    : 0
--- a/test_regress/t/t_gantt_io.out
+++ b/test_regress/t/t_gantt_io.out
@ -12,6 +12,9 @@ Summary:
  Total mtasks       = 7
  Total yields       = 0

+NUMA assignment:
+  NUMA status        = 0,1,4,5;2,3,6,7
+
 Parallelized code, measured:
  Thread utilization =  14.22%
  Speedup            =  0.284x
--- a/test_regress/t/t_gantt_io_arm.dat
+++ b/test_regress/t/t_gantt_io_arm.dat
@ -1,6 +1,7 @@
 VLPROFVERSION 2.0
 VLPROF arg +verilator+prof+exec+start+1
 VLPROF arg +verilator+prof+exec+window+2
+VLPROF info numa 0,2;1,3
 VLPROF stat threads 2
 VLPROF stat yields 51
 VLPROFPROC processor    : 0
--- a/test_regress/t/t_gantt_io_arm.out
+++ b/test_regress/t/t_gantt_io_arm.out
@ -12,6 +12,9 @@ Summary:
  Total mtasks       = 5
  Total yields       = 51

+NUMA assignment:
+  NUMA status        = 0,2;1,3
+
 Parallelized code, measured:
  Thread utilization =  42.50%
  Speedup            =   0.85x
--- a/test_regress/t/t_gantt_io_noproc.out
+++ b/test_regress/t/t_gantt_io_noproc.out
@ -12,6 +12,9 @@ Summary:
  Total mtasks       = 7
  Total yields       = 0

+NUMA assignment:
+  NUMA status        = no data
+
 Parallelized code, measured:
  Thread utilization =  14.22%
  Speedup            =  0.284x
--- a/test_regress/t/t_gantt_numa.py
+++ b/test_regress/t/t_gantt_numa.py
@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2024 by Wilson Snyder. This program is free software; you
+# can redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+
+# Test for bin/verilator_gantt reporting of the NUMA (processor affinity) status
+
+# NOTE(review): 'test' and 'os' are used below without being imported here;
+# presumably the test driver reached through vltest_bootstrap provides them - confirm
+import vltest_bootstrap
+
+test.scenarios('vltmt')
+test.top_filename = "t/t_gen_alw.v"  # Any, as long as runs a few cycles
+
+# Compile with --prof-exec; the runs below request a profile_exec.dat file
+test.compile(
+    v_flags2=["--prof-exec"],
+    # Checks below care about thread count
+    threads=4)
+
+# We need several experiments to make sure that the algorithm is working
+trials = 4
+for trial in range(0, trials):
+    print("--------- Trial %d" % trial)
+
+    test.execute(  # Test fail: run_env='numactl -m 0 -C 0,0,0,0',
+        all_run_flags=[
+            "+verilator+prof+exec+start+2", " +verilator+prof+exec+window+2",
+            " +verilator+prof+exec+file+" + test.obj_dir + "/profile_exec.dat"
+        ])
+
+    gantt_log = test.obj_dir + "/gantt.log"
+
+    # Run verilator_gantt on the profile, capturing its report in gantt_log via tee
+    test.run(cmd=[
+        os.environ["VERILATOR_ROOT"] + "/bin/verilator_gantt", "--no-vcd", test.obj_dir +
+        "/profile_exec.dat", "| tee " + gantt_log
+    ])
+
+    # The report must have a CPU info section and a NUMA status starting with
+    # "assigned", and must not contain any warning
+    test.file_grep(gantt_log, r'CPU info:')
+    test.file_grep(gantt_log, r'NUMA status += assigned')
+    test.file_grep_not(gantt_log, r'%Warning:')  # e.g. There were fewer CPUs (1) than threads (3).
+
+test.passes()