Fix V3OrderParallel scoring contraction hang (#6052)

2025-05-29 16:31:57 +02:00 · 2025-05-29 16:31:57 +02:00 · 7dbe4f1807
parent 5fcd0e52e7
commit 7dbe4f1807
5 changed files with 170 additions and 69 deletions
--- a/src/V3OrderParallel.cpp
+++ b/src/V3OrderParallel.cpp
@ -127,7 +127,7 @@ constexpr unsigned PART_SIBLING_EDGE_LIMIT = 26;
 // and we probably don't want a huge number of MTasks in practice anyway
 // (50 to 100 is typical.)
 //
-// If the user doesn't give one with '--threads-max-mTaskGraphp', we'll set the
+// If the user doesn't give one with '--threads-max-mtasks', we'll set the
 // maximum # of MTasks to
 //  (# of threads * PART_DEFAULT_MAX_MTASKS_PER_THREAD)
 constexpr unsigned PART_DEFAULT_MAX_MTASKS_PER_THREAD = 50;
@ -137,7 +137,7 @@ constexpr unsigned PART_DEFAULT_MAX_MTASKS_PER_THREAD = 50;
 //######################################################################
 // Misc graph and assertion utilities

-static void partCheckCachedScoreVsActual(uint32_t cached, uint32_t actual) {
+static void partCheckCachedScoreVsActual(uint64_t cached, uint64_t actual) {
 #if PART_STEPPED_COST
    // Cached CP might be a little bigger than actual, due to stepped CPs.
    // Example:
@ -160,8 +160,8 @@ static void partCheckCachedScoreVsActual(uint32_t cached, uint32_t actual) {
 struct EdgeKey final {
    // Note: Structure layout chosen to minimize padding in PairingHeap<*>::Node
    uint64_t m_id;  // Unique ID part of edge score
-    uint32_t m_score;  // Score part of ID
-    void increase(uint32_t score) {
+    uint64_t m_score;  // Score part of ID
+    void increase(uint64_t score) {
        UDEBUGONLY(UASSERT(score >= m_score, "Must increase"););
        m_score = score;
    }
@ -179,7 +179,7 @@ using EdgeHeap = PairingHeap<EdgeKey>;
 struct MergeCandidateKey final {
    // Note: Structure layout chosen to minimize padding in PairingHeap<*>::Node
    uint64_t m_id;  // Unique ID part of edge score
-    uint32_t m_score;  // Score part of ID
+    uint64_t m_score;  // Score part of ID
    bool operator<(const MergeCandidateKey& other) const {
        // First by Score then by ID, but notice that we want minimums using a max-heap, so reverse
        return m_score > other.m_score || (m_score == other.m_score && m_id > other.m_id);
@ -222,7 +222,7 @@ public:
    bool mergeWouldCreateCycle() const;  // Instead of virtual method

    inline void rescore();
-    uint32_t score() const { return m_key.m_score; }
+    uint64_t score() const { return m_key.m_score; }

    static MergeCandidate* heapNodeToElem(MergeCandidateScoreboard::Node* nodep) {
        return static_cast<MergeCandidate*>(nodep);
@ -290,7 +290,7 @@ public:
    // with updated critical path.
    void resetCriticalPaths();

-    uint32_t cachedCp(GraphWay way) const { return m_edgeHeapNode[way].key().m_score; }
+    uint64_t cachedCp(GraphWay way) const { return m_edgeHeapNode[way].key().m_score; }

    // Convert from the address of the m_edgeHeapNode[way] in an MTaskEdge back to the MTaskEdge
    static const MTaskEdge* toMTaskEdge(GraphWay way, const EdgeHeap::Node* nodep) {
@ -327,12 +327,12 @@ private:

    // Cost estimate for this LogicMTask, derived from V3InstrCount.
    // In abstract time units.
-    uint32_t m_cost = 0;
+    uint64_t m_cost = 0;

    // Cost of critical paths going FORWARD from graph-start to the start
    // of this vertex, and also going REVERSE from the end of the graph to
    // the end of the vertex. Same units as m_cost.
-    std::array<uint32_t, GraphWay::NUM_WAYS> m_critPathCost;
+    std::array<uint64_t, GraphWay::NUM_WAYS> m_critPathCost;

    const uint32_t m_id;  // Unique LogicMTask ID number
    static uint32_t s_nextId;  // Next ID number to use
@ -361,7 +361,7 @@ public:
        : V3GraphVertex{graphp}
        , m_id{s_nextId++} {
        UASSERT(s_nextId < 0xFFFFFFFFUL, "Too many mTaskGraphp");
-        for (uint32_t& item : m_critPathCost) item = 0;
+        for (uint64_t& item : m_critPathCost) item = 0;
        if (mVtxp) {
            m_mVertices.linkBack(mVtxp);
            if (const OrderLogicVertex* const olvp = mVtxp->logicp()) {
@ -392,10 +392,10 @@ public:
    // the final C++ output.
    uint32_t id() const { return m_id; }
    // Abstract cost of every logic mtask
-    uint32_t cost() const VL_MT_SAFE { return m_cost; }
-    void setCost(uint32_t cost) { m_cost = cost; }  // For tests only
-    uint32_t stepCost() const { return stepCost(m_cost); }
-    static uint32_t stepCost(uint32_t cost) {
+    uint64_t cost() const VL_MT_SAFE { return m_cost; }
+    void setCost(uint64_t cost) { m_cost = cost; }  // For tests only
+    uint64_t stepCost() const { return stepCost(m_cost); }
+    static uint64_t stepCost(uint64_t cost) {
 #if PART_STEPPED_COST
        // Round cost up to the nearest 5%. Use this when computing all
        // critical paths. The idea is that critical path changes don't
@ -410,7 +410,7 @@ public:
        logcost = ceil(logcost);
        logcost = logcost / 20.0;

-        const uint32_t stepCost = static_cast<uint32_t>(exp(logcost));
+        const uint64_t stepCost = static_cast<uint64_t>(exp(logcost));
        UDEBUGONLY(UASSERT_STATIC(stepCost >= cost, "stepped cost error exceeded"););
        UDEBUGONLY(UASSERT_STATIC(stepCost <= ((cost * 11 / 10)), "stepped cost error exceeded"););
        return stepCost;
@ -426,7 +426,7 @@ public:
        // Add to the edge heap
        LogicMTask* const relativep = edgep->furtherMTaskp<N_Way>();
        // Value is !way cp to this edge
-        const uint32_t cp = relativep->stepCost() + relativep->critPathCost(inv);
+        const uint64_t cp = relativep->stepCost() + relativep->critPathCost(inv);
        //
        m_edgeHeap[way].insert(&edgep->m_edgeHeapNode[way], {relativep->id(), cp});
    }
@ -462,8 +462,8 @@ public:
        for (const V3GraphEdge& edge : edges<N_Way>()) {
            const LogicMTask* const relativep
                = static_cast<const LogicMTask*>(edge.furtherp<N_Way>());
-            const uint32_t cachedCp = static_cast<const MTaskEdge&>(edge).cachedCp(way);
-            const uint32_t cp = relativep->critPathCost(way.invert()) + relativep->stepCost();
+            const uint64_t cachedCp = static_cast<const MTaskEdge&>(edge).cachedCp(way);
+            const uint64_t cp = relativep->critPathCost(way.invert()) + relativep->stepCost();
            partCheckCachedScoreVsActual(cachedCp, cp);
        }
    }
@ -477,10 +477,10 @@ public:
        return out.str();
    }

-    void setCritPathCost(GraphWay way, uint32_t cost) { m_critPathCost[way] = cost; }
-    uint32_t critPathCost(GraphWay way) const { return m_critPathCost[way]; }
+    void setCritPathCost(GraphWay way, uint64_t cost) { m_critPathCost[way] = cost; }
+    uint64_t critPathCost(GraphWay way) const { return m_critPathCost[way]; }
    template <GraphWay::en N_Way>
-    uint32_t critPathCostWithout(const V3GraphEdge* withoutp) const {
+    uint64_t critPathCostWithout(const V3GraphEdge* withoutp) const {
        const GraphWay way{N_Way};
        const GraphWay inv = way.invert();
        // Compute the critical path cost wayward to this node, without considering edge
@ -574,7 +574,7 @@ public:

        // Follow the entire critical path
        std::vector<const LogicMTask*> path;
-        uint32_t totalCost = 0;
+        uint64_t totalCost = 0;
        for (const LogicMTask* nextp = startp; nextp;) {
            path.push_back(nextp);
            totalCost += nextp->cost();
@ -624,25 +624,25 @@ bool MergeCandidate::mergeWouldCreateCycle() const {
                         : static_cast<const MTaskEdge*>(this)->mergeWouldCreateCycle();
 }

-static uint32_t siblingScore(const SiblingMC* sibsp) {
+static uint64_t siblingScore(const SiblingMC* sibsp) {
    const LogicMTask* const ap = sibsp->ap();
    const LogicMTask* const bp = sibsp->bp();
-    const uint32_t mergedCpCostFwd
+    const uint64_t mergedCpCostFwd
        = std::max(ap->critPathCost(GraphWay::FORWARD), bp->critPathCost(GraphWay::FORWARD));
-    const uint32_t mergedCpCostRev
+    const uint64_t mergedCpCostRev
        = std::max(ap->critPathCost(GraphWay::REVERSE), bp->critPathCost(GraphWay::REVERSE));
    return mergedCpCostRev + mergedCpCostFwd + LogicMTask::stepCost(ap->cost() + bp->cost());
 }

-static uint32_t edgeScore(const MTaskEdge* edgep) {
+static uint64_t edgeScore(const MTaskEdge* edgep) {
    // Score this edge. Lower is better. The score is the new local CP
    // length if we merge these MTasks.  ("Local" means the longest
    // critical path running through the merged node.)
    const LogicMTask* const top = edgep->toMTaskp();
    const LogicMTask* const fromp = edgep->fromMTaskp();
-    const uint32_t mergedCpCostFwd = std::max(fromp->critPathCost(GraphWay::FORWARD),
+    const uint64_t mergedCpCostFwd = std::max(fromp->critPathCost(GraphWay::FORWARD),
                                              top->critPathCostWithout<GraphWay::FORWARD>(edgep));
-    const uint32_t mergedCpCostRev = std::max(fromp->critPathCostWithout<GraphWay::REVERSE>(edgep),
+    const uint64_t mergedCpCostRev = std::max(fromp->critPathCostWithout<GraphWay::REVERSE>(edgep),
                                              top->critPathCost(GraphWay::REVERSE));
    return mergedCpCostRev + mergedCpCostFwd + LogicMTask::stepCost(fromp->cost() + top->cost());
 }
@ -724,7 +724,7 @@ static void partInitHalfCriticalPaths(V3Graph& mTaskGraph, bool checkOnly) {
    for (const V3GraphVertex* vertexp; (vertexp = order.nextp());) {
        const LogicMTask* const mtaskcp = static_cast<const LogicMTask*>(vertexp);
        LogicMTask* const mtaskp = const_cast<LogicMTask*>(mtaskcp);
-        uint32_t cpCost = 0;
+        uint64_t cpCost = 0;
 #if VL_DEBUG
        std::unordered_set<V3GraphVertex*> relatives;
 #endif
@ -739,7 +739,7 @@ static void partInitHalfCriticalPaths(V3Graph& mTaskGraph, bool checkOnly) {
 #endif
            const LogicMTask* const relativep = static_cast<LogicMTask*>(edge.furtherp<rev>());
            cpCost = std::max(cpCost, (relativep->critPathCost(way)
-                                       + static_cast<uint32_t>(relativep->stepCost())));
+                                       + static_cast<uint64_t>(relativep->stepCost())));
        }
        if (checkOnly) {
            partCheckCachedScoreVsActual(mtaskp->critPathCost(way), cpCost);
@ -798,8 +798,8 @@ class PropagateCp final {
    // We keep pending vertices in a heap during critical path propagation
    struct PendingKey final {
        LogicMTask* m_mtaskp;  // The vertex in the heap
-        uint32_t m_score;  // The score of this entry
-        void increase(uint32_t score) {
+        uint64_t m_score;  // The score of this entry
+        void increase(uint64_t score) {
            UDEBUGONLY(UASSERT(score >= m_score, "Must increase"););
            m_score = score;
        }
@ -861,7 +861,7 @@ private:
    }

 public:
-    void cpHasIncreased(V3GraphVertex* vxp, uint32_t newInclusiveCp) {
+    void cpHasIncreased(V3GraphVertex* vxp, uint64_t newInclusiveCp) {
        constexpr GraphWay way{N_Way};
        constexpr GraphWay inv{way.invert()};

@ -877,13 +877,13 @@ public:
                relativep->m_edgeHeap[inv].increaseKey(&edgeHeapNode, newInclusiveCp);
            }

-            const uint32_t critPathCost = relativep->critPathCost(way);
+            const uint64_t critPathCost = relativep->critPathCost(way);

            if (critPathCost >= newInclusiveCp) continue;

            // relativep's critPathCost() is out of step with its longest !wayward edge.
            // Schedule that to be resolved.
-            const uint32_t newVal = newInclusiveCp - critPathCost;
+            const uint64_t newVal = newInclusiveCp - critPathCost;

            if (PendingHeapNode* const nodep = static_cast<PendingHeapNode*>(relativep->userp())) {
                // Already in heap. Increase score if needed.
@ -924,16 +924,16 @@ public:
            m_pendingHeap.remove(maxp);
            // Pick up values
            LogicMTask* const mtaskp = maxp->key().m_mtaskp;
-            const uint32_t cpGrowBy = maxp->key().m_score;
+            const uint64_t cpGrowBy = maxp->key().m_score;
            // Free the heap node, we are done with it
            freeNode(maxp);
            mtaskp->userp(nullptr);
            // Update the critPathCost of mtaskp, that was out-of-date with respect to its edges
-            const uint32_t startCp = mtaskp->critPathCost(way);
-            const uint32_t newCp = startCp + cpGrowBy;
+            const uint64_t startCp = mtaskp->critPathCost(way);
+            const uint64_t newCp = startCp + cpGrowBy;
            if (VL_UNLIKELY(m_slowAsserts)) {
                // Check that CP matches that of the longest edge wayward of vxp.
-                const uint32_t edgeCp = mtaskp->m_edgeHeap[inv].max()->key().m_score;
+                const uint64_t edgeCp = mtaskp->m_edgeHeap[inv].max()->key().m_score;
                UASSERT_OBJ(edgeCp == newCp, mtaskp, "CP doesn't match longest wayward edge");
                // Confirm that we only set each node's CP once.  That's an
                // important property of PropagateCp which allows it to be far
@ -1114,15 +1114,17 @@ class Contraction final {
    // TYPES
    // New CP information for mtaskp reflecting an upcoming merge
    struct NewCp final {
-        uint32_t cp;
-        uint32_t propagateCp;
+        uint64_t cp;
+        uint64_t propagateCp;
        bool propagate;
    };

    // MEMBERS
    V3Graph& m_mTaskGraph;  // The Mtask graph
-    uint32_t m_scoreLimit;  // Sloppy score allowed when picking merges
-    uint32_t m_scoreLimitBeforeRescore = 0xffffffff;  // Next score rescore at
+    uint64_t m_scoreLimit;  // Sloppy score allowed when picking merges
+    uint64_t m_scoreLimitBeforeRescore
+        = std::numeric_limits<decltype(m_scoreLimitBeforeRescore)>::max();  // Next score rescore
+                                                                            // at
    unsigned m_mergesSinceRescore = 0;  // Merges since last rescore
    const bool m_slowAsserts;  // Take extra time to validate algorithm
    MergeCandidateScoreboard m_sb;  // Scoreboard
@ -1135,7 +1137,7 @@ class Contraction final {

 public:
    // CONSTRUCTORS
-    Contraction(V3Graph& mTaskGraph, uint32_t scoreLimit, LogicMTask* entryMTaskp,
+    Contraction(V3Graph& mTaskGraph, uint64_t scoreLimit, LogicMTask* entryMTaskp,
                LogicMTask* exitMTaskp, bool slowAsserts)
        : m_mTaskGraph{mTaskGraph}
        , m_scoreLimit{scoreLimit}
@ -1199,9 +1201,9 @@ public:
                UASSERT(!m_sb.needsRescore(mergeCanp),
                        "Need-rescore items should not be returned by bestp");
            }
-            const uint32_t cachedScore = mergeCanp->score();
+            const uint64_t cachedScore = mergeCanp->score();
            mergeCanp->rescore();
-            const uint32_t actualScore = mergeCanp->score();
+            const uint64_t actualScore = mergeCanp->score();

            if (actualScore > cachedScore) {
                // Cached score is out-of-date.
@ -1226,12 +1228,16 @@ public:
                    // limit and keep going...
                    const unsigned mtaskCount = m_mTaskGraph.vertices().size();
                    if (mtaskCount > maxMTasks) {
-                        const uint32_t oldLimit = m_scoreLimit;
+                        const uint64_t oldLimit = m_scoreLimit;
                        m_scoreLimit = (m_scoreLimit * 120) / 100;
-                        v3Global.rootp()->fileline()->v3warn(
-                            UNOPTTHREADS, "Thread scheduler is unable to provide requested "
-                                          "parallelism; suggest asking for fewer threads.");
-                        UINFO(1,
+                        FileLine* const flp = v3Global.rootp()->fileline();
+                        if (!flp->warnIsOff(V3ErrorCode::UNOPTTHREADS)) {
+                            flp->v3warn(UNOPTTHREADS,
+                                        "Thread scheduler is unable to provide requested "
+                                        "parallelism; suggest asking for fewer threads.");
+                            flp->modifyWarnOff(V3ErrorCode::UNOPTTHREADS, true);
+                        }
+                        UINFO(6,
                              "Critical path limit was=" << oldLimit << " now=" << m_scoreLimit);
                        continue;
                    }
@ -1322,7 +1328,7 @@ private:
        // Return new wayward-CP for mtaskp reflecting its upcoming merge
        // with otherp. Set 'result.propagate' if mtaskp's wayward
        // relatives will see a new wayward CP from this merge.
-        uint32_t newCp;
+        uint64_t newCp;
        if (mergeEdgep) {
            if (mtaskp == mergeEdgep->furtherp<way>()) {
                newCp = std::max(otherp->critPathCost(way),
@ -1335,8 +1341,8 @@ private:
            newCp = std::max(otherp->critPathCost(way), mtaskp->critPathCost(way));
        }

-        const uint32_t origRelativesCp = mtaskp->critPathCost(way) + mtaskp->stepCost();
-        const uint32_t newRelativesCp
+        const uint64_t origRelativesCp = mtaskp->critPathCost(way) + mtaskp->stepCost();
+        const uint64_t newRelativesCp
            = newCp + LogicMTask::stepCost(mtaskp->cost() + otherp->cost());

        NewCp result;
@ -1506,7 +1512,8 @@ private:
        UINFO(6, "Did rescore. Merges since previous = " << m_mergesSinceRescore);

        m_mergesSinceRescore = 0;
-        m_scoreLimitBeforeRescore = 0xffffffff;
+        m_scoreLimitBeforeRescore
+            = std::numeric_limits<decltype(m_scoreLimitBeforeRescore)>::max();
    }

    void makeSiblingMC(LogicMTask* ap, LogicMTask* bp) {
@ -1545,8 +1552,8 @@ private:
        // functions are efficient enough and using more optimized methods (e.g.: sorting networks)
        // has no measurable benefit.
        struct alignas(16) SortingRecord final {
-            uint64_t m_id;
-            uint32_t m_cp;
+            uint64_t m_cp;
+            uint32_t m_id;
            uint8_t m_idx;
            static_assert(PART_SIBLING_EDGE_LIMIT <= std::numeric_limits<uint8_t>::max(),
                          "m_idx must fit all indices into 'neighbors'");
@ -1689,7 +1696,7 @@ public:
        selfTestChain();
    }

-    static void apply(V3Graph& mTaskGraph, uint32_t scoreLimit, LogicMTask* entryMTaskp,
+    static void apply(V3Graph& mTaskGraph, uint64_t scoreLimit, LogicMTask* entryMTaskp,
                      LogicMTask* exitMTaskp, bool slowAsserts) {
        Contraction{mTaskGraph, scoreLimit, entryMTaskp, exitMTaskp, slowAsserts};
    }
@ -2056,18 +2063,18 @@ static void debugMTaskGraphStats(V3Graph& graph, const string& stage) {

    UINFO(4, "\n");
    UINFO(4, " Stats for " << stage);
-    uint32_t mtaskCount = 0;
-    uint32_t totalCost = 0;
-    std::array<uint32_t, 32> mtaskCostHist;
-    mtaskCostHist.fill(0);
+    uint64_t mtaskCount = 0;
+    uint64_t totalCost = 0;
+    constexpr int scoreBits = std::numeric_limits<uint64_t>::digits;
+    std::array<uint64_t, scoreBits> mtaskCostHist{};
    for (const V3GraphVertex& mtask : graph.vertices()) {
        ++mtaskCount;
-        uint32_t mtaskCost = mtask.as<const LogicMTask>()->cost();
+        uint64_t mtaskCost = mtask.as<const LogicMTask>()->cost();
        totalCost += mtaskCost;

        unsigned log2Cost = 0;
        while (mtaskCost >>= 1) ++log2Cost;
-        UASSERT(log2Cost < 32, "log2Cost overflow in debugMTaskGraphStats");
+        UASSERT(log2Cost < scoreBits, "log2Cost overflow in debugMTaskGraphStats");
        ++mtaskCostHist[log2Cost];
    }
    UINFO(4, "  Total mtask cost = " << totalCost);
@ -2075,7 +2082,7 @@ static void debugMTaskGraphStats(V3Graph& graph, const string& stage) {
    UINFO(4, "  Avg cost / mtask = " << ((mtaskCount > 0) ? cvtToStr(totalCost / mtaskCount)
                                                          : "INF!"));
    UINFO(4, "  Histogram of mtask costs:");
-    for (unsigned i = 0; i < 32; ++i) {
+    for (unsigned i = 0; i < scoreBits; ++i) {
        if (mtaskCostHist[i]) {
            UINFO(4, "    2^" << i << ": " << mtaskCostHist[i]);
            V3Stats::addStat("MTask graph, " + stage + ", mtask cost 2^" + (i < 10 ? " " : "")
@ -2185,8 +2192,8 @@ class Partitioner final {
        return fanIn + fanOut == 4;
    }

-    uint32_t setupMTaskDeps() VL_MT_DISABLED {
-        uint32_t totalGraphCost = 0;
+    uint64_t setupMTaskDeps() VL_MT_DISABLED {
+        uint64_t totalGraphCost = 0;

        // Artificial single entry point vertex in the MTask graph to allow sibling merges.
        // This is required as otherwise disjoint sub-graphs could not be merged, but the
@ -2280,7 +2287,7 @@ class Partitioner final {
        // OrderMoveVertex. Over time, we'll merge MTasks together and
        // eventually each MTask will wrap a large number of MTaskMoveVertices
        // (and the logic nodes therein.)
-        const uint32_t totalGraphCost = setupMTaskDeps();
+        const uint64_t totalGraphCost = setupMTaskDeps();

        debugMTaskGraphStats(*m_mTaskGraphp, "initial");

@ -2328,7 +2335,7 @@ class Partitioner final {
            // when scheduling them.
            const unsigned fudgeNumerator = 3;
            const unsigned fudgeDenominator = 5;
-            const uint32_t cpLimit
+            const uint64_t cpLimit
                = ((totalGraphCost * fudgeNumerator) / (targetParFactor * fudgeDenominator));
            UINFO(4, "Partitioner set cpLimit = " << cpLimit);

--- a/test_regress/t/t_instr_count_dpi_large.cpp
+++ b/test_regress/t/t_instr_count_dpi_large.cpp
@ -0,0 +1,12 @@
+// -*- mode: C++; c-file-style: "cc-mode" -*-
+//*************************************************************************
+//
+// Copyright 2025 by Antmicro. This program is free software; you can
+// redistribute it and/or modify it under the terms of either the GNU
+// Lesser General Public License Version 3 or the Perl Artistic License
+// Version 2.0.
+// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+//
+//*************************************************************************
+
+extern "C" void dpii_call(void) {}
--- a/test_regress/t/t_instr_count_dpi_large.py
+++ b/test_regress/t/t_instr_count_dpi_large.py
@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2025 by Wilson Snyder. This program is free software; you
+# can redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+
+import vltest_bootstrap
+
+test.scenarios('vltmt')
+test.clean_objs()
+
+test.compile(
+    v_flags2=["t/t_instr_count_dpi_large.cpp"],
+    verilator_flags2=[
+        "--instr-count-dpi 999999999",
+        # Force the UNOPTTHREADS condition so the Contraction score limit grows past the uint32_t range
+        "--threads-max-mtasks 1",
+        "-Wno-UNOPTTHREADS"
+    ],
+    threads=2)
+
+test.execute()
+
+test.passes()
--- a/test_regress/t/t_instr_count_dpi_large.v
+++ b/test_regress/t/t_instr_count_dpi_large.v
@ -0,0 +1,26 @@
+// DESCRIPTION: Verilator: Verilog Test module
+//
+// This file ONLY is placed under the Creative Commons Public Domain, for
+// any use, without warranty, 2025 by Wilson Snyder.
+// SPDX-License-Identifier: CC0-1.0
+
+
+module t(clk);
+   input clk;
+   sub_0 sub_0(clk);
+   sub_1 sub_1(clk);
+   initial begin
+      $write("*-* All Finished *-*\n");
+      $finish;
+   end
+endmodule
+
+import "DPI-C" context function void dpii_call();
+
+module sub_0(input clk); /*verilator hier_block*/
+   always @(posedge clk) dpii_call();
+endmodule
+
+module sub_1(input clk); /*verilator hier_block*/
+   always @(posedge clk) dpii_call();
+endmodule
--- a/test_regress/t/t_instr_count_dpi_large_hier.py
+++ b/test_regress/t/t_instr_count_dpi_large_hier.py
@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2025 by Wilson Snyder. This program is free software; you
+# can redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+
+import vltest_bootstrap
+
+test.scenarios('vltmt')
+test.clean_objs()
+test.top_filename = "t/t_instr_count_dpi_large.v"
+
+test.compile(
+    v_flags2=["t/t_instr_count_dpi_large.cpp"],
+    verilator_flags2=[
+        "--hierarchical",
+        "--instr-count-dpi 999999999",
+        # Force the UNOPTTHREADS condition so the Contraction score limit grows past the uint32_t range
+        "--threads-max-mtasks 1",
+        "-Wno-UNOPTTHREADS"
+    ],
+    threads=2)
+
+test.execute()
+
+test.passes()