Merge branch 'master' into develop-v5

This commit is contained in:
Wilson Snyder 2022-06-04 11:58:13 -04:00
commit 0f324c8309
120 changed files with 1997 additions and 1372 deletions

View File

@ -29,7 +29,7 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ubuntu-20.04, ubuntu-18.04]
os: [ubuntu-22.04, ubuntu-20.04, ubuntu-18.04]
compiler:
- { cc: clang, cxx: clang++ }
- { cc: gcc, cxx: g++ }
@ -37,9 +37,11 @@ jobs:
exclude:
# Build pull requests only with ubuntu-20.04 and without m32
- os: ${{ github.event_name == 'pull_request' && 'ubuntu-18.04' || 'do-not-exclude' }}
- os: ${{ github.event_name == 'pull_request' && 'ubuntu-22.04' || 'do-not-exclude' }}
- m32: ${{ github.event_name == 'pull_request' && 1 || 'do-not-exclude' }}
# Build -m32 only on ubuntu-20.04
- {os: ubuntu-18.04, m32: 1}
- {os: ubuntu-22.04, m32: 1}
include:
# Build GCC 10 on ubuntu-20.04
- os: ubuntu-20.04
@ -95,7 +97,7 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ubuntu-20.04, ubuntu-18.04]
os: [ubuntu-22.04, ubuntu-20.04, ubuntu-18.04]
compiler:
- { cc: clang, cxx: clang++ }
- { cc: gcc, cxx: g++ }
@ -104,9 +106,11 @@ jobs:
exclude:
# Build pull requests only with ubuntu-20.04 and without m32
- os: ${{ github.event_name == 'pull_request' && 'ubuntu-18.04' || 'do-not-exclude' }}
- os: ${{ github.event_name == 'pull_request' && 'ubuntu-22.04' || 'do-not-exclude' }}
- m32: ${{ github.event_name == 'pull_request' && 1 || 'do-not-exclude' }}
# Build -m32 only on ubuntu-20.04
- {os: ubuntu-18.04, m32: 1}
- {os: ubuntu-22.04, m32: 1}
include:
# Test with GCC 10 on ubuntu-20.04 without m32
- {os: ubuntu-20.04, compiler: { cc: gcc-10, cxx: g++-10 }, m32: 0, suite: dist-vlt-0}
@ -122,7 +126,7 @@ jobs:
CI_M32: ${{ matrix.m32 }}
CC: ${{ matrix.compiler.cc }}
CXX: ${{ matrix.compiler.cxx }}
CACHE_BASE_KEY: test-${{ matrix.os }}-${{ matrix.compiler.cc }}-m32=${{ matrix.m32 }}-${ matrix.suite }}
CACHE_BASE_KEY: test-${{ matrix.os }}-${{ matrix.compiler.cc }}-m32=${{ matrix.m32 }}-${{ matrix.suite }}
CCACHE_MAXSIZE: 64M # Per build matrix entry (2160M in total)
VERILATOR_ARCHIVE: verilator-${{ github.sha }}-${{ matrix.os }}-${{ matrix.compiler.cc }}${{ matrix.m32 && '-m32' || '' }}.tar.gz
steps:

12
Changes
View File

@ -22,12 +22,20 @@ Verilator 5.001 devel
Verilator 4.223 devel
==========================
**Major:**
* VCD tracing is now parallelized with --threads (#3449). [Geza Lore, Shunyao CAD]
**Minor:**
* Add -f<optimization> options to replace -O<letter> options (#3436).
* Changed --no-merge-const-pool to -fno-merge-const-pool (#3436).
* Support compile time trace signal selection with tracing_on/off (#3323). [Shunyao CAD]
* Add assert when VerilatedContext is mis-deleted (#3121). [Rupert Swarbrick]
* Define VM_TRACE_VCD when tracing in VCD format. [Geza Lore, Shunyao CAD]
* Support non-ANSI interface port declarations (#3439). [Geza Lore, Shunyao CAD]
* Support concat assignment to packed array (#3446).
* Improve conditional merging optimization (#3125). [Geza Lore, Shunyao CAD]
* Define VM_TRACE_VCD when tracing in VCD format. [Geza Lore, Shunyao CAD]
* Add assert when VerilatedContext is mis-deleted (#3121). [Rupert Swarbrick]
* Fix hang with large case statement optimization (#3405). [Mike Urbach]
* Fix 'with' operator with type casting (#3387). [xiak95]
* Fix incorrect conditional merging (#3409). [Raynard Qiao]

View File

@ -319,6 +319,7 @@ detailed descriptions of these arguments.
-f <file> Parse arguments from a file
-FI <file> Force include of a file
--flatten Force inlining of all modules, tasks and functions
-fno-<optimization> Disable internal optimization stage
-G<name>=<value> Overwrite top-level parameter
--gdb Run Verilator under GDB interactively
--gdbbt Run Verilator under GDB for backtrace
@ -344,7 +345,6 @@ detailed descriptions of these arguments.
--MMD Create .d dependency files
--MP Create phony dependency targets
--Mdir <directory> Name of output object directory
--no-merge-const-pool Disable merging of different types in const pool
--mod-prefix <topname> Name to prepend to lower classes
--no-clk <signal-name> Prevent marking specified signal as clock
--no-decoration Disable comments and symbol decorations
@ -404,7 +404,7 @@ detailed descriptions of these arguments.
--trace-max-width <width> Maximum array depth for tracing
--trace-params Enable tracing of parameters
--trace-structs Enable tracing structure names
--trace-threads <threads> Enable waveform creation on separate threads
--trace-threads <threads> Enable FST waveform creation on separate threads
--trace-underscore Enable tracing of _signals
-U<var> Undefine preprocessor define
--unroll-count <loops> Tune maximum loop iterations

View File

@ -54,8 +54,12 @@ if [ "$CI_BUILD_STAGE_NAME" = "build" ]; then
if [ "$CI_OS_NAME" = "linux" ]; then
sudo apt-get update
sudo apt-get install libfl-dev libgoogle-perftools-dev ccache
if [ "$CI_RUNS_ON" = "ubuntu-20.04" ]; then
sudo apt-get install libfl-dev ccache
if [ "$CI_RUNS_ON" != "ubuntu-22.04" ]; then
# Some conflict of libunwind version on 22.04, can live without it for now
sudo apt-get install libgoogle-perftools-dev
fi
if [ "$CI_RUNS_ON" = "ubuntu-20.04" ] || [ "$CI_RUNS_ON" = "ubuntu-22.04" ]; then
sudo apt-get install libsystemc libsystemc-dev
fi
if [ "$COVERAGE" = 1 ]; then
@ -85,7 +89,7 @@ elif [ "$CI_BUILD_STAGE_NAME" = "test" ]; then
sudo apt-get update
# libfl-dev needed for internal coverage's test runs
sudo apt-get install gdb gtkwave lcov libfl-dev ccache
if [ "$CI_RUNS_ON" = "ubuntu-20.04" ]; then
if [ "$CI_RUNS_ON" = "ubuntu-20.04" ] || [ "$CI_RUNS_ON" = "ubuntu-22.04" ]; then
sudo apt-get install libsystemc-dev
fi
if [ "$CI_M32" = 1 ]; then

View File

@ -348,14 +348,18 @@ AC_SUBST(CFG_CXXFLAGS_PROFILE)
# Flag to select newest language standard supported
# Macros work such that first option that passes is the one we take
# Currently enabled c++14 due to packaged SystemC dependency
# c++14 is the newest that Verilator is regressed to support
# Currently enable c++17/c++14 due to packaged SystemC dependency
# c++17 is the newest that Verilator is regularly tested to support
# c++11 is the oldest that Verilator supports
# gnu is required for Cygwin to compile verilated.h successfully
#_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=gnu++20)
#_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=c++20)
#_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=gnu++17)
#_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=c++17)
case "$(which lsb_release 2>&1 > /dev/null && lsb_release -d)" in
*Ubuntu*22.04*)
_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=gnu++17)
_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=c++17)
;;
esac
_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=gnu++14)
_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=c++14)
_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=gnu++11)

View File

@ -35,6 +35,7 @@ Guokai Chen
Harald Heckmann
Howard Su
Huang Rui
Huanghuang Zhou
HungMingWu
HyungKi Jeong
Iru Cai

View File

@ -20,6 +20,11 @@ Option `--cdc`
The experimental `--cdc` option is believed to be generally unused and is
planned for removal no sooner than January 2023.
Option `-O<letter>`
The debug `-O<letter>` options have been replaced with
`-fno-<optimization>` debug options to match GCC. The old options are
planned for removal no sooner than June 2023.
Option `--prof-threads`
The `--prof-threads` option has been superseded by the `--prof-exec` and
`--prof-pgo` options and is planned for removal no sooner than April 2023.

View File

@ -428,6 +428,52 @@ Summary:
flattening large designs may require significant CPU time, memory and
storage.
.. option:: -fno-acyc-simp
.. option:: -fno-assemble
.. option:: -fno-case
.. option:: -fno-combine
.. option:: -fno-const
.. option:: -fno-const-bit-op-tree
.. option:: -fno-dedup
.. option:: -fno-expand
.. option:: -fno-gate
.. option:: -fno-inline
.. option:: -fno-life
.. option:: -fno-life-post
.. option:: -fno-localize
.. option:: -fno-merge-cond
.. option:: -fno-merge-const-pool
.. option:: -fno-reloop
.. option:: -fno-reorder
.. option:: -fno-split
.. option:: -fno-subst
.. option:: -fno-subst-const
.. option:: -fno-table
Rarely needed. Disables one of the internal optimization steps. These
are typically used only when recommended by a maintainer to help debug
or work around an issue.
.. option:: -G<name>=<value>
Overwrites the given parameter of the toplevel module. The value is
@ -645,13 +691,6 @@ Summary:
The directory is created if it does not exist and the parent directories
exist; otherwise manually create the Mdir before calling Verilator.
.. option:: --no-merge-const-pool
Rarely needed. In order to minimize cache footprint, values of different
data type, that are yet emitted identically in C++ are merged in the
constant pool. This option disables this and causes every constant pool
entry with a distinct data type to be emitted separately.
.. option:: --mod-prefix <topname>
Specifies the name to prepend to all lower level classes. Defaults to
@ -700,9 +739,9 @@ Summary:
Rarely needed. Enables or disables a specific optimization, with the
optimization selected based on the letter passed. A lowercase letter
disables an optimization, an upper case letter enables it. This is
intended for debugging use only; see the source code for
version-dependent mappings of optimizations to -O letters.
disables an optimization, an upper case letter enables it. This option
is deprecated and the various `-f<optimization>` arguments should be
used instead.
.. option:: -o <executable>
@ -1042,7 +1081,8 @@ Summary:
is not thread safe. With "--threads 1", the generated model is single
threaded but may run in a multithreaded environment. With "--threads N",
where N >= 2, the model is generated to run multithreaded on up to N
threads. See :ref:`Multithreading`.
threads. See :ref:`Multithreading`. This option also applies to
:vlopt:`--trace` (but not :vlopt:`--trace-fst`).
.. option:: --threads-dpi all
@ -1120,7 +1160,8 @@ Summary:
Having tracing compiled in may result in some small performance losses,
even when tracing is not turned on during model execution.
See also :vlopt:`--trace-threads` option.
When using :vlopt:`--threads`, VCD tracing is parallelized, using the
same number of threads as passed to :vlopt:`--threads`.
.. option:: --trace-coverage
@ -1174,12 +1215,12 @@ Summary:
.. option:: --trace-threads *threads*
Enable waveform tracing using separate threads. This is typically faster
in simulation runtime but uses more total compute. This option is
independent of, and works with, both :vlopt:`--trace` and
:vlopt:`--trace-fst`. Different trace formats can take advantage of
more trace threads to varying degrees. Currently VCD tracing can utilize
at most "--trace-threads 1", and FST tracing can utilize at most
"--trace-threads 2". This overrides :vlopt:`--no-threads` .
in simulation runtime but uses more total compute. This option only
applies to :vlopt:`--trace-fst`. FST tracing can utilize at most
"--trace-threads 2". This overrides :vlopt:`--no-threads`.
This option is accepted, but has absolutely no effect with
:vlopt:`--trace`, which respects :vlopt:`--threads` instead.
.. option:: --trace-underscore

View File

@ -72,23 +72,38 @@ a good thing for getting working silicon.
Will Verilator output remain under my own license/copyright?
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
Yes, it's just like using GCC on your programs; this is why Verilator uses
the "GNU **Lesser** Public License Version 3" instead of the more typical
"GNU Public License". See the licenses for details, but in brief, if you
change Verilator itself or the header files Verilator includes, you must
make the source code available under the GNU Lesser Public License.
However, Verilator output (the Verilated code) only "include"s the licensed
files, and so you are **not** required to open-source release any output
from Verilator.
Your SystemVerilog, VPI/DPI, or main() C++ code remains under your own license.
It's just like how using GCC on your programs does not change the copyright
of your program; this is why Verilator uses the "GNU **Lesser** Public
License Version 3" instead of the more typical "GNU Public License". See
the licenses for details.
Some examples:
* Any SystemVerilog or other input fed into Verilator remains your own.
* Any of your VPI/DPI C++ routines that Verilator calls remain your own.
* Any of your main() C++ code that calls into Verilator remain your own.
* If you change Verilator itself, for example changing or adding a file
under the src/ directory in the repository, you must make the source code
available under the GNU Lesser Public License.
* If you change a header Verilator provides, for example under include/ in
the repository, you must make the source code available under the GNU
Lesser Public License.
You also have the option of using the Perl Artistic License, which again
does not require you to release your Verilog or generated code, and also
allows you to modify Verilator for internal use without distributing the
modified version. But please contribute back to the community!
does not require you to release your Verilog, C++, or generated code. This
license also allows you to modify Verilator for internal use without
distributing the modified version. But please contribute back to the
community!
One limit is that you cannot under either license release a closed-source
Verilog simulation product incorporating Verilator. That is you can have a
commercial product, but must make the source code available.
Under both licenses you can offer a commercial product that is based on
Verilator either directly or embedded within. However under both licenses,
any changes you make to Verilator for such a product must be open sourced.
As is standard with Open Source, contributions back to Verilator will be
placed under the Verilator copyright and LGPL/Artistic license. Small test

View File

@ -221,9 +221,13 @@ model, it may be beneficial to performance to adjust the
influences the partitioning of the model by adjusting the assumed execution
time of DPI imports.
The :vlopt:`--trace-threads` options can be used to produce trace dumps
using multiple threads. If :vlopt:`--trace-threads` is set without
:vlopt:`--threads`, then :vlopt:`--trace-threads` will imply
When using :vlopt:`--trace` to perform VCD tracing, the VCD trace
construction is parallelized using the same number of threads as specified
with :vlopt:`--threads`, and is executed on the same thread pool as the model.
The :vlopt:`--trace-threads` options can be used with :vlopt:`--trace-fst`
to offload FST tracing using multiple threads. If :vlopt:`--trace-threads` is
given without :vlopt:`--threads`, then :vlopt:`--trace-threads` will imply
:vlopt:`--threads 1 <--threads>`, i.e.: the support libraries will be
thread safe.
@ -231,12 +235,12 @@ With :vlopt:`--trace-threads 0 <--trace-threads>`, trace dumps are produced
on the main thread. This again gives the highest single thread performance.
With :vlopt:`--trace-threads {N} <--trace-threads>`, where N is at least 1,
N additional threads will be created and managed by the trace files (e.g.:
VerilatedVcdC or VerilatedFstC), to generate the trace dump. The main
thread will be released to proceed with execution as soon as possible,
though some blocking of the main thread is still necessary while capturing
the trace. Different trace formats can utilize a various number of
threads. See the :vlopt:`--trace-threads` option.
up to N additional threads will be created and managed by the trace files
(e.g.: VerilatedFstC), to offload construction of the trace dump. The main
thread will be released to proceed with execution as soon as possible, though
some blocking of the main thread is still necessary while capturing the
trace. FST tracing can utilize up to 2 offload threads, so there is no use
of setting :vlopt:`--trace-threads` higher than 2 at the moment.
When running a multithreaded model, the default Linux task scheduler often
works against the model, by assuming threads are short lived, and thus
@ -441,7 +445,7 @@ SystemC include directories and link to the SystemC libraries.
.. describe:: TRACE_THREADS
Optional. Generated multi-threaded trace dumping, same as
Optional. Generated multi-threaded FST trace dumping, same as
"--trace-threads".
.. describe:: TOP_MODULE

View File

@ -595,7 +595,7 @@ path through the graph is the sum of macro-task execution costs. Sarkar
does almost the same thing, except that he has nonzero estimates for
synchronization costs.
Verilator's cost estimates are assigned by ``InstrCountCostVisitor``. This
Verilator's cost estimates are assigned by ``InstrCountVisitor``. This
class is perhaps the most fragile piece of the multithread
implementation. It's easy to have a bug where you count something cheap
(eg. accessing one element of a huge array) as if it were expensive (eg.

View File

@ -683,6 +683,7 @@ onehot
ooo
oprofile
oversubscription
parallelized
param
parameterized
params
@ -771,6 +772,7 @@ specparam
splitme
spp
sqrt
src
srcdir
srcfile
sscanf
@ -889,6 +891,7 @@ writeme
writemem
writememb
writememh
xiak
xin
xml
xnor

View File

@ -33,5 +33,5 @@ add_executable(example ../make_tracing_c/sim_main.cpp)
# Add the Verilated circuit to the target
verilate(example COVERAGE TRACE
INCLUDE_DIRS "../make_tracing_c"
VERILATOR_ARGS -f ../make_tracing_c/input.vc -Os -x-assign 0
VERILATOR_ARGS -f ../make_tracing_c/input.vc -x-assign fast
SOURCES ../make_tracing_c/top.v)

View File

@ -45,7 +45,7 @@ set_property(
# Add the Verilated circuit to the target
verilate(example SYSTEMC COVERAGE TRACE
INCLUDE_DIRS "../make_tracing_sc"
VERILATOR_ARGS -f ../make_tracing_sc/input.vc -Os -x-assign 0
VERILATOR_ARGS -f ../make_tracing_sc/input.vc -x-assign fast
SOURCES ../make_tracing_sc/top.v)
verilator_link_systemc(example)

View File

@ -33,7 +33,7 @@ VERILATOR_FLAGS =
# Generate C++
VERILATOR_FLAGS += -cc
# Optimize
VERILATOR_FLAGS += -Os -x-assign 0
VERILATOR_FLAGS += -x-assign fast
# Warn about lint issues; may not want this on less solid designs
VERILATOR_FLAGS += -Wall
# This example does not use vl_time_stamp but rather

View File

@ -36,7 +36,7 @@ VERILATOR_FLAGS += -cc --exe
# Generate makefile dependencies (not shown as complicates the Makefile)
#VERILATOR_FLAGS += -MMD
# Optimize
VERILATOR_FLAGS += -Os -x-assign 0
VERILATOR_FLAGS += -x-assign fast
# Warn about lint issues; may not want this on less solid designs
VERILATOR_FLAGS += -Wall
# Make waveforms

View File

@ -37,7 +37,7 @@ VERILATOR_FLAGS += -sc --exe
# Generate makefile dependencies (not shown as complicates the Makefile)
#VERILATOR_FLAGS += -MMD
# Optimize
VERILATOR_FLAGS += -Os -x-assign 0
VERILATOR_FLAGS += -x-assign fast
# Warn about lint issues; may not want this on less solid designs
VERILATOR_FLAGS += -Wall
# Make waveforms

View File

@ -147,7 +147,7 @@ extern uint32_t VL_THREAD_ID() VL_MT_SAFE;
#if VL_THREADED
#define VL_LOCK_SPINS 50000 /// Number of times to spin for a mutex before relaxing
#define VL_LOCK_SPINS 50000 /// Number of times to spin for a mutex before yielding
/// Mutex, wrapped to allow -fthread_safety checks
class VL_CAPABILITY("mutex") VerilatedMutex final {

View File

@ -83,9 +83,11 @@ static_assert(static_cast<int>(FST_ST_VCD_PROGRAM) == static_cast<int>(VLT_TRACE
//=============================================================================
// Specialization of the generics for this trace format
#define VL_DERIVED_T VerilatedFst
#include "verilated_trace_imp.cpp"
#undef VL_DERIVED_T
#define VL_SUB_T VerilatedFst
#define VL_BUF_T VerilatedFstBuffer
#include "verilated_trace_imp.h"
#undef VL_SUB_T
#undef VL_BUF_T
//=============================================================================
// VerilatedFst
@ -111,7 +113,7 @@ void VerilatedFst::open(const char* filename) VL_MT_SAFE_EXCLUDES(m_mutex) {
m_curScope.clear();
VerilatedTrace<VerilatedFst>::traceInit();
Super::traceInit();
// Clear the scope stack
auto it = m_curScope.begin();
@ -133,14 +135,14 @@ void VerilatedFst::open(const char* filename) VL_MT_SAFE_EXCLUDES(m_mutex) {
void VerilatedFst::close() VL_MT_SAFE_EXCLUDES(m_mutex) {
const VerilatedLockGuard lock{m_mutex};
VerilatedTrace<VerilatedFst>::closeBase();
Super::closeBase();
fstWriterClose(m_fst);
m_fst = nullptr;
}
void VerilatedFst::flush() VL_MT_SAFE_EXCLUDES(m_mutex) {
const VerilatedLockGuard lock{m_mutex};
VerilatedTrace<VerilatedFst>::flushBase();
Super::flushBase();
fstWriterFlushContext(m_fst);
}
@ -162,7 +164,7 @@ void VerilatedFst::declare(uint32_t code, const char* name, int dtypenum, fstVar
int lsb) {
const int bits = ((msb > lsb) ? (msb - lsb) : (lsb - msb)) + 1;
const bool enabled = VerilatedTrace<VerilatedFst>::declCode(code, name, bits, false);
const bool enabled = Super::declCode(code, name, bits, false);
if (!enabled) return;
std::string nameasstr = namePrefix() + name;
@ -245,18 +247,42 @@ void VerilatedFst::declDouble(uint32_t code, const char* name, int dtypenum, fst
declare(code, name, dtypenum, vardir, vartype, array, arraynum, false, 63, 0);
}
//=============================================================================
// Get/commit trace buffer
VerilatedFstBuffer* VerilatedFst::getTraceBuffer() { return new VerilatedFstBuffer{*this}; }
void VerilatedFst::commitTraceBuffer(VerilatedFstBuffer* bufp) {
#ifdef VL_TRACE_OFFLOAD
if (bufp->m_offloadBufferWritep) {
m_offloadBufferWritep = bufp->m_offloadBufferWritep;
return; // Buffer will be deleted by the offload thread
}
#endif
delete bufp;
}
//=============================================================================
// VerilatedFstBuffer implementation
VerilatedFstBuffer::VerilatedFstBuffer(VerilatedFst& owner)
: VerilatedTraceBuffer<VerilatedFst, VerilatedFstBuffer>{owner} {}
//=============================================================================
// Trace rendering primitives
// Note: emit* are only ever called from one place (full* in
// verilated_trace_imp.cpp, which is included in this file at the top),
// verilated_trace_imp.h, which is included in this file at the top),
// so always inline them.
VL_ATTR_ALWINLINE
void VerilatedFst::emitBit(uint32_t code, CData newval) {
void VerilatedFstBuffer::emitBit(uint32_t code, CData newval) {
VL_DEBUG_IFDEF(assert(m_symbolp[code]););
fstWriterEmitValueChange(m_fst, m_symbolp[code], newval ? "1" : "0");
}
VL_ATTR_ALWINLINE
void VerilatedFst::emitCData(uint32_t code, CData newval, int bits) {
void VerilatedFstBuffer::emitCData(uint32_t code, CData newval, int bits) {
char buf[VL_BYTESIZE];
VL_DEBUG_IFDEF(assert(m_symbolp[code]););
cvtCDataToStr(buf, newval << (VL_BYTESIZE - bits));
@ -264,7 +290,7 @@ void VerilatedFst::emitCData(uint32_t code, CData newval, int bits) {
}
VL_ATTR_ALWINLINE
void VerilatedFst::emitSData(uint32_t code, SData newval, int bits) {
void VerilatedFstBuffer::emitSData(uint32_t code, SData newval, int bits) {
char buf[VL_SHORTSIZE];
VL_DEBUG_IFDEF(assert(m_symbolp[code]););
cvtSDataToStr(buf, newval << (VL_SHORTSIZE - bits));
@ -272,7 +298,7 @@ void VerilatedFst::emitSData(uint32_t code, SData newval, int bits) {
}
VL_ATTR_ALWINLINE
void VerilatedFst::emitIData(uint32_t code, IData newval, int bits) {
void VerilatedFstBuffer::emitIData(uint32_t code, IData newval, int bits) {
char buf[VL_IDATASIZE];
VL_DEBUG_IFDEF(assert(m_symbolp[code]););
cvtIDataToStr(buf, newval << (VL_IDATASIZE - bits));
@ -280,7 +306,7 @@ void VerilatedFst::emitIData(uint32_t code, IData newval, int bits) {
}
VL_ATTR_ALWINLINE
void VerilatedFst::emitQData(uint32_t code, QData newval, int bits) {
void VerilatedFstBuffer::emitQData(uint32_t code, QData newval, int bits) {
char buf[VL_QUADSIZE];
VL_DEBUG_IFDEF(assert(m_symbolp[code]););
cvtQDataToStr(buf, newval << (VL_QUADSIZE - bits));
@ -288,7 +314,7 @@ void VerilatedFst::emitQData(uint32_t code, QData newval, int bits) {
}
VL_ATTR_ALWINLINE
void VerilatedFst::emitWData(uint32_t code, const WData* newvalp, int bits) {
void VerilatedFstBuffer::emitWData(uint32_t code, const WData* newvalp, int bits) {
int words = VL_WORDS_I(bits);
char* wp = m_strbuf;
// Convert the most significant word
@ -304,6 +330,6 @@ void VerilatedFst::emitWData(uint32_t code, const WData* newvalp, int bits) {
}
VL_ATTR_ALWINLINE
void VerilatedFst::emitDouble(uint32_t code, double newval) {
void VerilatedFstBuffer::emitDouble(uint32_t code, double newval) {
fstWriterEmitValueChange(m_fst, m_symbolp[code], &newval);
}

View File

@ -31,15 +31,19 @@
#include <string>
#include <vector>
class VerilatedFstBuffer;
//=============================================================================
// VerilatedFst
// Base class to create a Verilator FST dump
// This is an internally used class - see VerilatedFstC for what to call from applications
class VerilatedFst final : public VerilatedTrace<VerilatedFst> {
class VerilatedFst final : public VerilatedTrace<VerilatedFst, VerilatedFstBuffer> {
public:
using Super = VerilatedTrace<VerilatedFst, VerilatedFstBuffer>;
private:
// Give the superclass access to private bits (to avoid virtual functions)
friend class VerilatedTrace<VerilatedFst>;
friend Buffer; // Give the buffer access to the private bits
//=========================================================================
// FST specific internals
@ -60,31 +64,26 @@ protected:
//=========================================================================
// Implementation of VerilatedTrace interface
// Implementations of protected virtual methods for VerilatedTrace
// Called when the trace moves forward to a new time point
virtual void emitTimeChange(uint64_t timeui) override;
// Hooks called from VerilatedTrace
virtual bool preFullDump() override { return isOpen(); }
virtual bool preChangeDump() override { return isOpen(); }
// Implementations of duck-typed methods for VerilatedTrace. These are
// called from only one place (namely full*) so always inline them.
inline void emitBit(uint32_t code, CData newval);
inline void emitCData(uint32_t code, CData newval, int bits);
inline void emitSData(uint32_t code, SData newval, int bits);
inline void emitIData(uint32_t code, IData newval, int bits);
inline void emitQData(uint32_t code, QData newval, int bits);
inline void emitWData(uint32_t code, const WData* newvalp, int bits);
inline void emitDouble(uint32_t code, double newval);
// Trace buffer management
virtual VerilatedFstBuffer* getTraceBuffer() override;
virtual void commitTraceBuffer(VerilatedFstBuffer*) override;
public:
//=========================================================================
// External interface to client code
// (All must be threadsafe)
// CONSTRUCTOR
explicit VerilatedFst(void* fst = nullptr);
~VerilatedFst();
// METHODS - All must be thread safe
// Open the file; call isOpen() to see if errors
void open(const char* filename) VL_MT_SAFE_EXCLUDES(m_mutex);
// Close the file
@ -97,11 +96,6 @@ public:
//=========================================================================
// Internal interface to Verilator generated code
// Inside dumping routines, declare a data type
void declDTypeEnum(int dtypenum, const char* name, uint32_t elements, unsigned int minValbits,
const char** itemNamesp, const char** itemValuesp);
// Inside dumping routines, declare a signal
void declBit(uint32_t code, const char* name, int dtypenum, fstVarDir vardir,
fstVarType vartype, bool array, int arraynum);
void declBus(uint32_t code, const char* name, int dtypenum, fstVarDir vardir,
@ -112,18 +106,55 @@ public:
fstVarType vartype, bool array, int arraynum, int msb, int lsb);
void declDouble(uint32_t code, const char* name, int dtypenum, fstVarDir vardir,
fstVarType vartype, bool array, int arraynum);
void declDTypeEnum(int dtypenum, const char* name, uint32_t elements, unsigned int minValbits,
const char** itemNamesp, const char** itemValuesp);
};
#ifndef DOXYGEN
// Declare specialization here as it's used in VerilatedFstC just below
template <> void VerilatedTrace<VerilatedFst>::dump(uint64_t timeui);
template <> void VerilatedTrace<VerilatedFst>::set_time_unit(const char* unitp);
template <> void VerilatedTrace<VerilatedFst>::set_time_unit(const std::string& unit);
template <> void VerilatedTrace<VerilatedFst>::set_time_resolution(const char* unitp);
template <> void VerilatedTrace<VerilatedFst>::set_time_resolution(const std::string& unit);
template <> void VerilatedTrace<VerilatedFst>::dumpvars(int level, const std::string& hier);
template <> void VerilatedFst::Super::dump(uint64_t time);
template <> void VerilatedFst::Super::set_time_unit(const char* unitp);
template <> void VerilatedFst::Super::set_time_unit(const std::string& unit);
template <> void VerilatedFst::Super::set_time_resolution(const char* unitp);
template <> void VerilatedFst::Super::set_time_resolution(const std::string& unit);
template <> void VerilatedFst::Super::dumpvars(int level, const std::string& hier);
#endif
//=============================================================================
// VerilatedFstBuffer
class VerilatedFstBuffer final : public VerilatedTraceBuffer<VerilatedFst, VerilatedFstBuffer> {
// Give the trace file access to the private bits
friend VerilatedFst;
friend VerilatedFst::Super;
// The FST file handle
void* const m_fst = m_owner.m_fst;
// code to fstHande map, as an array
const fstHandle* const m_symbolp = m_owner.m_symbolp;
// String buffer long enough to hold maxBits() chars
char* const m_strbuf = m_owner.m_strbuf;
public:
// CONSTRUCTOR
explicit VerilatedFstBuffer(VerilatedFst& owner);
~VerilatedFstBuffer() = default;
//=========================================================================
// Implementation of VerilatedTraceBuffer interface
// Implementations of duck-typed methods for VerilatedTraceBuffer. These are
// called from only one place (the full* methods), so always inline them.
VL_ATTR_ALWINLINE inline void emitBit(uint32_t code, CData newval);
VL_ATTR_ALWINLINE inline void emitCData(uint32_t code, CData newval, int bits);
VL_ATTR_ALWINLINE inline void emitSData(uint32_t code, SData newval, int bits);
VL_ATTR_ALWINLINE inline void emitIData(uint32_t code, IData newval, int bits);
VL_ATTR_ALWINLINE inline void emitQData(uint32_t code, QData newval, int bits);
VL_ATTR_ALWINLINE inline void emitWData(uint32_t code, const WData* newvalp, int bits);
VL_ATTR_ALWINLINE inline void emitDouble(uint32_t code, double newval);
};
//=============================================================================
// VerilatedFstC
/// Create a FST dump file in C standalone (no SystemC) simulations.

View File

@ -60,7 +60,7 @@ uint16_t VlExecutionRecord::getcpu() {
//=============================================================================
// VlExecutionProfiler implementation
template <size_t N> size_t roundUptoMultipleOf(size_t value) {
template <size_t N> static size_t roundUptoMultipleOf(size_t value) {
static_assert((N & (N - 1)) == 0, "'N' must be a power of 2");
size_t mask = N - 1;
return (value + mask) & ~mask;

View File

@ -22,28 +22,43 @@
#ifndef VERILATOR_VERILATED_TRACE_H_
#define VERILATOR_VERILATED_TRACE_H_
#ifdef VL_TRACE_THREADED
#define VL_TRACE_OFFLOAD
// clang-format off
// In FST mode, VL_TRACE_THREADED enables offloading, but only if we also have
// the FST writer thread. This means with --trace-threads 1, we get the FST
// writer thread only, and with --trace-threads 2 we get offloading as well
#if defined(VL_TRACE_FST_WRITER_THREAD) && defined(VL_TRACE_THREADED)
# define VL_TRACE_OFFLOAD
#endif
// VCD tracing can happen fully in parallel
#if defined(VM_TRACE_VCD) && VM_TRACE_VCD && defined(VL_TRACE_THREADED)
# define VL_TRACE_PARALLEL
#endif
// clang-format off
#if defined(VL_TRACE_PARALLEL) && defined(VL_TRACE_OFFLOAD)
# error "Cannot have VL_TRACE_PARALLEL and VL_TRACE_OFFLOAD together"
#endif
#include "verilated.h"
#include "verilated_trace_defs.h"
#include <bitset>
#include <condition_variable>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#ifdef VL_TRACE_OFFLOAD
# include <condition_variable>
# include <deque>
# include <thread>
#endif
// clang-format on
class VlThreadPool;
template <class T_Trace, class T_Buffer> class VerilatedTraceBuffer;
#ifdef VL_TRACE_OFFLOAD
//=============================================================================
// Offloaded tracing
@ -106,7 +121,8 @@ public:
CHG_WDATA = 0x6,
CHG_DOUBLE = 0x8,
// TODO: full..
TIME_CHANGE = 0xd,
TIME_CHANGE = 0xc,
TRACE_BUFFER = 0xd,
END = 0xe, // End of buffer
SHUTDOWN = 0xf // Shutdown worker thread, also marks end of buffer
};
@ -116,16 +132,22 @@ public:
//=============================================================================
// VerilatedTrace
// VerilatedTrace uses F-bounded polymorphism to access duck-typed
// implementations in the format specific derived class, which must be passed
// as the type parameter T_Derived
template <class T_Derived> class VerilatedTrace VL_NOT_FINAL {
// T_Trace is the format specific subclass of VerilatedTrace.
// T_Buffer is the format specific subclass of VerilatedTraceBuffer.
template <class T_Trace, class T_Buffer> class VerilatedTrace VL_NOT_FINAL {
// Give the buffer (both base and derived) access to the private bits
friend VerilatedTraceBuffer<T_Trace, T_Buffer>;
friend T_Buffer;
public:
using Buffer = T_Buffer;
//=========================================================================
// Generic tracing internals
using initCb_t = void (*)(void*, T_Derived*, uint32_t); // Type of init callbacks
using dumpCb_t = void (*)(void*, T_Derived*); // Type of all but init callbacks
using initCb_t = void (*)(void*, T_Trace*, uint32_t); // Type of init callbacks
using dumpCb_t = void (*)(void*, Buffer*); // Type of dump callbacks
using cleanupCb_t = void (*)(void*, T_Trace*); // Type of cleanup callbacks
private:
struct CallbackRecord {
@ -133,9 +155,10 @@ private:
// (the one in Ubuntu 14.04 with GCC 4.8.4 in particular) use the
// assignment operator on inserting into collections, so they don't work
// with const fields...
union {
initCb_t m_initCb; // The callback function
dumpCb_t m_dumpCb; // The callback function
union { // The callback
initCb_t m_initCb;
dumpCb_t m_dumpCb;
cleanupCb_t m_cleanupCb;
};
void* m_userp; // The user pointer to pass to the callback (the symbol table)
CallbackRecord(initCb_t cb, void* userp)
@ -144,32 +167,66 @@ private:
CallbackRecord(dumpCb_t cb, void* userp)
: m_dumpCb{cb}
, m_userp{userp} {}
CallbackRecord(cleanupCb_t cb, void* userp)
: m_cleanupCb{cb}
, m_userp{userp} {}
};
uint32_t* m_sigs_oldvalp; // Old value store
EData* m_sigs_enabledp; // Bit vector of enabled codes (nullptr = all on)
uint64_t m_timeLastDump; // Last time we did a dump
#ifdef VL_TRACE_PARALLEL
struct ParallelWorkerData {
const dumpCb_t m_cb; // The callback
void* const m_userp; // The user pointer to pass to the callback
Buffer* const m_bufp; // The buffer pointer to pass to the callback
std::atomic<bool> m_ready{false}; // The ready flag
mutable VerilatedMutex m_mutex; // Mutex for suspension until ready
std::condition_variable_any m_cv; // Condition variable for suspension
bool m_waiting VL_GUARDED_BY(m_mutex) = false; // Whether a thread is suspended in wait()
void wait();
ParallelWorkerData(dumpCb_t cb, void* userp, Buffer* bufp)
: m_cb{cb}
, m_userp{userp}
, m_bufp{bufp} {}
};
// Passed a ParallelWorkerData*, second argument is ignored
static void parallelWorkerTask(void*, bool);
#endif
using ParallelCallbackMap = std::unordered_map<VlThreadPool*, std::vector<CallbackRecord>>;
protected:
uint32_t* m_sigs_oldvalp = nullptr; // Previous value store
EData* m_sigs_enabledp = nullptr; // Bit vector of enabled codes (nullptr = all on)
private:
uint64_t m_timeLastDump = 0; // Last time we did a dump
std::vector<bool> m_sigs_enabledVec; // Staging for m_sigs_enabledp
std::vector<CallbackRecord> m_initCbs; // Routines to initialize traciong
std::vector<CallbackRecord> m_fullCbs; // Routines to perform full dump
std::vector<CallbackRecord> m_chgCbs; // Routines to perform incremental dump
std::vector<CallbackRecord> m_initCbs; // Routines to initialize tracing
ParallelCallbackMap m_fullCbs; // Routines to perform full dump
ParallelCallbackMap m_chgCbs; // Routines to perform incremental dump
std::vector<CallbackRecord> m_cleanupCbs; // Routines to call at the end of dump
bool m_fullDump; // Whether a full dump is required on the next call to 'dump'
uint32_t m_nextCode; // Next code number to assign
uint32_t m_numSignals; // Number of distinct signals
uint32_t m_maxBits; // Number of bits in the widest signal
std::vector<VlThreadPool*> m_threadPoolps; // All thread pools, in insertion order
bool m_fullDump = true; // Whether a full dump is required on the next call to 'dump'
uint32_t m_nextCode = 0; // Next code number to assign
uint32_t m_numSignals = 0; // Number of distinct signals
uint32_t m_maxBits = 0; // Number of bits in the widest signal
std::vector<std::string> m_namePrefixStack{""}; // Path prefixes to add to signal names
std::vector<std::pair<int, std::string>> m_dumpvars; // dumpvar() entries
char m_scopeEscape;
double m_timeRes; // Time resolution (ns/ms etc)
double m_timeUnit; // Time units (ns/ms etc)
char m_scopeEscape = '.';
double m_timeRes = 1e-9; // Time resolution (ns/ms etc)
double m_timeUnit = 1e-0; // Time units (ns/ms etc)
void addThreadPool(VlThreadPool* threadPoolp) VL_MT_SAFE_EXCLUDES(m_mutex);
void addCallbackRecord(std::vector<CallbackRecord>& cbVec, CallbackRecord& cbRec)
VL_MT_SAFE_EXCLUDES(m_mutex);
// Equivalent to 'this' but is of the sub-type 'T_Derived*'. Use 'self()->'
// Equivalent to 'this' but is of the sub-type 'T_Trace*'. Use 'self()->'
// to access duck-typed functions to avoid a virtual function call.
T_Derived* self() { return static_cast<T_Derived*>(this); }
T_Trace* self() { return static_cast<T_Trace*>(this); }
void runParallelCallbacks(const ParallelCallbackMap& cbMap);
// Flush any remaining data for this file
static void onFlush(void* selfp) VL_MT_UNSAFE_ONE;
@ -178,17 +235,21 @@ private:
#ifdef VL_TRACE_OFFLOAD
// Number of total offload buffers that have been allocated
uint32_t m_numOffloadBuffers;
uint32_t m_numOffloadBuffers = 0;
// Size of offload buffers
size_t m_offloadBufferSize;
size_t m_offloadBufferSize = 0;
// Buffers handed to worker for processing
VerilatedThreadQueue<uint32_t*> m_offloadBuffersToWorker;
// Buffers returned from worker after processing
VerilatedThreadQueue<uint32_t*> m_offloadBuffersFromWorker;
protected:
// Write pointer into current buffer
uint32_t* m_offloadBufferWritep;
uint32_t* m_offloadBufferWritep = nullptr;
// End of offload buffer
uint32_t* m_offloadBufferEndp;
uint32_t* m_offloadBufferEndp = nullptr;
private:
// The offload worker thread itself
std::unique_ptr<std::thread> m_workerThread;
@ -250,6 +311,10 @@ protected:
virtual bool preFullDump() = 0;
virtual bool preChangeDump() = 0;
// Trace buffer management
virtual Buffer* getTraceBuffer() = 0;
virtual void commitTraceBuffer(Buffer*) = 0;
public:
//=========================================================================
// External interface to client code
@ -270,19 +335,55 @@ public:
// Call
void dump(uint64_t timeui) VL_MT_SAFE_EXCLUDES(m_mutex);
//=========================================================================
// Internal interface to Verilator generated code
//=========================================================================
// Non-hot path internal interface to Verilator generated code
void addInitCb(initCb_t cb, void* userp) VL_MT_SAFE;
void addFullCb(dumpCb_t cb, void* userp) VL_MT_SAFE;
void addChgCb(dumpCb_t cb, void* userp) VL_MT_SAFE;
void addCleanupCb(dumpCb_t cb, void* userp) VL_MT_SAFE;
void addFullCb(dumpCb_t cb, void* userp, VlThreadPool* = nullptr) VL_MT_SAFE;
void addChgCb(dumpCb_t cb, void* userp, VlThreadPool* = nullptr) VL_MT_SAFE;
void addCleanupCb(cleanupCb_t cb, void* userp) VL_MT_SAFE;
void scopeEscape(char flag) { m_scopeEscape = flag; }
void pushNamePrefix(const std::string&);
void popNamePrefix(unsigned count = 1);
};
//=============================================================================
// VerilatedTraceBuffer
// T_Trace is the format specific subclass of VerilatedTrace.
// T_Buffer is the format specific subclass of VerilatedTraceBuffer.
// The format-specific hot-path methods use duck-typing via T_Buffer for performance.
template <class T_Trace, class T_Buffer> class VerilatedTraceBuffer VL_NOT_FINAL {
friend T_Trace; // Give the trace file access to the private bits
protected:
T_Trace& m_owner; // The VerilatedTrace subclass that owns this buffer
// Previous value store
uint32_t* const m_sigs_oldvalp = m_owner.m_sigs_oldvalp;
// Bit vector of enabled codes (nullptr = all on)
EData* const m_sigs_enabledp = m_owner.m_sigs_enabledp;
#ifdef VL_TRACE_OFFLOAD
// Write pointer into current buffer
uint32_t* m_offloadBufferWritep = m_owner.m_offloadBufferWritep;
// End of offload buffer
uint32_t* const m_offloadBufferEndp = m_owner.m_offloadBufferEndp;
#endif
// Equivalent to 'this' but is of the sub-type 'T_Buffer*'. Use 'self()->'
// to access duck-typed functions to avoid a virtual function call.
inline T_Buffer* self() { return static_cast<T_Buffer*>(this); }
explicit VerilatedTraceBuffer(T_Trace& owner);
virtual ~VerilatedTraceBuffer() = default;
public:
//=========================================================================
// Hot path internal interface to Verilator generated code
@ -300,7 +401,7 @@ public:
// duck-typed void emitWData(uint32_t code, const WData* newvalp, int bits) = 0;
// duck-typed void emitDouble(uint32_t code, double newval) = 0;
uint32_t* oldp(uint32_t code) { return m_sigs_oldvalp + code; }
VL_ATTR_ALWINLINE inline uint32_t* oldp(uint32_t code) { return m_sigs_oldvalp + code; }
// Write to previous value buffer value and emit trace entry.
void fullBit(uint32_t* oldp, CData newval);
@ -363,9 +464,13 @@ public:
VL_DEBUG_IF(assert(m_offloadBufferWritep <= m_offloadBufferEndp););
}
#define CHG(name) chg##name##Impl
#else
#define CHG(name) chg##name
#define chgBit chgBitImpl
#define chgCData chgCDataImpl
#define chgSData chgSDataImpl
#define chgIData chgIDataImpl
#define chgQData chgQDataImpl
#define chgWData chgWDataImpl
#define chgDouble chgDoubleImpl
#endif
// In non-offload mode, these are called directly by the trace callbacks,
@ -373,27 +478,27 @@ public:
// thread and are called chg*Impl
// Check previous dumped value of signal. If changed, then emit trace entry
inline void CHG(Bit)(uint32_t* oldp, CData newval) {
VL_ATTR_ALWINLINE inline void chgBit(uint32_t* oldp, CData newval) {
const uint32_t diff = *oldp ^ newval;
if (VL_UNLIKELY(diff)) fullBit(oldp, newval);
}
inline void CHG(CData)(uint32_t* oldp, CData newval, int bits) {
VL_ATTR_ALWINLINE inline void chgCData(uint32_t* oldp, CData newval, int bits) {
const uint32_t diff = *oldp ^ newval;
if (VL_UNLIKELY(diff)) fullCData(oldp, newval, bits);
}
inline void CHG(SData)(uint32_t* oldp, SData newval, int bits) {
VL_ATTR_ALWINLINE inline void chgSData(uint32_t* oldp, SData newval, int bits) {
const uint32_t diff = *oldp ^ newval;
if (VL_UNLIKELY(diff)) fullSData(oldp, newval, bits);
}
inline void CHG(IData)(uint32_t* oldp, IData newval, int bits) {
VL_ATTR_ALWINLINE inline void chgIData(uint32_t* oldp, IData newval, int bits) {
const uint32_t diff = *oldp ^ newval;
if (VL_UNLIKELY(diff)) fullIData(oldp, newval, bits);
}
inline void CHG(QData)(uint32_t* oldp, QData newval, int bits) {
VL_ATTR_ALWINLINE inline void chgQData(uint32_t* oldp, QData newval, int bits) {
const uint64_t diff = *reinterpret_cast<QData*>(oldp) ^ newval;
if (VL_UNLIKELY(diff)) fullQData(oldp, newval, bits);
}
inline void CHG(WData)(uint32_t* oldp, const WData* newvalp, int bits) {
VL_ATTR_ALWINLINE inline void chgWData(uint32_t* oldp, const WData* newvalp, int bits) {
for (int i = 0; i < (bits + 31) / 32; ++i) {
if (VL_UNLIKELY(oldp[i] ^ newvalp[i])) {
fullWData(oldp, newvalp, bits);
@ -401,11 +506,20 @@ public:
}
}
}
inline void CHG(Double)(uint32_t* oldp, double newval) {
VL_ATTR_ALWINLINE inline void chgDouble(uint32_t* oldp, double newval) {
// cppcheck-suppress invalidPointerCast
if (VL_UNLIKELY(*reinterpret_cast<double*>(oldp) != newval)) fullDouble(oldp, newval);
}
#undef CHG
#ifdef VL_TRACE_OFFLOAD
#undef chgBit
#undef chgCData
#undef chgSData
#undef chgIData
#undef chgQData
#undef chgWData
#undef chgDouble
#endif
};
#endif // guard

View File

@ -10,26 +10,26 @@
// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
//
//=============================================================================
///
/// \file
/// \brief Verilated common-format tracing implementation code
///
/// This file must be compiled and linked against all Verilated objects
/// that use --trace.
///
/// Use "verilator --trace" to add this to the Makefile for the linker.
///
//
// Verilated tracing implementation code template common to all formats.
// This file is included by the format specific implementations and
// should not be used otherwise.
//
//=============================================================================
// clang-format off
#ifndef VL_CPPCHECK
#ifndef VL_DERIVED_T
#if !defined(VL_SUB_T) || !defined(VL_BUF_T)
# error "This file should be included in trace format implementations"
#endif
#include "verilated_intrinsics.h"
#include "verilated_trace.h"
#ifdef VL_TRACE_PARALLEL
# include "verilated_threads.h"
# include <list>
#endif
#if 0
# include <iostream>
@ -82,7 +82,7 @@ static std::string doubleToTimescale(double value) {
//=========================================================================
// Buffer management
template <> uint32_t* VerilatedTrace<VL_DERIVED_T>::getOffloadBuffer() {
template <> uint32_t* VerilatedTrace<VL_SUB_T, VL_BUF_T>::getOffloadBuffer() {
uint32_t* bufferp;
// Some jitter is expected, so some number of alternative offload buffers are
// required, but don't allocate more than 8 buffers.
@ -101,7 +101,7 @@ template <> uint32_t* VerilatedTrace<VL_DERIVED_T>::getOffloadBuffer() {
return bufferp;
}
template <> void VerilatedTrace<VL_DERIVED_T>::waitForOffloadBuffer(const uint32_t* buffp) {
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::waitForOffloadBuffer(const uint32_t* buffp) {
// Slow path code only called on flush/shutdown, so use a simple algorithm.
// Collect buffers from worker and stash them until we get the one we want.
std::deque<uint32_t*> stash;
@ -116,7 +116,7 @@ template <> void VerilatedTrace<VL_DERIVED_T>::waitForOffloadBuffer(const uint32
//=========================================================================
// Worker thread
template <> void VerilatedTrace<VL_DERIVED_T>::offloadWorkerThreadMain() {
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::offloadWorkerThreadMain() {
bool shutdown = false;
do {
@ -127,6 +127,8 @@ template <> void VerilatedTrace<VL_DERIVED_T>::offloadWorkerThreadMain() {
const uint32_t* readp = bufferp;
std::unique_ptr<VL_BUF_T> traceBufp; // We own the passed tracebuffer
while (true) {
const uint32_t cmd = readp[0];
const uint32_t top = cmd >> 4;
@ -141,44 +143,44 @@ template <> void VerilatedTrace<VL_DERIVED_T>::offloadWorkerThreadMain() {
// CHG_* commands
case VerilatedTraceOffloadCommand::CHG_BIT_0:
VL_TRACE_OFFLOAD_DEBUG("Command CHG_BIT_0 " << top);
chgBitImpl(oldp, 0);
traceBufp->chgBitImpl(oldp, 0);
continue;
case VerilatedTraceOffloadCommand::CHG_BIT_1:
VL_TRACE_OFFLOAD_DEBUG("Command CHG_BIT_1 " << top);
chgBitImpl(oldp, 1);
traceBufp->chgBitImpl(oldp, 1);
continue;
case VerilatedTraceOffloadCommand::CHG_CDATA:
VL_TRACE_OFFLOAD_DEBUG("Command CHG_CDATA " << top);
// Bits stored in bottom byte of command
chgCDataImpl(oldp, *readp, top);
traceBufp->chgCDataImpl(oldp, *readp, top);
readp += 1;
continue;
case VerilatedTraceOffloadCommand::CHG_SDATA:
VL_TRACE_OFFLOAD_DEBUG("Command CHG_SDATA " << top);
// Bits stored in bottom byte of command
chgSDataImpl(oldp, *readp, top);
traceBufp->chgSDataImpl(oldp, *readp, top);
readp += 1;
continue;
case VerilatedTraceOffloadCommand::CHG_IDATA:
VL_TRACE_OFFLOAD_DEBUG("Command CHG_IDATA " << top);
// Bits stored in bottom byte of command
chgIDataImpl(oldp, *readp, top);
traceBufp->chgIDataImpl(oldp, *readp, top);
readp += 1;
continue;
case VerilatedTraceOffloadCommand::CHG_QDATA:
VL_TRACE_OFFLOAD_DEBUG("Command CHG_QDATA " << top);
// Bits stored in bottom byte of command
chgQDataImpl(oldp, *reinterpret_cast<const QData*>(readp), top);
traceBufp->chgQDataImpl(oldp, *reinterpret_cast<const QData*>(readp), top);
readp += 2;
continue;
case VerilatedTraceOffloadCommand::CHG_WDATA:
VL_TRACE_OFFLOAD_DEBUG("Command CHG_WDATA " << top);
chgWDataImpl(oldp, readp, top);
traceBufp->chgWDataImpl(oldp, readp, top);
readp += VL_WORDS_I(top);
continue;
case VerilatedTraceOffloadCommand::CHG_DOUBLE:
VL_TRACE_OFFLOAD_DEBUG("Command CHG_DOUBLE " << top);
chgDoubleImpl(oldp, *reinterpret_cast<const double*>(readp));
traceBufp->chgDoubleImpl(oldp, *reinterpret_cast<const double*>(readp));
readp += 2;
continue;
@ -191,9 +193,18 @@ template <> void VerilatedTrace<VL_DERIVED_T>::offloadWorkerThreadMain() {
readp += 2;
continue;
case VerilatedTraceOffloadCommand::TRACE_BUFFER:
VL_TRACE_OFFLOAD_DEBUG("Command TRACE_BUFFER " << top);
readp -= 1; // No code in this command, undo increment
traceBufp.reset(*reinterpret_cast<VL_BUF_T* const*>(readp));
readp += 2;
continue;
//===
// Commands ending this buffer
case VerilatedTraceOffloadCommand::END: VL_TRACE_OFFLOAD_DEBUG("Command END"); break;
case VerilatedTraceOffloadCommand::END: //
VL_TRACE_OFFLOAD_DEBUG("Command END");
break;
case VerilatedTraceOffloadCommand::SHUTDOWN:
VL_TRACE_OFFLOAD_DEBUG("Command SHUTDOWN");
shutdown = true;
@ -202,8 +213,7 @@ template <> void VerilatedTrace<VL_DERIVED_T>::offloadWorkerThreadMain() {
//===
// Unknown command
default: { // LCOV_EXCL_START
VL_TRACE_OFFLOAD_DEBUG("Command UNKNOWN");
VL_PRINTF_MT("Trace command: 0x%08x\n", cmd);
VL_TRACE_OFFLOAD_DEBUG("Command UNKNOWN " << cmd);
VL_FATAL_MT(__FILE__, __LINE__, "", "Unknown trace command");
break;
} // LCOV_EXCL_STOP
@ -221,7 +231,7 @@ template <> void VerilatedTrace<VL_DERIVED_T>::offloadWorkerThreadMain() {
} while (VL_LIKELY(!shutdown));
}
template <> void VerilatedTrace<VL_DERIVED_T>::shutdownOffloadWorker() {
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::shutdownOffloadWorker() {
// If the worker thread is not running, done..
if (!m_workerThread) return;
@ -241,7 +251,7 @@ template <> void VerilatedTrace<VL_DERIVED_T>::shutdownOffloadWorker() {
//=============================================================================
// Life cycle
template <> void VerilatedTrace<VL_DERIVED_T>::closeBase() {
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::closeBase() {
#ifdef VL_TRACE_OFFLOAD
shutdownOffloadWorker();
while (m_numOffloadBuffers) {
@ -251,7 +261,7 @@ template <> void VerilatedTrace<VL_DERIVED_T>::closeBase() {
#endif
}
template <> void VerilatedTrace<VL_DERIVED_T>::flushBase() {
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::flushBase() {
#ifdef VL_TRACE_OFFLOAD
// Hand an empty buffer to the worker thread
uint32_t* const bufferp = getOffloadBuffer();
@ -266,46 +276,29 @@ template <> void VerilatedTrace<VL_DERIVED_T>::flushBase() {
//=============================================================================
// Callbacks to run on global events
template <> void VerilatedTrace<VL_DERIVED_T>::onFlush(void* selfp) {
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::onFlush(void* selfp) {
// This calls 'flush' on the derived class (which must then get any mutex)
reinterpret_cast<VL_DERIVED_T*>(selfp)->flush();
reinterpret_cast<VL_SUB_T*>(selfp)->flush();
}
template <> void VerilatedTrace<VL_DERIVED_T>::onExit(void* selfp) {
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::onExit(void* selfp) {
// This calls 'close' on the derived class (which must then get any mutex)
reinterpret_cast<VL_DERIVED_T*>(selfp)->close();
reinterpret_cast<VL_SUB_T*>(selfp)->close();
}
//=============================================================================
// VerilatedTrace
template <>
VerilatedTrace<VL_DERIVED_T>::VerilatedTrace()
: m_sigs_oldvalp{nullptr}
, m_sigs_enabledp{nullptr}
, m_timeLastDump{0}
, m_fullDump{true}
, m_nextCode{0}
, m_numSignals{0}
, m_maxBits{0}
, m_scopeEscape{'.'}
, m_timeRes{1e-9}
, m_timeUnit {
1e-9
}
#ifdef VL_TRACE_OFFLOAD
, m_numOffloadBuffers { 0 }
#endif
{
template <> VerilatedTrace<VL_SUB_T, VL_BUF_T>::VerilatedTrace() {
set_time_unit(Verilated::threadContextp()->timeunitString());
set_time_resolution(Verilated::threadContextp()->timeprecisionString());
}
template <> VerilatedTrace<VL_DERIVED_T>::~VerilatedTrace() {
template <> VerilatedTrace<VL_SUB_T, VL_BUF_T>::~VerilatedTrace() {
if (m_sigs_oldvalp) VL_DO_CLEAR(delete[] m_sigs_oldvalp, m_sigs_oldvalp = nullptr);
if (m_sigs_enabledp) VL_DO_CLEAR(delete[] m_sigs_enabledp, m_sigs_enabledp = nullptr);
Verilated::removeFlushCb(VerilatedTrace<VL_DERIVED_T>::onFlush, this);
Verilated::removeExitCb(VerilatedTrace<VL_DERIVED_T>::onExit, this);
Verilated::removeFlushCb(VerilatedTrace<VL_SUB_T, VL_BUF_T>::onFlush, this);
Verilated::removeExitCb(VerilatedTrace<VL_SUB_T, VL_BUF_T>::onExit, this);
#ifdef VL_TRACE_OFFLOAD
closeBase();
#endif
@ -314,7 +307,7 @@ template <> VerilatedTrace<VL_DERIVED_T>::~VerilatedTrace() {
//=========================================================================
// Internals available to format specific implementations
template <> void VerilatedTrace<VL_DERIVED_T>::traceInit() VL_MT_UNSAFE {
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::traceInit() VL_MT_UNSAFE {
// Note: It is possible to re-open a trace file (VCD in particular),
// so we must reset the next code here, but it must have the same number
// of codes on re-open
@ -359,8 +352,8 @@ template <> void VerilatedTrace<VL_DERIVED_T>::traceInit() VL_MT_UNSAFE {
}
// Set callback so flush/abort will flush this file
Verilated::addFlushCb(VerilatedTrace<VL_DERIVED_T>::onFlush, this);
Verilated::addExitCb(VerilatedTrace<VL_DERIVED_T>::onExit, this);
Verilated::addFlushCb(VerilatedTrace<VL_SUB_T, VL_BUF_T>::onFlush, this);
Verilated::addExitCb(VerilatedTrace<VL_SUB_T, VL_BUF_T>::onExit, this);
#ifdef VL_TRACE_OFFLOAD
// Compute offload buffer size. we need to be able to store a new value for
@ -372,13 +365,13 @@ template <> void VerilatedTrace<VL_DERIVED_T>::traceInit() VL_MT_UNSAFE {
// Start the worker thread
m_workerThread.reset(
new std::thread{&VerilatedTrace<VL_DERIVED_T>::offloadWorkerThreadMain, this});
new std::thread{&VerilatedTrace<VL_SUB_T, VL_BUF_T>::offloadWorkerThreadMain, this});
#endif
}
template <>
bool VerilatedTrace<VL_DERIVED_T>::declCode(uint32_t code, const char* namep, uint32_t bits,
bool tri) {
bool VerilatedTrace<VL_SUB_T, VL_BUF_T>::declCode(uint32_t code, const char* namep, uint32_t bits,
bool tri) {
if (VL_UNCOVERABLE(!code)) {
VL_FATAL_MT(__FILE__, __LINE__, "", "Internal: internal trace problem, code 0 is illegal");
}
@ -422,28 +415,30 @@ bool VerilatedTrace<VL_DERIVED_T>::declCode(uint32_t code, const char* namep, ui
//=========================================================================
// Internals available to format specific implementations
template <> std::string VerilatedTrace<VL_DERIVED_T>::timeResStr() const {
template <> std::string VerilatedTrace<VL_SUB_T, VL_BUF_T>::timeResStr() const {
return doubleToTimescale(m_timeRes);
}
//=========================================================================
// External interface to client code
template <> void VerilatedTrace<VL_DERIVED_T>::set_time_unit(const char* unitp) VL_MT_SAFE {
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::set_time_unit(const char* unitp) VL_MT_SAFE {
m_timeUnit = timescaleToDouble(unitp);
}
template <> void VerilatedTrace<VL_DERIVED_T>::set_time_unit(const std::string& unit) VL_MT_SAFE {
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::set_time_unit(const std::string& unit) VL_MT_SAFE {
set_time_unit(unit.c_str());
}
template <> void VerilatedTrace<VL_DERIVED_T>::set_time_resolution(const char* unitp) VL_MT_SAFE {
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::set_time_resolution(const char* unitp) VL_MT_SAFE {
m_timeRes = timescaleToDouble(unitp);
}
template <>
void VerilatedTrace<VL_DERIVED_T>::set_time_resolution(const std::string& unit) VL_MT_SAFE {
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::set_time_resolution(const std::string& unit) VL_MT_SAFE {
set_time_resolution(unit.c_str());
}
template <>
void VerilatedTrace<VL_DERIVED_T>::dumpvars(int level, const std::string& hier) VL_MT_SAFE {
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::dumpvars(int level, const std::string& hier) VL_MT_SAFE {
if (level == 0) {
m_dumpvars.clear(); // empty = everything on
} else {
@ -456,7 +451,87 @@ void VerilatedTrace<VL_DERIVED_T>::dumpvars(int level, const std::string& hier)
}
}
template <> void VerilatedTrace<VL_DERIVED_T>::dump(uint64_t timeui) VL_MT_SAFE_EXCLUDES(m_mutex) {
#ifdef VL_TRACE_PARALLEL
template <> //
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::parallelWorkerTask(void* datap, bool) {
ParallelWorkerData* const wdp = reinterpret_cast<ParallelWorkerData*>(datap);
// Run the task
wdp->m_cb(wdp->m_userp, wdp->m_bufp);
// Mark buffer as ready
const VerilatedLockGuard lock{wdp->m_mutex};
wdp->m_ready.store(true);
if (wdp->m_waiting) wdp->m_cv.notify_one();
}
template <> VL_ATTR_NOINLINE void VerilatedTrace<VL_SUB_T, VL_BUF_T>::ParallelWorkerData::wait() {
// Spin for a while, waiting for the buffer to become ready
for (int i = 0; i < VL_LOCK_SPINS; ++i) {
if (VL_LIKELY(m_ready.load(std::memory_order_relaxed))) return;
VL_CPU_RELAX();
}
// We have been spinning for a while, so yield the thread
VerilatedLockGuard lock{m_mutex};
m_waiting = true;
m_cv.wait(lock, [this] { return m_ready.load(std::memory_order_relaxed); });
m_waiting = false;
}
#endif
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::runParallelCallbacks(const ParallelCallbackMap& cbMap) {
for (VlThreadPool* threadPoolp : m_threadPoolps) {
#ifdef VL_TRACE_PARALLEL
// If tracing in parallel, dispatch to the thread pool (if exists)
if (threadPoolp && threadPoolp->numThreads()) {
// List of work items for thread (std::list, as ParallelWorkerData is not movable)
std::list<ParallelWorkerData> workerData;
// We use the whole pool + the main thread
const unsigned threads = threadPoolp->numThreads() + 1;
// Main thread executes all jobs with index % threads == 0
std::vector<ParallelWorkerData*> mainThreadWorkerData;
// The tracing callbacks to execute on this thread-pool
const auto& cbVec = cbMap.at(threadPoolp);
// Enqueue all the jobs
for (unsigned i = 0; i < cbVec.size(); ++i) {
const CallbackRecord& cbr = cbVec[i];
// Always get the trace buffer on the main thread
Buffer* const bufp = getTraceBuffer();
// Create new work item
workerData.emplace_back(cbr.m_dumpCb, cbr.m_userp, bufp);
// Grab the new work item
ParallelWorkerData* const itemp = &workerData.back();
// Enqueue task to thread pool, or main thread
if (unsigned rem = i % threads) {
threadPoolp->workerp(rem - 1)->addTask(parallelWorkerTask, itemp, false);
} else {
mainThreadWorkerData.push_back(itemp);
}
}
// Execute jobs assigned to the main thread
for (ParallelWorkerData* const itemp : mainThreadWorkerData) {
parallelWorkerTask(itemp, false);
}
// Commit all trace buffers in order
for (ParallelWorkerData& item : workerData) {
// Wait until ready
item.wait();
// Commit the buffer
commitTraceBuffer(item.m_bufp);
}
continue;
}
#endif
// Fall back on sequential execution
for (const CallbackRecord& cbr : cbMap.at(threadPoolp)) {
Buffer* const traceBufferp = getTraceBuffer();
cbr.m_dumpCb(cbr.m_userp, traceBufferp);
commitTraceBuffer(traceBufferp);
}
}
}
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::dump(uint64_t timeui) VL_MT_SAFE_EXCLUDES(m_mutex) {
// Not really VL_MT_SAFE but more VL_MT_UNSAFE_ONE.
// This does get the mutex, but if multiple threads are trying to dump
// chances are the data being dumped will have other problems
@ -504,20 +579,14 @@ template <> void VerilatedTrace<VL_DERIVED_T>::dump(uint64_t timeui) VL_MT_SAFE_
// Run the callbacks
if (VL_UNLIKELY(m_fullDump)) {
m_fullDump = false; // No more need for next dump to be full
for (uint32_t i = 0; i < m_fullCbs.size(); ++i) {
const CallbackRecord& cbr = m_fullCbs[i];
cbr.m_dumpCb(cbr.m_userp, self());
}
runParallelCallbacks(m_fullCbs);
} else {
for (uint32_t i = 0; i < m_chgCbs.size(); ++i) {
const CallbackRecord& cbr = m_chgCbs[i];
cbr.m_dumpCb(cbr.m_userp, self());
}
runParallelCallbacks(m_chgCbs);
}
for (uint32_t i = 0; i < m_cleanupCbs.size(); ++i) {
const CallbackRecord& cbr = m_cleanupCbs[i];
cbr.m_dumpCb(cbr.m_userp, self());
cbr.m_cleanupCb(cbr.m_userp, self());
}
#ifdef VL_TRACE_OFFLOAD
@ -538,8 +607,18 @@ template <> void VerilatedTrace<VL_DERIVED_T>::dump(uint64_t timeui) VL_MT_SAFE_
// Non-hot path internal interface to Verilator generated code
template <>
void VerilatedTrace<VL_DERIVED_T>::addCallbackRecord(std::vector<CallbackRecord>& cbVec,
CallbackRecord& cbRec)
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addThreadPool(VlThreadPool* threadPoolp)
VL_MT_SAFE_EXCLUDES(m_mutex) {
const VerilatedLockGuard lock{m_mutex};
for (VlThreadPool* const poolp : m_threadPoolps) {
if (poolp == threadPoolp) return;
}
m_threadPoolps.push_back(threadPoolp);
}
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addCallbackRecord(std::vector<CallbackRecord>& cbVec,
CallbackRecord& cbRec)
VL_MT_SAFE_EXCLUDES(m_mutex) {
const VerilatedLockGuard lock{m_mutex};
if (VL_UNCOVERABLE(timeLastDump() != 0)) { // LCOV_EXCL_START
@ -550,91 +629,40 @@ void VerilatedTrace<VL_DERIVED_T>::addCallbackRecord(std::vector<CallbackRecord>
cbVec.push_back(cbRec);
}
template <> void VerilatedTrace<VL_DERIVED_T>::addInitCb(initCb_t cb, void* userp) VL_MT_SAFE {
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addInitCb(initCb_t cb, void* userp) VL_MT_SAFE {
CallbackRecord cbr{cb, userp};
addCallbackRecord(m_initCbs, cbr);
}
template <> void VerilatedTrace<VL_DERIVED_T>::addFullCb(dumpCb_t cb, void* userp) VL_MT_SAFE {
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addFullCb(dumpCb_t cb, void* userp,
VlThreadPool* threadPoolp) VL_MT_SAFE {
CallbackRecord cbr{cb, userp};
addCallbackRecord(m_fullCbs, cbr);
addThreadPool(threadPoolp);
addCallbackRecord(m_fullCbs[threadPoolp], cbr);
}
template <> void VerilatedTrace<VL_DERIVED_T>::addChgCb(dumpCb_t cb, void* userp) VL_MT_SAFE {
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addChgCb(dumpCb_t cb, void* userp,
VlThreadPool* threadPoolp) VL_MT_SAFE {
CallbackRecord cbr{cb, userp};
addCallbackRecord(m_chgCbs, cbr);
addThreadPool(threadPoolp);
addCallbackRecord(m_chgCbs[threadPoolp], cbr);
}
template <> void VerilatedTrace<VL_DERIVED_T>::addCleanupCb(dumpCb_t cb, void* userp) VL_MT_SAFE {
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addCleanupCb(cleanupCb_t cb, void* userp) VL_MT_SAFE {
CallbackRecord cbr{cb, userp};
addCallbackRecord(m_cleanupCbs, cbr);
}
template <> void VerilatedTrace<VL_DERIVED_T>::pushNamePrefix(const std::string& prefix) {
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::pushNamePrefix(const std::string& prefix) {
m_namePrefixStack.push_back(m_namePrefixStack.back() + prefix);
}
template <> void VerilatedTrace<VL_DERIVED_T>::popNamePrefix(unsigned count) {
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::popNamePrefix(unsigned count) {
while (count--) m_namePrefixStack.pop_back();
assert(!m_namePrefixStack.empty());
}
//=========================================================================
// Hot path internal interface to Verilator generated code
// These functions must write the new value back into the old value store,
// and subsequently call the format specific emit* implementations. Note
// that this file must be included in the format specific implementation, so
// the emit* functions can be inlined for performance.
template <> void VerilatedTrace<VL_DERIVED_T>::fullBit(uint32_t* oldp, CData newval) {
const uint32_t code = oldp - m_sigs_oldvalp;
*oldp = newval; // Still copy even if not tracing so chg doesn't call full
if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
self()->emitBit(code, newval);
}
template <> void VerilatedTrace<VL_DERIVED_T>::fullCData(uint32_t* oldp, CData newval, int bits) {
const uint32_t code = oldp - m_sigs_oldvalp;
*oldp = newval; // Still copy even if not tracing so chg doesn't call full
if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
self()->emitCData(code, newval, bits);
}
template <> void VerilatedTrace<VL_DERIVED_T>::fullSData(uint32_t* oldp, SData newval, int bits) {
const uint32_t code = oldp - m_sigs_oldvalp;
*oldp = newval; // Still copy even if not tracing so chg doesn't call full
if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
self()->emitSData(code, newval, bits);
}
template <> void VerilatedTrace<VL_DERIVED_T>::fullIData(uint32_t* oldp, IData newval, int bits) {
const uint32_t code = oldp - m_sigs_oldvalp;
*oldp = newval; // Still copy even if not tracing so chg doesn't call full
if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
self()->emitIData(code, newval, bits);
}
template <> void VerilatedTrace<VL_DERIVED_T>::fullQData(uint32_t* oldp, QData newval, int bits) {
const uint32_t code = oldp - m_sigs_oldvalp;
*reinterpret_cast<QData*>(oldp) = newval;
if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
self()->emitQData(code, newval, bits);
}
template <>
void VerilatedTrace<VL_DERIVED_T>::fullWData(uint32_t* oldp, const WData* newvalp, int bits) {
const uint32_t code = oldp - m_sigs_oldvalp;
for (int i = 0; i < VL_WORDS_I(bits); ++i) oldp[i] = newvalp[i];
if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
self()->emitWData(code, newvalp, bits);
}
template <> void VerilatedTrace<VL_DERIVED_T>::fullDouble(uint32_t* oldp, double newval) {
const uint32_t code = oldp - m_sigs_oldvalp;
*reinterpret_cast<double*>(oldp) = newval;
if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
// cppcheck-suppress invalidPointerCast
self()->emitDouble(code, newval);
}
//=========================================================================
// Primitives converting binary values to strings...
@ -725,41 +753,86 @@ static inline void cvtQDataToStr(char* dstp, QData value) {
#define cvtEDataToStr cvtIDataToStr
//=============================================================================
//=========================================================================
// VerilatedTraceBuffer
#ifdef VERILATED_VCD_TEST
void verilated_trace_imp_selftest() {
#define SELF_CHECK(got, exp) \
do { \
if ((got) != (exp)) VL_FATAL_MT(__FILE__, __LINE__, "", "%Error: selftest"); \
} while (0)
#define SELF_CHECK_TS(scale) \
SELF_CHECK(doubleToTimescale(timescaleToDouble(scale)), std::string{scale});
SELF_CHECK_TS("100s");
SELF_CHECK_TS("10s");
SELF_CHECK_TS("1s");
SELF_CHECK_TS("100ms");
SELF_CHECK_TS("10ms");
SELF_CHECK_TS("1ms");
SELF_CHECK_TS("100us");
SELF_CHECK_TS("10us");
SELF_CHECK_TS("1us");
SELF_CHECK_TS("100ns");
SELF_CHECK_TS("10ns");
SELF_CHECK_TS("1ns");
SELF_CHECK_TS("100ps");
SELF_CHECK_TS("10ps");
SELF_CHECK_TS("1ps");
SELF_CHECK_TS("100fs");
SELF_CHECK_TS("10fs");
SELF_CHECK_TS("1fs");
SELF_CHECK_TS("100as");
SELF_CHECK_TS("10as");
SELF_CHECK_TS("1as");
template <> //
VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::VerilatedTraceBuffer(VL_SUB_T& owner)
: m_owner{owner} {
#ifdef VL_TRACE_OFFLOAD
if (m_offloadBufferWritep) {
using This = VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>*;
// Tack on the buffer address
static_assert(2 * sizeof(uint32_t) >= sizeof(This),
"This should be enough on all plafrorms");
*m_offloadBufferWritep++ = VerilatedTraceOffloadCommand::TRACE_BUFFER;
*reinterpret_cast<This*>(m_offloadBufferWritep) = this;
m_offloadBufferWritep += 2;
}
#endif
}
#endif
// These functions must write the new value back into the old value store,
// and subsequently call the format specific emit* implementations. Note
// that this file must be included in the format specific implementation, so
// the emit* functions can be inlined for performance.
template <> //
void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullBit(uint32_t* oldp, CData newval) {
const uint32_t code = oldp - m_sigs_oldvalp;
*oldp = newval; // Still copy even if not tracing so chg doesn't call full
if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
self()->emitBit(code, newval);
}
template <>
void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullCData(uint32_t* oldp, CData newval, int bits) {
const uint32_t code = oldp - m_sigs_oldvalp;
*oldp = newval; // Still copy even if not tracing so chg doesn't call full
if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
self()->emitCData(code, newval, bits);
}
template <>
void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullSData(uint32_t* oldp, SData newval, int bits) {
const uint32_t code = oldp - m_sigs_oldvalp;
*oldp = newval; // Still copy even if not tracing so chg doesn't call full
if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
self()->emitSData(code, newval, bits);
}
template <>
void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullIData(uint32_t* oldp, IData newval, int bits) {
const uint32_t code = oldp - m_sigs_oldvalp;
*oldp = newval; // Still copy even if not tracing so chg doesn't call full
if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
self()->emitIData(code, newval, bits);
}
template <>
void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullQData(uint32_t* oldp, QData newval, int bits) {
const uint32_t code = oldp - m_sigs_oldvalp;
*reinterpret_cast<QData*>(oldp) = newval;
if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
self()->emitQData(code, newval, bits);
}
template <>
void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullWData(uint32_t* oldp, const WData* newvalp,
int bits) {
const uint32_t code = oldp - m_sigs_oldvalp;
for (int i = 0; i < VL_WORDS_I(bits); ++i) oldp[i] = newvalp[i];
if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
self()->emitWData(code, newvalp, bits);
}
template <>
void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullDouble(uint32_t* oldp, double newval) {
const uint32_t code = oldp - m_sigs_oldvalp;
*reinterpret_cast<double*>(oldp) = newval;
if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
// cppcheck-suppress invalidPointerCast
self()->emitDouble(code, newval);
}
#endif // VL_CPPCHECK

View File

@ -62,12 +62,23 @@ constexpr unsigned VL_TRACE_MAX_VCD_CODE_SIZE = 5; // Maximum length of a VCD s
// cache-lines.
constexpr unsigned VL_TRACE_SUFFIX_ENTRY_SIZE = 8; // Size of a suffix entry
//=============================================================================
// Utility functions: TODO: put these in a common place and share them.
template <size_t N> static size_t roundUpToMultipleOf(size_t value) {
static_assert((N & (N - 1)) == 0, "'N' must be a power of 2");
size_t mask = N - 1;
return (value + mask) & ~mask;
}
//=============================================================================
// Specialization of the generics for this trace format
#define VL_DERIVED_T VerilatedVcd
#include "verilated_trace_imp.cpp"
#undef VL_DERIVED_T
#define VL_SUB_T VerilatedVcd
#define VL_BUF_T VerilatedVcdBuffer
#include "verilated_trace_imp.h"
#undef VL_SUB_T
#undef VL_BUF_T
//=============================================================================
//=============================================================================
@ -183,7 +194,7 @@ void VerilatedVcd::makeNameMap() {
deleteNameMap();
m_namemapp = new NameMap;
VerilatedTrace<VerilatedVcd>::traceInit();
Super::traceInit();
// Though not speced, it's illegal to generate a vcd with signals
// not under any module - it crashes at least two viewers.
@ -218,13 +229,17 @@ VerilatedVcd::~VerilatedVcd() {
if (m_wrBufp) VL_DO_CLEAR(delete[] m_wrBufp, m_wrBufp = nullptr);
deleteNameMap();
if (m_filep && m_fileNewed) VL_DO_CLEAR(delete m_filep, m_filep = nullptr);
#ifdef VL_TRACE_PARALLEL
assert(m_numBuffers == m_freeBuffers.size());
for (auto& pair : m_freeBuffers) VL_DO_CLEAR(delete[] pair.first, pair.first = nullptr);
#endif
}
void VerilatedVcd::closePrev() {
// This function is on the flush() call path
if (!isOpen()) return;
VerilatedTrace<VerilatedVcd>::flushBase();
Super::flushBase();
bufferFlush();
m_isOpen = false;
m_filep->close();
@ -251,14 +266,14 @@ void VerilatedVcd::close() VL_MT_SAFE_EXCLUDES(m_mutex) {
printStr(" $end\n");
}
closePrev();
// closePrev() called VerilatedTrace<VerilatedVcd>::flush(), so we just
// closePrev() called Super::flush(), so we just
// need to shut down the tracing thread here.
VerilatedTrace<VerilatedVcd>::closeBase();
Super::closeBase();
}
void VerilatedVcd::flush() VL_MT_SAFE_EXCLUDES(m_mutex) {
const VerilatedLockGuard lock{m_mutex};
VerilatedTrace<VerilatedVcd>::flushBase();
Super::flushBase();
bufferFlush();
}
@ -277,12 +292,12 @@ void VerilatedVcd::printQuad(uint64_t n) {
printStr(buf);
}
void VerilatedVcd::bufferResize(uint64_t minsize) {
void VerilatedVcd::bufferResize(size_t minsize) {
// minsize is size of largest write. We buffer at least 8 times as much data,
// writing when we are 3/4 full (with thus 2*minsize remaining free)
if (VL_UNLIKELY(minsize > m_wrChunkSize)) {
const char* oldbufp = m_wrBufp;
m_wrChunkSize = minsize * 2;
m_wrChunkSize = roundUpToMultipleOf<1024>(minsize * 2);
m_wrBufp = new char[m_wrChunkSize * 8];
std::memcpy(m_wrBufp, oldbufp, m_writep - oldbufp);
m_writep = m_wrBufp + (m_writep - oldbufp);
@ -463,14 +478,16 @@ void VerilatedVcd::declare(uint32_t code, const char* name, const char* wirep, b
int arraynum, bool tri, bool bussed, int msb, int lsb) {
const int bits = ((msb > lsb) ? (msb - lsb) : (lsb - msb)) + 1;
const bool enabled = VerilatedTrace<VerilatedVcd>::declCode(code, name, bits, tri);
const bool enabled = Super::declCode(code, name, bits, tri);
if (m_suffixes.size() <= nextCode() * VL_TRACE_SUFFIX_ENTRY_SIZE) {
m_suffixes.resize(nextCode() * VL_TRACE_SUFFIX_ENTRY_SIZE * 2, 0);
}
// Make sure write buffer is large enough (one character per bit), plus header
bufferResize(bits + 1024);
// Keep upper bound on bytes a single signal cna emit into the buffer
m_maxSignalBytes = std::max<size_t>(m_maxSignalBytes, bits + 32);
// Make sure write buffer is large enough, plus header
bufferResize(m_maxSignalBytes + 1024);
if (!enabled) return;
@ -562,26 +579,73 @@ void VerilatedVcd::declArray(uint32_t code, const char* name, bool array, int ar
void VerilatedVcd::declDouble(uint32_t code, const char* name, bool array, int arraynum) {
declare(code, name, "real", array, arraynum, false, false, 63, 0);
}
#ifdef VL_TRACE_VCD_OLD_API
void VerilatedVcd::declTriBit(uint32_t code, const char* name, bool array, int arraynum) {
declare(code, name, "wire", array, arraynum, true, false, 0, 0);
}
void VerilatedVcd::declTriBus(uint32_t code, const char* name, bool array, int arraynum, int msb,
int lsb) {
declare(code, name, "wire", array, arraynum, true, true, msb, lsb);
}
void VerilatedVcd::declTriQuad(uint32_t code, const char* name, bool array, int arraynum, int msb,
int lsb) {
declare(code, name, "wire", array, arraynum, true, true, msb, lsb);
}
void VerilatedVcd::declTriArray(uint32_t code, const char* name, bool array, int arraynum, int msb,
int lsb) {
declare(code, name, "wire", array, arraynum, true, true, msb, lsb);
}
#endif // VL_TRACE_VCD_OLD_API
//=============================================================================
// Trace rendering prinitives
// Get/commit trace buffer
VerilatedVcdBuffer* VerilatedVcd::getTraceBuffer() {
#ifdef VL_TRACE_PARALLEL
// Note: This is called from VeriltedVcd::dump, which already holds the lock
// If no buffer available, allocate a new one
if (m_freeBuffers.empty()) {
constexpr size_t pageSize = 4096;
// 4 * m_maxSignalBytes, so we can reserve 2 * m_maxSignalBytes at the end for safety
size_t startingSize = roundUpToMultipleOf<pageSize>(4 * m_maxSignalBytes);
m_freeBuffers.emplace_back(new char[startingSize], startingSize);
++m_numBuffers;
}
// Grab a buffer
const auto pair = m_freeBuffers.back();
m_freeBuffers.pop_back();
// Return the buffer
return new VerilatedVcdBuffer{*this, pair.first, pair.second};
#else
return new VerilatedVcdBuffer{*this};
#endif
}
void VerilatedVcd::commitTraceBuffer(VerilatedVcdBuffer* bufp) {
#ifdef VL_TRACE_PARALLEL
// Note: This is called from VeriltedVcd::dump, which already holds the lock
// Resize output buffer. Note, we use the full size of the trace buffer, as
// this is a lot more stable than the actual occupancy of the trace buffer.
// This helps us to avoid re-allocations due to small size changes.
bufferResize(bufp->m_size);
// Compute occupancy of buffer
const size_t usedSize = bufp->m_writep - bufp->m_bufp;
// Copy to output buffer
std::memcpy(m_writep, bufp->m_bufp, usedSize);
// Adjust write pointer
m_writep += usedSize;
// Flush if necessary
bufferCheck();
// Put buffer back on free list
m_freeBuffers.emplace_back(bufp->m_bufp, bufp->m_size);
#else
// Needs adjusting for emitTimeChange
m_writep = bufp->m_writep;
#endif
delete bufp;
}
//=============================================================================
// VerilatedVcdBuffer implementation
#ifdef VL_TRACE_PARALLEL
VerilatedVcdBuffer::VerilatedVcdBuffer(VerilatedVcd& owner, char* bufp, size_t size)
: VerilatedTraceBuffer<VerilatedVcd, VerilatedVcdBuffer>{owner}
, m_writep{bufp}
, m_bufp{bufp}
, m_size{size} {
adjustGrowp();
}
#else
VerilatedVcdBuffer::VerilatedVcdBuffer(VerilatedVcd& owner)
: VerilatedTraceBuffer<VerilatedVcd, VerilatedVcdBuffer>{owner} {}
#endif
//=============================================================================
// Trace rendering primitives
static inline void
VerilatedVcdCCopyAndAppendNewLine(char* writep, const char* suffixp) VL_ATTR_NO_SANITIZE_ALIGN;
@ -606,26 +670,55 @@ static inline void VerilatedVcdCCopyAndAppendNewLine(char* writep, const char* s
#endif
}
void VerilatedVcd::finishLine(uint32_t code, char* writep) {
const char* const suffixp = m_suffixes.data() + code * VL_TRACE_SUFFIX_ENTRY_SIZE;
void VerilatedVcdBuffer::finishLine(uint32_t code, char* writep) {
const char* const suffixp = m_suffixes + code * VL_TRACE_SUFFIX_ENTRY_SIZE;
VL_DEBUG_IFDEF(assert(suffixp[0]););
VerilatedVcdCCopyAndAppendNewLine(writep, suffixp);
// Now write back the write pointer incremented by the actual size of the
// suffix, which was stored in the last byte of the suffix buffer entry.
m_writep = writep + suffixp[VL_TRACE_SUFFIX_ENTRY_SIZE - 1];
bufferCheck();
#ifdef VL_TRACE_PARALLEL
// Double the size of the buffer if necessary
if (VL_UNLIKELY(m_writep >= m_growp)) {
// Compute occupied size of current buffer
const size_t usedSize = m_writep - m_bufp;
// We are always doubling the size
m_size *= 2;
// Allocate the new buffer
char* const newBufp = new char[m_size];
// Copy from current buffer to new buffer
std::memcpy(newBufp, m_bufp, usedSize);
// Delete current buffer
delete[] m_bufp;
// Make new buffer the current buffer
m_bufp = newBufp;
// Adjust write pointer
m_writep = m_bufp + usedSize;
// Adjust resize limit
adjustGrowp();
}
#else
// Flush the write buffer if there's not enough space left for new information
// We only call this once per vector, so we need enough slop for a very wide "b###" line
if (VL_UNLIKELY(m_writep > m_wrFlushp)) {
m_owner.m_writep = m_writep;
m_owner.bufferFlush();
m_writep = m_owner.m_writep;
}
#endif
}
//=============================================================================
// emit* trace routines
// Note: emit* are only ever called from one place (full* in
// verilated_trace_imp.cpp, which is included in this file at the top),
// verilated_trace_imp.h, which is included in this file at the top),
// so always inline them.
VL_ATTR_ALWINLINE
void VerilatedVcd::emitBit(uint32_t code, CData newval) {
void VerilatedVcdBuffer::emitBit(uint32_t code, CData newval) {
// Don't prefetch suffix as it's a bit too late;
char* wp = m_writep;
*wp++ = '0' | static_cast<char>(newval);
@ -633,7 +726,7 @@ void VerilatedVcd::emitBit(uint32_t code, CData newval) {
}
VL_ATTR_ALWINLINE
void VerilatedVcd::emitCData(uint32_t code, CData newval, int bits) {
void VerilatedVcdBuffer::emitCData(uint32_t code, CData newval, int bits) {
char* wp = m_writep;
*wp++ = 'b';
cvtCDataToStr(wp, newval << (VL_BYTESIZE - bits));
@ -641,7 +734,7 @@ void VerilatedVcd::emitCData(uint32_t code, CData newval, int bits) {
}
VL_ATTR_ALWINLINE
void VerilatedVcd::emitSData(uint32_t code, SData newval, int bits) {
void VerilatedVcdBuffer::emitSData(uint32_t code, SData newval, int bits) {
char* wp = m_writep;
*wp++ = 'b';
cvtSDataToStr(wp, newval << (VL_SHORTSIZE - bits));
@ -649,7 +742,7 @@ void VerilatedVcd::emitSData(uint32_t code, SData newval, int bits) {
}
VL_ATTR_ALWINLINE
void VerilatedVcd::emitIData(uint32_t code, IData newval, int bits) {
void VerilatedVcdBuffer::emitIData(uint32_t code, IData newval, int bits) {
char* wp = m_writep;
*wp++ = 'b';
cvtIDataToStr(wp, newval << (VL_IDATASIZE - bits));
@ -657,7 +750,7 @@ void VerilatedVcd::emitIData(uint32_t code, IData newval, int bits) {
}
VL_ATTR_ALWINLINE
void VerilatedVcd::emitQData(uint32_t code, QData newval, int bits) {
void VerilatedVcdBuffer::emitQData(uint32_t code, QData newval, int bits) {
char* wp = m_writep;
*wp++ = 'b';
cvtQDataToStr(wp, newval << (VL_QUADSIZE - bits));
@ -665,7 +758,7 @@ void VerilatedVcd::emitQData(uint32_t code, QData newval, int bits) {
}
VL_ATTR_ALWINLINE
void VerilatedVcd::emitWData(uint32_t code, const WData* newvalp, int bits) {
void VerilatedVcdBuffer::emitWData(uint32_t code, const WData* newvalp, int bits) {
int words = VL_WORDS_I(bits);
char* wp = m_writep;
*wp++ = 'b';
@ -682,272 +775,10 @@ void VerilatedVcd::emitWData(uint32_t code, const WData* newvalp, int bits) {
}
VL_ATTR_ALWINLINE
void VerilatedVcd::emitDouble(uint32_t code, double newval) {
void VerilatedVcdBuffer::emitDouble(uint32_t code, double newval) {
char* wp = m_writep;
// Buffer can't overflow before VL_SNPRINTF; we sized during declaration
VL_SNPRINTF(wp, m_wrChunkSize, "r%.16g", newval);
VL_SNPRINTF(wp, m_maxSignalBytes, "r%.16g", newval);
wp += std::strlen(wp);
finishLine(code, wp);
}
#ifdef VL_TRACE_VCD_OLD_API
void VerilatedVcd::fullBit(uint32_t code, const uint32_t newval) {
// Note the &1, so we don't require clean input -- makes more common no change case faster
*oldp(code) = newval;
*m_writep++ = ('0' + static_cast<char>(newval & 1));
m_writep = writeCode(m_writep, code);
*m_writep++ = '\n';
bufferCheck();
}
void VerilatedVcd::fullBus(uint32_t code, const uint32_t newval, int bits) {
*oldp(code) = newval;
*m_writep++ = 'b';
for (int bit = bits - 1; bit >= 0; --bit) {
*m_writep++ = ((newval & (1L << bit)) ? '1' : '0');
}
*m_writep++ = ' ';
m_writep = writeCode(m_writep, code);
*m_writep++ = '\n';
bufferCheck();
}
void VerilatedVcd::fullQuad(uint32_t code, const uint64_t newval, int bits) {
(*(reinterpret_cast<uint64_t*>(oldp(code)))) = newval;
*m_writep++ = 'b';
for (int bit = bits - 1; bit >= 0; --bit) {
*m_writep++ = ((newval & (1ULL << bit)) ? '1' : '0');
}
*m_writep++ = ' ';
m_writep = writeCode(m_writep, code);
*m_writep++ = '\n';
bufferCheck();
}
void VerilatedVcd::fullArray(uint32_t code, const uint32_t* newval, int bits) {
for (int word = 0; word < (((bits - 1) / 32) + 1); ++word) { oldp(code)[word] = newval[word]; }
*m_writep++ = 'b';
for (int bit = bits - 1; bit >= 0; --bit) {
*m_writep++ = ((newval[(bit / 32)] & (1L << (bit & 0x1f))) ? '1' : '0');
}
*m_writep++ = ' ';
m_writep = writeCode(m_writep, code);
*m_writep++ = '\n';
bufferCheck();
}
void VerilatedVcd::fullArray(uint32_t code, const uint64_t* newval, int bits) {
for (int word = 0; word < (((bits - 1) / 64) + 1); ++word) { oldp(code)[word] = newval[word]; }
*m_writep++ = 'b';
for (int bit = bits - 1; bit >= 0; --bit) {
*m_writep++ = ((newval[(bit / 64)] & (1ULL << (bit & 0x3f))) ? '1' : '0');
}
*m_writep++ = ' ';
m_writep = writeCode(m_writep, code);
*m_writep++ = '\n';
bufferCheck();
}
void VerilatedVcd::fullTriBit(uint32_t code, const uint32_t newval, const uint32_t newtri) {
oldp(code)[0] = newval;
oldp(code)[1] = newtri;
*m_writep++ = "01zz"[newval | (newtri << 1)];
m_writep = writeCode(m_writep, code);
*m_writep++ = '\n';
bufferCheck();
}
void VerilatedVcd::fullTriBus(uint32_t code, const uint32_t newval, const uint32_t newtri,
int bits) {
oldp(code)[0] = newval;
oldp(code)[1] = newtri;
*m_writep++ = 'b';
for (int bit = bits - 1; bit >= 0; --bit) {
*m_writep++ = "01zz"[((newval >> bit) & 1) | (((newtri >> bit) & 1) << 1)];
}
*m_writep++ = ' ';
m_writep = writeCode(m_writep, code);
*m_writep++ = '\n';
bufferCheck();
}
void VerilatedVcd::fullTriQuad(uint32_t code, const uint64_t newval, const uint64_t newtri,
int bits) {
(*(reinterpret_cast<uint64_t*>(oldp(code)))) = newval;
(*(reinterpret_cast<uint64_t*>(oldp(code + 1)))) = newtri;
*m_writep++ = 'b';
for (int bit = bits - 1; bit >= 0; --bit) {
*m_writep++ = "01zz"[((newval >> bit) & 1ULL) | (((newtri >> bit) & 1ULL) << 1ULL)];
}
*m_writep++ = ' ';
m_writep = writeCode(m_writep, code);
*m_writep++ = '\n';
bufferCheck();
}
void VerilatedVcd::fullTriArray(uint32_t code, const uint32_t* newvalp, const uint32_t* newtrip,
int bits) {
for (int word = 0; word < (((bits - 1) / 32) + 1); ++word) {
oldp(code)[word * 2] = newvalp[word];
oldp(code)[word * 2 + 1] = newtrip[word];
}
*m_writep++ = 'b';
for (int bit = bits - 1; bit >= 0; --bit) {
uint32_t valbit = (newvalp[(bit / 32)] >> (bit & 0x1f)) & 1;
uint32_t tribit = (newtrip[(bit / 32)] >> (bit & 0x1f)) & 1;
*m_writep++ = "01zz"[valbit | (tribit << 1)];
}
*m_writep++ = ' ';
m_writep = writeCode(m_writep, code);
*m_writep++ = '\n';
bufferCheck();
}
void VerilatedVcd::fullDouble(uint32_t code, const double newval) {
// cppcheck-suppress invalidPointerCast
(*(reinterpret_cast<double*>(oldp(code)))) = newval;
// Buffer can't overflow before VL_SNPRINTF; we sized during declaration
VL_SNPRINTF(m_writep, m_wrChunkSize, "r%.16g", newval);
m_writep += std::strlen(m_writep);
*m_writep++ = ' ';
m_writep = writeCode(m_writep, code);
*m_writep++ = '\n';
bufferCheck();
}
#endif // VL_TRACE_VCD_OLD_API
//======================================================================
//======================================================================
//======================================================================
#ifdef VERILATED_VCD_TEST
#include <iostream>
extern void verilated_trace_imp_selftest();
uint32_t v1, v2, s1, s2[3];
uint32_t tri96[3];
uint32_t tri96__tri[3];
uint64_t quad96[2];
uint64_t tquad;
uint64_t tquad__tri;
uint8_t ch;
uint64_t timestamp = 1;
double doub = 0.0;
float flo = 0.0f;
void vcdInit(void*, VerilatedVcd* vcdp, uint32_t) {
vcdp->scopeEscape('.');
vcdp->pushNamePrefix("top.");
/**/ vcdp->declBus(0x2, "v1", -1, 0, 5, 1);
/**/ vcdp->declBus(0x3, "v2", -1, 0, 6, 1);
/**/ vcdp->pushNamePrefix("sub1.");
/***/ vcdp->declBit(0x4, "s1", -1, 0);
/***/ vcdp->declBit(0x5, "ch", -1, 0);
/**/ vcdp->popNamePrefix();
/**/ vcdp->pushNamePrefix("sub2.");
/***/ vcdp->declArray(0x6, "s2", -1, 0, 40, 3);
/**/ vcdp->popNamePrefix();
vcdp->popNamePrefix();
// Note need to add 3 for next code.
vcdp->pushNamePrefix("top2.");
/**/ vcdp->declBus(0x2, "t2v1", -1, 0, 4, 1);
/**/ vcdp->declTriBit(0x10, "io1", -1, 0);
/**/ vcdp->declTriBus(0x12, "io5", -1, 0, 4, 0);
/**/ vcdp->declTriArray(0x16, "io96", -1, 0, 95, 0);
/**/ // Note need to add 6 for next code.
/**/ vcdp->declDouble(0x1c, "doub", -1, 0);
/**/ // Note need to add 2 for next code.
/**/ vcdp->declArray(0x20, "q2", -1, 0, 95, 0);
/**/ // Note need to add 4 for next code.
/**/ vcdp->declTriQuad(0x24, "tq", -1, 0, 63, 0);
/**/ // Note need to add 4 for next code.
vcdp->popNamePrefix();
}
void vcdFull(void*, VerilatedVcd* vcdp) {
vcdp->fullBus(0x2, v1, 5);
vcdp->fullBus(0x3, v2, 7);
vcdp->fullBit(0x4, s1);
vcdp->fullBus(0x5, ch, 2);
vcdp->fullArray(0x6, &s2[0], 38);
vcdp->fullTriBit(0x10, tri96[0] & 1, tri96__tri[0] & 1);
vcdp->fullTriBus(0x12, tri96[0] & 0x1f, tri96__tri[0] & 0x1f, 5);
vcdp->fullTriArray(0x16, tri96, tri96__tri, 96);
vcdp->fullDouble(0x1c, doub);
vcdp->fullArray(0x20, &quad96[0], 96);
vcdp->fullTriQuad(0x24, tquad, tquad__tri, 64);
}
void vcdChange(void*, VerilatedVcd* vcdp) {
vcdp->chgBus(0x2, v1, 5);
vcdp->chgBus(0x3, v2, 7);
vcdp->chgBit(0x4, s1);
vcdp->chgBus(0x5, ch, 2);
vcdp->chgArray(0x6, &s2[0], 38);
vcdp->chgTriBit(0x10, tri96[0] & 1, tri96__tri[0] & 1);
vcdp->chgTriBus(0x12, tri96[0] & 0x1f, tri96__tri[0] & 0x1f, 5);
vcdp->chgTriArray(0x16, tri96, tri96__tri, 96);
vcdp->chgDouble(0x1c, doub);
vcdp->chgArray(0x20, &quad96[0], 96);
vcdp->chgTriQuad(0x24, tquad, tquad__tri, 64);
}
// clang-format off
void vcdTestMain(const char* filenamep) {
verilated_trace_imp_selftest();
v1 = v2 = s1 = 0;
s2[0] = s2[1] = s2[2] = 0;
tri96[2] = tri96[1] = tri96[0] = 0;
tri96__tri[2] = tri96__tri[1] = tri96__tri[0] = ~0;
quad96[1] = quad96[0] = 0;
ch = 0;
doub = 0;
tquad = tquad__tri = 0;
{
VerilatedVcdC* vcdp = new VerilatedVcdC;
vcdp->evcd(true);
vcdp->set_time_unit("1ms");
vcdp->set_time_unit(std::string{"1ms"});
vcdp->set_time_resolution("1ns");
vcdp->set_time_resolution(std::string{"1ns"});
vcdp->spTrace()->addInitCb(&vcdInit, 0);
vcdp->spTrace()->addFullCb(&vcdFull, 0);
vcdp->spTrace()->addChgCb(&vcdChange, 0);
vcdp->open(filenamep);
// Dumping
vcdp->dump(++timestamp);
v1 = 0xfff;
tri96[2] = 4; tri96[1] = 2; tri96[0] = 1;
tri96__tri[2] = tri96__tri[1] = tri96__tri[0] = ~0; // Still tri
quad96[1] = 0xffffffff; quad96[0] = 0;
doub = 1.5;
flo = 1.4f;
vcdp->dump(++timestamp);
v2 = 0x1;
s2[1] = 2;
tri96__tri[2] = tri96__tri[1] = tri96__tri[0] = 0; // enable w/o data change
quad96[1] = 0; quad96[0] = ~0;
doub = -1.66e13;
flo = 0.123f;
tquad = 0x00ff00ff00ff00ffULL;
tquad__tri = 0x0000fffff0000ffffULL;
vcdp->dump(++timestamp);
ch = 2;
tri96[2] = ~4; tri96[1] = ~2; tri96[0] = ~1;
doub = -3.33e-13;
vcdp->dump(++timestamp);
vcdp->dump(++timestamp);
# ifdef VERILATED_VCD_TEST_64BIT
const uint64_t bytesPerDump = 15ULL;
for (uint64_t i = 0; i < ((1ULL << 32) / bytesPerDump); i++) {
v1 = i;
vcdp->dump(++timestamp);
}
# endif
vcdp->close();
VL_DO_CLEAR(delete vcdp, vcdp = nullptr);
}
}
#endif
// clang-format on
//********************************************************************
// ;compile-command: "v4make test_regress/t/t_trace_c_api.pl"
//
// Local Variables:
// End:

View File

@ -28,39 +28,20 @@
#include <string>
#include <vector>
class VerilatedVcd;
//=============================================================================
// VerilatedFile
/// Class representing a file to write to. These virtual methods can be
/// overrode for e.g. socket I/O.
class VerilatedVcdFile VL_NOT_FINAL {
private:
int m_fd = 0; // File descriptor we're writing to
public:
// METHODS
/// Construct a (as yet) closed file
VerilatedVcdFile() = default;
/// Close and destruct
virtual ~VerilatedVcdFile() = default;
/// Open a file with given filename
virtual bool open(const std::string& name) VL_MT_UNSAFE;
/// Close object's file
virtual void close() VL_MT_UNSAFE;
/// Write data to file (if it is open)
virtual ssize_t write(const char* bufp, ssize_t len) VL_MT_UNSAFE;
};
class VerilatedVcdBuffer;
class VerilatedVcdFile;
//=============================================================================
// VerilatedVcd
// Base class to create a Verilator VCD dump
// This is an internally used class - see VerilatedVcdC for what to call from applications
class VerilatedVcd VL_NOT_FINAL : public VerilatedTrace<VerilatedVcd> {
class VerilatedVcd VL_NOT_FINAL : public VerilatedTrace<VerilatedVcd, VerilatedVcdBuffer> {
public:
using Super = VerilatedTrace<VerilatedVcd, VerilatedVcdBuffer>;
private:
// Give the superclass access to private bits (to avoid virtual functions)
friend class VerilatedTrace<VerilatedVcd>;
friend Buffer; // Give the buffer access to the private bits
//=========================================================================
// VCD specific internals
@ -74,9 +55,10 @@ private:
int m_modDepth = 0; // Depth of module hierarchy
char* m_wrBufp; // Output buffer
const char* m_wrFlushp; // Output buffer flush trigger location
char* m_wrFlushp; // Output buffer flush trigger location
char* m_writep; // Write pointer into output buffer
uint64_t m_wrChunkSize; // Output buffer size
size_t m_wrChunkSize; // Output buffer size
size_t m_maxSignalBytes = 0; // Upper bound on number of bytes a single signal can generate
uint64_t m_wroteBytes = 0; // Number of bytes written to this file
std::vector<char> m_suffixes; // VCD line end string codes + metadata
@ -84,7 +66,13 @@ private:
using NameMap = std::map<const std::string, const std::string>;
NameMap* m_namemapp = nullptr; // List of names for the header
void bufferResize(uint64_t minsize);
#ifdef VL_TRACE_PARALLEL
// Vector of free trace buffers as (pointer, size) pairs.
std::vector<std::pair<char*, size_t>> m_freeBuffers;
size_t m_numBuffers = 0; // Number of trace buffers allocated
#endif
void bufferResize(size_t minsize);
void bufferFlush() VL_MT_UNSAFE_ONE;
inline void bufferCheck() {
// Flush the write buffer if there's not enough space left for new information
@ -107,8 +95,6 @@ private:
static char* writeCode(char* writep, uint32_t code);
void finishLine(uint32_t code, char* writep);
// CONSTRUCTORS
VL_UNCOPYABLE(VerilatedVcd);
@ -116,27 +102,22 @@ protected:
//=========================================================================
// Implementation of VerilatedTrace interface
// Implementations of protected virtual methods for VerilatedTrace
// Called when the trace moves forward to a new time point
virtual void emitTimeChange(uint64_t timeui) override;
// Hooks called from VerilatedTrace
virtual bool preFullDump() override { return isOpen(); }
virtual bool preChangeDump() override;
// Implementations of duck-typed methods for VerilatedTrace. These are
// called from only one place (namely full*) so always inline them.
inline void emitBit(uint32_t code, CData newval);
inline void emitCData(uint32_t code, CData newval, int bits);
inline void emitSData(uint32_t code, SData newval, int bits);
inline void emitIData(uint32_t code, IData newval, int bits);
inline void emitQData(uint32_t code, QData newval, int bits);
inline void emitWData(uint32_t code, const WData* newvalp, int bits);
inline void emitDouble(uint32_t code, double newval);
// Trace buffer management
virtual VerilatedVcdBuffer* getTraceBuffer() override;
virtual void commitTraceBuffer(VerilatedVcdBuffer*) override;
public:
//=========================================================================
// External interface to client code
// CONSTRUCTOR
explicit VerilatedVcd(VerilatedVcdFile* filep = nullptr);
~VerilatedVcd();
@ -144,7 +125,7 @@ public:
// Set size in megabytes after which new file should be created
void rolloverMB(uint64_t rolloverMB) { m_rolloverMB = rolloverMB; }
// METHODS
// METHODS - All must be thread safe
// Open the file; call isOpen() to see if errors
void open(const char* filename) VL_MT_SAFE_EXCLUDES(m_mutex);
// Open next data-only file
@ -164,168 +145,95 @@ public:
void declQuad(uint32_t code, const char* name, bool array, int arraynum, int msb, int lsb);
void declArray(uint32_t code, const char* name, bool array, int arraynum, int msb, int lsb);
void declDouble(uint32_t code, const char* name, bool array, int arraynum);
#ifdef VL_TRACE_VCD_OLD_API
//=========================================================================
// Note: These are only for testing for backward compatibility with foreign
// code and is not used by Verilator. Do not use these as there is no
// guarantee of functionality.
void declTriBit(uint32_t code, const char* name, bool array, int arraynum);
void declTriBus(uint32_t code, const char* name, bool array, int arraynum, int msb, int lsb);
void declTriQuad(uint32_t code, const char* name, bool array, int arraynum, int msb, int lsb);
void declTriArray(uint32_t code, const char* name, bool array, int arraynum, int msb, int lsb);
void fullBit(uint32_t* oldp, CData newval) { fullBit(oldp - this->oldp(0), newval); }
void fullCData(uint32_t* oldp, CData newval, int bits) {
fullBus(oldp - this->oldp(0), newval, bits);
}
void fullSData(uint32_t* oldp, SData newval, int bits) {
fullBus(oldp - this->oldp(0), newval, bits);
}
void fullIData(uint32_t* oldp, IData newval, int bits) {
fullBus(oldp - this->oldp(0), newval, bits);
}
void fullQData(uint32_t* oldp, QData newval, int bits) {
fullQuad(oldp - this->oldp(0), newval, bits);
}
void fullWData(uint32_t* oldp, const WData* newvalp, int bits) {
fullArray(oldp - this->oldp(0), newvalp, bits);
}
void fullDouble(uint32_t* oldp, double newval) { fullDouble(oldp - this->oldp(0), newval); }
inline void chgBit(uint32_t* oldp, CData newval) { chgBit(oldp - this->oldp(0), newval); }
inline void chgCData(uint32_t* oldp, CData newval, int bits) {
chgBus(oldp - this->oldp(0), newval, bits);
}
inline void chgSData(uint32_t* oldp, SData newval, int bits) {
chgBus(oldp - this->oldp(0), newval, bits);
}
inline void chgIData(uint32_t* oldp, IData newval, int bits) {
chgBus(oldp - this->oldp(0), newval, bits);
}
inline void chgQData(uint32_t* oldp, QData newval, int bits) {
chgQuad(oldp - this->oldp(0), newval, bits);
}
inline void chgWData(uint32_t* oldp, const WData* newvalp, int bits) {
chgArray(oldp - this->oldp(0), newvalp, bits);
}
inline void chgDouble(uint32_t* oldp, double newval) {
chgDouble(oldp - this->oldp(0), newval);
}
// Inside dumping routines, dump one signal, faster when not inlined
// due to code size reduction.
void fullBit(uint32_t code, const uint32_t newval);
void fullBus(uint32_t code, const uint32_t newval, int bits);
void fullQuad(uint32_t code, const uint64_t newval, int bits);
void fullArray(uint32_t code, const uint32_t* newvalp, int bits);
void fullArray(uint32_t code, const uint64_t* newvalp, int bits);
void fullTriBit(uint32_t code, const uint32_t newval, const uint32_t newtri);
void fullTriBus(uint32_t code, const uint32_t newval, const uint32_t newtri, int bits);
void fullTriQuad(uint32_t code, const uint64_t newval, const uint64_t newtri, int bits);
void fullTriArray(uint32_t code, const uint32_t* newvalp, const uint32_t* newtrip, int bits);
void fullDouble(uint32_t code, const double newval);
// Inside dumping routines, dump one signal if it has changed.
// We do want to inline these to avoid calls when the value did not change.
// Dump a single bit signal, but only if its value changed since the last dump.
// Inlined so the common no-change case costs only a load and compare.
inline void chgBit(uint32_t code, const uint32_t newval) {
    if (VL_UNLIKELY(oldp(code)[0] != newval)) fullBit(code, newval);
}
// Dump a bus (<= 32 bit) signal if any bit within 'bits' changed.
// Inlined so the common no-change case stays cheap.
inline void chgBus(uint32_t code, const uint32_t newval, int bits) {
    // XOR of old and new: any set bit marks a changed bit position
    const uint32_t changed = oldp(code)[0] ^ newval;
    // Only dump when a bit inside the signal width actually differs
    if (VL_UNLIKELY(changed)
        && VL_UNLIKELY(bits == 32 || (changed & ((1U << bits) - 1)))) {
        fullBus(code, newval, bits);
    }
}
// Dump a quad (<= 64 bit) signal if any bit within 'bits' changed.
inline void chgQuad(uint32_t code, const uint64_t newval, int bits) {
    // Old value is stored as two 32-bit entries, read here as one 64-bit word
    const uint64_t oldval = *(reinterpret_cast<uint64_t*>(oldp(code)));
    const uint64_t changed = oldval ^ newval;
    if (VL_UNLIKELY(changed)
        && VL_UNLIKELY(bits == 64 || (changed & ((1ULL << bits) - 1)))) {
        fullQuad(code, newval, bits);
    }
}
// Dump a wide signal given as 32-bit words, if any word changed.
inline void chgArray(uint32_t code, const uint32_t* newvalp, int bits) {
    // Number of 32-bit words covering 'bits' (same rounding as original formula)
    const int words = ((bits - 1) / 32) + 1;
    for (int i = 0; i < words; ++i) {
        if (VL_UNLIKELY(oldp(code)[i] != newvalp[i])) {
            fullArray(code, newvalp, bits);
            return;  // fullArray refreshes all words; no need to keep scanning
        }
    }
}
// Dump a wide signal given as 64-bit words, if any word changed.
// The old-value store is addressed in 32-bit entries, so the code index
// advances by 2 per 64-bit word (see oldp(code + 2 * word) below).
inline void chgArray(uint32_t code, const uint64_t* newvalp, int bits) {
    for (int word = 0; word < (((bits - 1) / 64) + 1); ++word) {
        if (VL_UNLIKELY(*(reinterpret_cast<uint64_t*>(oldp(code + 2 * word)))
                        ^ newvalp[word])) {
            // Any differing word triggers a full dump of the whole array
            fullArray(code, newvalp, bits);
            return;
        }
    }
}
// Dump a tristate bit if it changed. The old state holds the value word at
// oldp(code)[0] and the tristate word at oldp(code)[1].
inline void chgTriBit(uint32_t code, const uint32_t newval, const uint32_t newtri) {
    const uint32_t diff = ((oldp(code)[0] ^ newval) | (oldp(code)[1] ^ newtri));
    if (VL_UNLIKELY(diff)) {
        // Verilator 3.510 and newer provide clean input, so the below
        // is only for back compatibility
        if (VL_UNLIKELY(diff & 1)) { // Change after clean?
            fullTriBit(code, newval, newtri);
        }
    }
}
// Dump a tristate bus (<= 32 bit) if any bit within 'bits' of either the
// value word (oldp(code)[0]) or tristate word (oldp(code)[1]) changed.
inline void chgTriBus(uint32_t code, const uint32_t newval, const uint32_t newtri, int bits) {
    const uint32_t diff = ((oldp(code)[0] ^ newval) | (oldp(code)[1] ^ newtri));
    if (VL_UNLIKELY(diff)) {
        // Mask to the signal width so changes in padding bits are ignored
        if (VL_UNLIKELY(bits == 32 || (diff & ((1U << bits) - 1)))) {
            fullTriBus(code, newval, newtri, bits);
        }
    }
}
// Dump a tristate quad (<= 64 bit) if any bit within 'bits' changed.
inline void chgTriQuad(uint32_t code, const uint64_t newval, const uint64_t newtri, int bits) {
    // NOTE(review): the two 64-bit reads below (at oldp(code) and oldp(code + 1))
    // overlap by one 32-bit entry; presumably this matches fullTriQuad's
    // storage layout for legacy tristate quads - confirm against fullTriQuad
    const uint64_t diff = (((*(reinterpret_cast<uint64_t*>(oldp(code)))) ^ newval)
                           | ((*(reinterpret_cast<uint64_t*>(oldp(code + 1)))) ^ newtri));
    if (VL_UNLIKELY(diff)) {
        // Mask to the signal width so changes in padding bits are ignored
        if (VL_UNLIKELY(bits == 64 || (diff & ((1ULL << bits) - 1)))) {
            fullTriQuad(code, newval, newtri, bits);
        }
    }
}
// Dump a wide tristate signal if any word changed. Old state interleaves
// value and tristate words: value at oldp(code)[word * 2], tristate at
// oldp(code)[word * 2 + 1].
inline void chgTriArray(uint32_t code, const uint32_t* newvalp, const uint32_t* newtrip,
                        int bits) {
    for (int word = 0; word < (((bits - 1) / 32) + 1); ++word) {
        if (VL_UNLIKELY((oldp(code)[word * 2] ^ newvalp[word])
                        | (oldp(code)[word * 2 + 1] ^ newtrip[word]))) {
            // Any differing word triggers a full dump of the whole array
            fullTriArray(code, newvalp, newtrip, bits);
            return;
        }
    }
}
// Dump a real (double) signal if its value changed since the last dump.
inline void chgDouble(uint32_t code, const double newval) {
    // Old value entries are 32-bit; reinterpret the pair as a double
    // cppcheck-suppress invalidPointerCast
    const double oldval = *(reinterpret_cast<double*>(oldp(code)));
    if (VL_UNLIKELY(oldval != newval)) fullDouble(code, newval);
}
// METHODS
// Old/standalone API only
void evcd(bool flag) { m_evcd = flag; }
#endif // VL_TRACE_VCD_OLD_API
};
#ifndef DOXYGEN
// Declare specializations here they are used in VerilatedVcdC just below
template <> void VerilatedTrace<VerilatedVcd>::dump(uint64_t timeui);
template <> void VerilatedTrace<VerilatedVcd>::set_time_unit(const char* unitp);
template <> void VerilatedTrace<VerilatedVcd>::set_time_unit(const std::string& unit);
template <> void VerilatedTrace<VerilatedVcd>::set_time_resolution(const char* unitp);
template <> void VerilatedTrace<VerilatedVcd>::set_time_resolution(const std::string& unit);
template <> void VerilatedTrace<VerilatedVcd>::dumpvars(int level, const std::string& hier);
// Declare specialization here as it's used in VerilatedFstC just below
template <> void VerilatedVcd::Super::dump(uint64_t time);
template <> void VerilatedVcd::Super::set_time_unit(const char* unitp);
template <> void VerilatedVcd::Super::set_time_unit(const std::string& unit);
template <> void VerilatedVcd::Super::set_time_resolution(const char* unitp);
template <> void VerilatedVcd::Super::set_time_resolution(const std::string& unit);
template <> void VerilatedVcd::Super::dumpvars(int level, const std::string& hier);
#endif // DOXYGEN
//=============================================================================
// VerilatedVcdBuffer
// Trace buffer for a VerilatedVcd trace file. Under VL_TRACE_PARALLEL each
// buffer records into its own memory region which is later committed to the
// file; otherwise it writes straight into the owner's output buffer.
class VerilatedVcdBuffer final : public VerilatedTraceBuffer<VerilatedVcd, VerilatedVcdBuffer> {
    // Give the trace file access to the private bits
    friend VerilatedVcd;
    friend VerilatedVcd::Super;

#ifdef VL_TRACE_PARALLEL
    // Parallel mode: standalone buffer owned by this instance
    char* m_writep;  // Write pointer into m_bufp
    char* m_bufp;  // The beginning of the trace buffer
    size_t m_size;  // The size of the buffer at m_bufp
    char* m_growp;  // Resize limit pointer
#else
    // Serial mode: aliases of the owner trace file's output buffer state
    char* m_writep = m_owner.m_writep;  // Write pointer into output buffer
    char* const m_wrFlushp = m_owner.m_wrFlushp;  // Output buffer flush trigger location
#endif
    // VCD line end string codes + metadata
    const char* const m_suffixes = m_owner.m_suffixes.data();
    // The maximum number of bytes a single signal can emit
    const size_t m_maxSignalBytes = m_owner.m_maxSignalBytes;

    // Finish the VCD line for 'code' at 'writep' (defined out of line)
    void finishLine(uint32_t code, char* writep);

#ifdef VL_TRACE_PARALLEL
    // Recompute the resize limit so at least two maximum-size signal
    // emissions fit between m_growp and the end of the buffer
    void adjustGrowp() {
        m_growp = (m_bufp + m_size) - (2 * m_maxSignalBytes);
        assert(m_growp >= m_bufp + m_maxSignalBytes);
    }
#endif

public:
    // CONSTRUCTOR
#ifdef VL_TRACE_PARALLEL
    explicit VerilatedVcdBuffer(VerilatedVcd& owner, char* bufp, size_t size);
#else
    explicit VerilatedVcdBuffer(VerilatedVcd& owner);
#endif
    ~VerilatedVcdBuffer() = default;

    //=========================================================================
    // Implementation of VerilatedTraceBuffer interface
    // Implementations of duck-typed methods for VerilatedTraceBuffer. These are
    // called from only one place (the full* methods), so always inline them.
    VL_ATTR_ALWINLINE inline void emitBit(uint32_t code, CData newval);
    VL_ATTR_ALWINLINE inline void emitCData(uint32_t code, CData newval, int bits);
    VL_ATTR_ALWINLINE inline void emitSData(uint32_t code, SData newval, int bits);
    VL_ATTR_ALWINLINE inline void emitIData(uint32_t code, IData newval, int bits);
    VL_ATTR_ALWINLINE inline void emitQData(uint32_t code, QData newval, int bits);
    VL_ATTR_ALWINLINE inline void emitWData(uint32_t code, const WData* newvalp, int bits);
    VL_ATTR_ALWINLINE inline void emitDouble(uint32_t code, double newval);
};
//=============================================================================
// VerilatedFile
/// Class representing a file to write to. These virtual methods can be
/// overridden for e.g. socket I/O.
class VerilatedVcdFile VL_NOT_FINAL {
private:
    int m_fd = 0;  // File descriptor we're writing to
public:
    // METHODS
    /// Construct a (as yet) closed file
    VerilatedVcdFile() = default;
    /// Close and destruct
    virtual ~VerilatedVcdFile() = default;
    /// Open a file with given filename; returns true on success
    virtual bool open(const std::string& name) VL_MT_UNSAFE;
    /// Close object's file
    virtual void close() VL_MT_UNSAFE;
    /// Write data to file (if it is open)
    /// NOTE(review): presumably returns bytes written like POSIX write() - confirm
    virtual ssize_t write(const char* bufp, ssize_t len) VL_MT_UNSAFE;
};
//=============================================================================
// VerilatedVcdC
/// Class representing a VCD dump file in C standalone (no SystemC)
@ -396,16 +304,6 @@ public:
// Internal class access
inline VerilatedVcd* spTrace() { return &m_sptrace; }
#ifdef VL_TRACE_VCD_OLD_API
//=========================================================================
// Note: These are only for testing for backward compatibility with foreign
// code and is not used by Verilator. Do not use these as there is no
// guarantee of functionality.
// Use evcd format
void evcd(bool flag) VL_MT_UNSAFE_ONE { m_sptrace.evcd(flag); }
#endif
};
#endif // guard

View File

@ -40,6 +40,7 @@
#ifdef __GNUC__
# define VL_ATTR_ALIGNED(alignment) __attribute__((aligned(alignment)))
# define VL_ATTR_ALWINLINE __attribute__((always_inline))
# define VL_ATTR_NOINLINE __attribute__((noinline))
# define VL_ATTR_COLD __attribute__((cold))
# define VL_ATTR_HOT __attribute__((hot))
# define VL_ATTR_NORETURN __attribute__((noreturn))
@ -82,6 +83,9 @@
#ifndef VL_ATTR_ALWINLINE
# define VL_ATTR_ALWINLINE ///< Attribute to inline, even when not optimizing
#endif
#ifndef VL_ATTR_NOINLINE
# define VL_ATTR_NOINLINE ///< Attribute to never inline, even when optimizing
#endif
#ifndef VL_ATTR_COLD
# define VL_ATTR_COLD ///< Attribute that function is rarely executed
#endif

View File

@ -8533,6 +8533,7 @@ public:
AstNodeDType* childDTypep() const { return VN_AS(op1p(), NodeDType); }
void childDTypep(AstNodeDType* nodep) { setOp1p(nodep); }
AstNode* itemsp() const { return op2p(); } // op2 = AstPatReplicate, AstPatMember, etc
void addItemsp(AstNode* nodep) { addOp2p(nodep); }
};
class AstPatMember final : public AstNodeMath {
// Verilog '{a} or '{a{b}}

View File

@ -106,7 +106,7 @@ public:
}
// Get a reference to the user data
T_Data& operator()(const T_Node* nodep) {
T_Data& operator()(const T_Node* nodep) const {
T_Data* const userp = getUserp(nodep);
UASSERT_OBJ(userp, nodep, "Missing User data on const AstNode");
return *userp;

View File

@ -496,7 +496,7 @@ private:
V3Case::caseLint(nodep);
iterateChildren(nodep);
if (debug() >= 9) nodep->dumpTree(cout, " case_old: ");
if (isCaseTreeFast(nodep) && v3Global.opt.oCase()) {
if (isCaseTreeFast(nodep) && v3Global.opt.fCase()) {
// It's a simple priority encoder or complete statement
// we can make a tree of statements to avoid extra comparisons
++m_statCaseFast;

View File

@ -111,6 +111,15 @@ class ConstBitOpTreeVisitor final : public VNVisitor {
BitPolarityEntry() = default;
};
struct FrozenNodeInfo final { // Context when a frozen node is found
bool m_polarity;
int m_lsb;
bool operator<(const FrozenNodeInfo& other) const {
if (m_lsb != other.m_lsb) return m_lsb < other.m_lsb;
return m_polarity < other.m_polarity;
}
};
class Restorer final { // Restore the original state unless disableRestore() is called
ConstBitOpTreeVisitor& m_visitor;
const size_t m_polaritiesSize;
@ -299,7 +308,8 @@ class ConstBitOpTreeVisitor final : public VNVisitor {
LeafInfo* m_leafp = nullptr; // AstConst or AstVarRef that currently looking for
const AstNode* const m_rootp; // Root of this AST subtree
std::vector<AstNode*> m_frozenNodes; // Nodes that cannot be optimized
std::vector<std::pair<AstNode*, FrozenNodeInfo>>
m_frozenNodes; // Nodes that cannot be optimized
std::vector<BitPolarityEntry> m_bitPolarities; // Polarity of bits found during iterate()
std::vector<std::unique_ptr<VarInfo>> m_varInfos; // VarInfo for each variable, [0] is nullptr
@ -487,7 +497,7 @@ class ConstBitOpTreeVisitor final : public VNVisitor {
restorer.restoreNow();
// Reach past a cast then add to frozen nodes to be added to final reduction
if (const AstCCast* const castp = VN_CAST(opp, CCast)) opp = castp->lhsp();
m_frozenNodes.push_back(opp);
m_frozenNodes.emplace_back(opp, FrozenNodeInfo{m_polarity, m_lsb});
m_failed = origFailed;
continue;
}
@ -652,17 +662,21 @@ public:
}
}
std::map<FrozenNodeInfo, std::vector<AstNode*>> frozenNodes; // Group by FrozenNodeInfo
// Check if frozen terms are clean or not
for (AstNode* const termp : visitor.m_frozenNodes) {
for (const auto& frozenInfo : visitor.m_frozenNodes) {
AstNode* const termp = frozenInfo.first;
// Comparison operators are clean
if (VN_IS(termp, Eq) || VN_IS(termp, Neq) || VN_IS(termp, Lt) || VN_IS(termp, Lte)
|| VN_IS(termp, Gt) || VN_IS(termp, Gte)) {
if ((VN_IS(termp, Eq) || VN_IS(termp, Neq) || VN_IS(termp, Lt) || VN_IS(termp, Lte)
|| VN_IS(termp, Gt) || VN_IS(termp, Gte))
&& frozenInfo.second.m_lsb == 0) {
hasCleanTerm = true;
} else {
// Otherwise, conservatively assume the frozen term is dirty
hasDirtyTerm = true;
UINFO(9, "Dirty frozen term: " << termp << endl);
}
frozenNodes[frozenInfo.second].push_back(termp);
}
// Figure out if a final negation is required
@ -672,7 +686,12 @@ public:
const bool needsCleaning = visitor.isAndTree() ? !hasCleanTerm : hasDirtyTerm;
// Add size of reduction tree to op count
resultOps += termps.size() + visitor.m_frozenNodes.size() - 1;
resultOps += termps.size() - 1;
for (const auto& lsbAndNodes : frozenNodes) {
if (lsbAndNodes.first.m_lsb > 0) ++resultOps; // Needs AstShiftR
if (!lsbAndNodes.first.m_polarity) ++resultOps; // Needs AstNot
resultOps += lsbAndNodes.second.size();
}
// Add final polarity flip in Xor tree
if (needsFlip) ++resultOps;
// Add final cleaning AND
@ -681,7 +700,10 @@ public:
if (debug() >= 9) { // LCOV_EXCL_START
cout << "Bitop tree considered: " << endl;
for (AstNode* const termp : termps) termp->dumpTree("Reduced term: ");
for (AstNode* const termp : visitor.m_frozenNodes) termp->dumpTree("Frozen term: ");
for (const std::pair<AstNode*, FrozenNodeInfo>& termp : visitor.m_frozenNodes)
termp.first->dumpTree("Frozen term with lsb " + std::to_string(termp.second.m_lsb)
+ " polarity " + std::to_string(termp.second.m_polarity)
+ ": ");
cout << "Needs flipping: " << needsFlip << endl;
cout << "Needs cleaning: " << needsCleaning << endl;
cout << "Size: " << resultOps << " input size: " << visitor.m_ops << endl;
@ -724,8 +746,25 @@ public:
resultp = reduce(resultp, termp);
}
// Add any frozen terms to the reduction
for (AstNode* const frozenp : visitor.m_frozenNodes) {
resultp = reduce(resultp, frozenp->unlinkFrBack());
for (auto&& nodes : frozenNodes) {
// nodes.second has same lsb and polarity
AstNode* termp = nullptr;
for (AstNode* const itemp : nodes.second) {
termp = reduce(termp, itemp->unlinkFrBack());
}
if (nodes.first.m_lsb > 0) { // LSB is not 0, so shiftR
AstNodeDType* const dtypep = termp->dtypep();
termp = new AstShiftR{termp->fileline(), termp,
new AstConst(termp->fileline(), AstConst::WidthedValue{},
termp->width(), nodes.first.m_lsb)};
termp->dtypep(dtypep);
}
if (!nodes.first.m_polarity) { // Polarity is inverted, so append Not
AstNodeDType* const dtypep = termp->dtypep();
termp = new AstNot{termp->fileline(), termp};
termp->dtypep(dtypep);
}
resultp = reduce(resultp, termp);
}
// Set width of masks to expected result width. This is required to prevent later removal
@ -1051,7 +1090,7 @@ private:
bool matchBitOpTree(AstNode* nodep) {
if (nodep->widthMin() != 1) return false;
if (!v3Global.opt.oConstBitOpTree()) return false;
if (!v3Global.opt.fConstBitOpTree()) return false;
string debugPrefix;
if (debug() >= 9) { // LCOV_EXCL_START
@ -1373,7 +1412,7 @@ private:
return (VN_IS(nodep, And) || VN_IS(nodep, Or) || VN_IS(nodep, Xor));
}
bool ifAdjacentSel(const AstSel* lhsp, const AstSel* rhsp) {
if (!v3Global.opt.oAssemble()) return false; // opt disabled
if (!v3Global.opt.fAssemble()) return false; // opt disabled
if (!lhsp || !rhsp) return false;
const AstNode* const lfromp = lhsp->fromp();
const AstNode* const rfromp = rhsp->fromp();
@ -1388,7 +1427,7 @@ private:
}
bool ifMergeAdjacent(AstNode* lhsp, AstNode* rhsp) {
// called by concatmergeable to determine if {lhsp, rhsp} make sense
if (!v3Global.opt.oAssemble()) return false; // opt disabled
if (!v3Global.opt.fAssemble()) return false; // opt disabled
// two same varref
if (operandsSame(lhsp, rhsp)) return true;
const AstSel* lselp = VN_CAST(lhsp, Sel);
@ -1425,7 +1464,7 @@ private:
}
bool concatMergeable(const AstNode* lhsp, const AstNode* rhsp, unsigned depth) {
// determine if {a OP b, c OP d} => {a, c} OP {b, d} is advantageous
if (!v3Global.opt.oAssemble()) return false; // opt disabled
if (!v3Global.opt.fAssemble()) return false; // opt disabled
if (lhsp->type() != rhsp->type()) return false;
if (!ifConcatMergeableBiop(lhsp)) return false;
if (depth > CONCAT_MERGABLE_MAX_DEPTH) return false; // As worse case O(n^2) algorithm
@ -2511,7 +2550,7 @@ private:
if (nodep->access().isReadOnly()
&& ((!m_params // Can reduce constant wires into equations
&& m_doNConst
&& v3Global.opt.oConst()
&& v3Global.opt.fConst()
// Default value, not a "known" constant for this usage
&& !nodep->varp()->isClassMember()
&& !(nodep->varp()->isFuncLocal() && nodep->varp()->isNonOutput())

View File

@ -752,26 +752,26 @@ class EmitCTrace final : EmitCFunc {
const string func = nodep->full() ? "full" : "chg";
bool emitWidth = true;
if (nodep->dtypep()->basicp()->isDouble()) {
puts("tracep->" + func + "Double");
puts("bufp->" + func + "Double");
emitWidth = false;
} else if (nodep->isWide() || emitTraceIsScBv(nodep) || emitTraceIsScBigUint(nodep)) {
puts("tracep->" + func + "WData");
puts("bufp->" + func + "WData");
} else if (nodep->isQuad()) {
puts("tracep->" + func + "QData");
puts("bufp->" + func + "QData");
} else if (nodep->declp()->widthMin() > 16) {
puts("tracep->" + func + "IData");
puts("bufp->" + func + "IData");
} else if (nodep->declp()->widthMin() > 8) {
puts("tracep->" + func + "SData");
puts("bufp->" + func + "SData");
} else if (nodep->declp()->widthMin() > 1) {
puts("tracep->" + func + "CData");
puts("bufp->" + func + "CData");
} else {
puts("tracep->" + func + "Bit");
puts("bufp->" + func + "Bit");
emitWidth = false;
}
const uint32_t offset = (arrayindex < 0) ? 0 : (arrayindex * nodep->declp()->widthWords());
const uint32_t code = nodep->declp()->code() + offset;
puts(v3Global.opt.useTraceOffloadThread() && !nodep->full() ? "(base+" : "(oldp+");
puts(v3Global.opt.useTraceOffload() && !nodep->full() ? "(base+" : "(oldp+");
puts(cvtToStr(code - nodep->baseCode()));
puts(",");
emitTraceValue(nodep, arrayindex);

View File

@ -113,9 +113,8 @@ class CMakeEmitter final {
cmake_set_raw(*of, name + "_COVERAGE", v3Global.opt.coverage() ? "1" : "0");
*of << "# Threaded output mode? 0/1/N threads (from --threads)\n";
cmake_set_raw(*of, name + "_THREADS", cvtToStr(v3Global.opt.threads()));
*of << "# Threaded tracing output mode? 0/1/N threads (from --trace-threads)\n";
cmake_set_raw(*of, name + "_TRACE_THREADS",
cvtToStr(v3Global.opt.useTraceOffloadThread()));
*of << "# Threaded tracing output mode? 0/1/N threads (from --threads/--trace-threads)\n";
cmake_set_raw(*of, name + "_TRACE_THREADS", cvtToStr(v3Global.opt.vmTraceThreads()));
cmake_set_raw(*of, name + "_TRACE_FST_WRITER_THREAD",
v3Global.opt.traceThreads() && v3Global.opt.traceFormat().fst() ? "1" : "0");
*of << "# Struct output mode? 0/1 (from --trace-structs)\n";

View File

@ -73,9 +73,10 @@ public:
of.puts("VM_TRACE_FST = ");
of.puts(v3Global.opt.trace() && v3Global.opt.traceFormat().fst() ? "1" : "0");
of.puts("\n");
of.puts("# Tracing threaded output mode? 0/1/N threads (from --trace-thread)\n");
of.puts(
"# Tracing threaded output mode? 0/1/N threads (from --threads/--trace-thread)\n");
of.puts("VM_TRACE_THREADS = ");
of.puts(cvtToStr(v3Global.opt.useTraceOffloadThread()));
of.puts(cvtToStr(v3Global.opt.vmTraceThreads()));
of.puts("\n");
of.puts("# Separate FST writer thread? 0/1 (from --trace-fst with --trace-thread > 0)\n");
of.puts("VM_TRACE_FST_WRITER_THREAD = ");

View File

@ -397,11 +397,11 @@ private:
// Then propagate more complicated equations
optimizeSignals(true);
// Remove redundant logic
if (v3Global.opt.oDedupe()) {
if (v3Global.opt.fDedupe()) {
dedupe();
if (debug() >= 6) m_graph.dumpDotFilePrefixed("gate_dedup");
}
if (v3Global.opt.oAssemble()) {
if (v3Global.opt.fAssemble()) {
mergeAssigns();
if (debug() >= 6) m_graph.dumpDotFilePrefixed("gate_assm");
}

View File

@ -254,7 +254,7 @@ void GraphAcyc::simplify(bool allowCut) {
if (allowCut) {
// The main algorithm works without these, though slower
// So if changing the main algorithm, comment these out for a test run
if (v3Global.opt.oAcycSimp()) {
if (v3Global.opt.fAcycSimp()) {
cutBasic(vertexp);
cutBackward(vertexp);
}

View File

@ -42,6 +42,34 @@
//
// Also merges consecutive AstNodeIf statements with the same condition.
//
// Because this optimization has notable performance impact, we go further
// and perform code motion to try to move mergeable conditionals next to each
// other, which in turn enable us to merge more conditionals. To do this, we
// perform an analysis pass, followed by an optimization pass on the whole
// AstCFunc we are optimizing.
//
// The analysis pass gathers, for each statement in the tree, the information
// relevant for determining whether two statements can be swapped, and some
// other additional information that is useful during optimization.
//
// The optimization pass tries to move conditionals near each other, first by
// trying to move a conditional node backwards in the list, so it becomes the
// direct successor of another earlier conditional with the same condition.
// If this is not possible due to variable interference, then we additionally
// try to pull earlier conditionals with the same condition closer forward to
// be the immediate predecessor of the conditional node. We limit maximum
// distance a node can travel to an empirically chosen but otherwise arbitrary
// constant. This limits worst case complexity to be O(n) rather than O(n^2).
// The worst case complexity manifests when N/2 conditionals, all with unique
// conditions are succeeded by N/2 conditionals with the same unique
// conditions, such that each unique condition is used by exactly 2
// conditionals. In this case all N/2 such nodes need to travel approx N/2 distance.
// Limiting the distance bounds the latter, hence limiting complexity.
//
// Once the analysis and optimization passes have been applied to the whole
// function, any merged conditionals will then undergo the same analysis,
// optimization, and merging again in their individual branches.
//
//*************************************************************************
#include "config_build.h"
@ -51,71 +79,364 @@
#include "V3MergeCond.h"
#include "V3Stats.h"
#include "V3Ast.h"
#include "V3AstUserAllocator.h"
#include "V3Hasher.h"
#include "V3DupFinder.h"
#include <queue>
#include <set>
namespace {
//######################################################################
// Utilities
enum class Mergeable {
YES, // Tree can be merged
NO_COND_ASSIGN, // Tree cannot be merged because it contains an assignment to a condition
NO_IMPURE // Tree cannot be merged because it contains an impure node
// This function extracts the Cond node from the RHS of an assignment,
// if there is one and it is in a supported position, which are:
// - RHS is the Cond
// - RHS is And(Const, Cond). This And is inserted often by V3Clean.
// Extract the Cond node from the RHS of an assignment, if there is one and
// it is in a supported position, which are:
// - RHS is the Cond itself
// - RHS is And(Const, Cond) - this And is inserted often by V3Clean
AstNodeCond* extractCondFromRhs(AstNode* rhsp) {
    // Direct conditional on the RHS
    if (AstNodeCond* const directp = VN_CAST(rhsp, NodeCond)) return directp;
    // Conditional masked by a constant And
    if (const AstAnd* const andp = VN_CAST(rhsp, And)) {
        if (VN_IS(andp->lhsp(), Const)) return VN_CAST(andp->rhsp(), NodeCond);
    }
    return nullptr;
}
// Predicate to check if two sets are disjoint. This is stable, as we only need
// to determine if the sets contain a shared element, which is a boolean
// property. It is also efficient as we use sorted sets, and therefore can
// enumerate elements in order (what the ordering is, is unimportant), meaning
// the worst case complexity is O(size of smaller set).
// Predicate to check if two sets are disjoint, i.e.: whether they share any
// element. The result is a pure function of the two sets. We enumerate the
// smaller set and probe the larger one, so the work done is proportional to
// the smaller set (times a logarithmic lookup factor).
bool areDisjoint(const std::set<const AstVar*>& a, const std::set<const AstVar*>& b) {
    if (a.empty() || b.empty()) return true;
    const bool aIsSmaller = a.size() < b.size();
    const std::set<const AstVar*>& probe = aIsSmaller ? a : b;
    const std::set<const AstVar*>& table = aIsSmaller ? b : a;
    for (const AstVar* const varp : probe) {
        if (table.count(varp)) return false;  // Shared element found
    }
    return true;
}
//######################################################################
// Structure containing information required for code motion/merging
// Per-statement analysis results used to decide whether statements can be
// swapped or merged during code motion.
struct StmtProperties {
    AstNode* m_condp = nullptr;  // The condition expression, if a conditional node
    std::set<const AstVar*> m_rdVars;  // Variables read by this statement
    std::set<const AstVar*> m_wrVars;  // Variables written by this statement
    bool m_isFence = false;  // Nothing should move across this statement, nor should it be merged
    AstNodeStmt* m_prevWithSameCondp = nullptr;  // Previous node in same list, with same condition
    // True if this statement writes any variable marked (user1) as appearing
    // in a condition expression.
    bool writesConditionVar() const {
        // This relies on MarkVarsVisitor having been called on the condition node
        for (const AstVar* const varp : m_wrVars) {
            if (varp->user1()) return true;
        }
        return false;
    }
};
class CheckMergeableVisitor final : public VNVisitor {
private:
// STATE
bool m_condAssign = false; // Does this tree contain an assignment to a condition variable?
bool m_impure = false; // Does this tree contain an impure node?
// We store the statement properties in user3 via AstUser3Allocator
using StmtPropertiesAllocator = AstUser3Allocator<AstNodeStmt, StmtProperties>;
// METHODS
VL_DEBUG_FUNC; // Declare debug()
//######################################################################
// Code motion analysis and implementation
// VISITORS
virtual void visit(AstNode* nodep) override {
if (m_impure) return;
// Clear if node is impure
if (!nodep->isPure()) {
UINFO(9, "Not mergeable due to impure node" << nodep << endl);
m_impure = true;
return;
// Pure analysis visitor that build the StmtProperties for each statement in the given
// AstNode list (following AstNode::nextp())
class CodeMotionAnalysisVisitor final : public VNVisitor {
// NODE STATE
// AstNodeStmt::user3 -> StmtProperties (accessed via m_stmtProperties, managed externally,
// see MergeCondVisitor::process)
// AstNode::user4 -> Used by V3Hasher
// AstNode::user5 -> AstNode*: Set on a condition node, points to the last conditional
// with that condition so far encountered in the same AstNode list
VNUser5InUse m_user5InUse;
StmtPropertiesAllocator& m_stmtProperties;
// MEMBERS
V3Hasher m_hasher; // Used by V3DupFinder
// Stack of a V3DupFinder used for finding identical condition expressions within one
// statement list.
std::vector<V3DupFinder> m_stack;
StmtProperties* m_propsp = nullptr; // StmtProperties structure of current AstNodeStmt
// Extract condition expression from a mergeable conditional statement, if any
// Extract the condition expression from a mergeable conditional statement,
// if any: the 'condp' of a conditional assignment RHS, or of an AstNodeIf.
static AstNode* extractCondition(const AstNodeStmt* nodep) {
    AstNode* resultp = nullptr;
    if (const AstNodeIf* const ifp = VN_CAST(nodep, NodeIf)) {
        resultp = ifp->condp();
    } else if (const AstNodeAssign* const assignp = VN_CAST(nodep, NodeAssign)) {
        if (AstNodeCond* const condNodep = extractCondFromRhs(assignp->rhsp())) {
            resultp = condNodep->condp();
        }
    }
    // Strip any casts wrapping the condition expression
    while (AstCCast* const castp = VN_CAST(resultp, CCast)) resultp = castp->lhsp();
    return resultp;
}
// Analyze one statement: record its condition (if conditional), link it to
// an earlier statement in the same list with an identical condition (via
// V3DupFinder), then analyze its subtree and propagate read/write/fence
// properties to the enclosing statement, if any.
void analyzeStmt(AstNodeStmt* nodep, bool tryCondMatch) {
    VL_RESTORER(m_propsp);
    // Keep hold of props of enclosing statement
    StmtProperties* const outerPropsp = m_propsp;
    // Grab the props of this statement
    m_propsp = &m_stmtProperties(nodep);
    // Extract condition from statement
    if (AstNode* const condp = extractCondition(nodep)) {
        // Remember condition node. We always need this as it is used in the later
        // traversal.
        m_propsp->m_condp = condp;
        // If this is a conditional statement, try to find an earlier one with the same
        // condition in the same list (unless we have been told not to bother because we know
        // this node is in a singleton list).
        if (tryCondMatch) {
            // Grab the duplicate finder of this list
            V3DupFinder& dupFinder = m_stack.back();
            // Find a duplicate condition
            const V3DupFinder::iterator& dit = dupFinder.findDuplicate(condp);
            if (dit == dupFinder.end()) {
                // First time seeing this condition in the current list
                dupFinder.insert(condp);
                // Remember last statement with this condition (which is this statement)
                // (stored in user5 of the condition node itself)
                condp->user5p(nodep);
            } else {
                // Seen a conditional with the same condition earlier in the current list
                AstNode* const firstp = dit->second;
                // Add to properties for easy retrieval during optimization
                m_propsp->m_prevWithSameCondp = static_cast<AstNodeStmt*>(firstp->user5p());
                // Remember last statement with this condition (which is this statement)
                firstp->user5p(nodep);
            }
        }
    }
    // Analyse this statement
    analyzeNode(nodep);
    // If there is an enclosing statement, propagate properties upwards
    if (outerPropsp) {
        // Add all rd/wr vars to outer statement
        outerPropsp->m_rdVars.insert(m_propsp->m_rdVars.cbegin(), m_propsp->m_rdVars.cend());
        outerPropsp->m_wrVars.insert(m_propsp->m_wrVars.cbegin(), m_propsp->m_wrVars.cend());
        // If this statement is a fence, the enclosing statement is also a fence
        if (m_propsp->m_isFence) outerPropsp->m_isFence = true;
    }
}
// Record the referenced variable in the read and/or write sets of the
// current statement, based on the reference's access direction.
void analyzeVarRef(AstVarRef* nodep) {
    AstVar* const varp = nodep->varp();
    const VAccess access = nodep->access();
    if (access.isWriteOrRW()) m_propsp->m_wrVars.insert(varp);
    if (access.isReadOrRW()) m_propsp->m_rdVars.insert(varp);
}
// Generic analysis step: an impure node anywhere under a statement marks
// that statement as a fence, then recurse into children.
void analyzeNode(AstNode* nodep) {
    if (!nodep->isPure() && m_propsp) m_propsp->m_isFence = true;
    iterateChildrenConst(nodep);
}
virtual void visit(AstVarRef* nodep) override {
if (m_impure || m_condAssign) return;
// Clear if it's an LValue referencing a marked variable
if (nodep->access().isWriteOrRW() && nodep->varp()->user1()) {
UINFO(9, "Not mergeable due assignment to condition" << nodep << endl);
m_condAssign = true;
// VISITORS
// Generic visit: maintain one V3DupFinder per statement list (for condition
// matching), and dispatch nodes to the appropriate analysis routine.
void visit(AstNode* nodep) override {
    // Push a new stack entry at the start of a list, but only if the list is not a
    // single element (this saves a lot of allocations in expressions)
    bool singletonListStart = false;
    if (nodep->backp()->nextp() != nodep) { // If at head of list
        singletonListStart = nodep->nextp() == nullptr;
        if (!singletonListStart) m_stack.emplace_back(m_hasher);
    }

    // Analyse node
    if (AstNodeStmt* const stmtp = VN_CAST(nodep, NodeStmt)) {
        // Condition matching is pointless in a singleton list - skip it there
        analyzeStmt(stmtp, /*tryCondMatch:*/ !singletonListStart);
    } else if (AstVarRef* const vrefp = VN_CAST(nodep, VarRef)) {
        analyzeVarRef(vrefp);
    } else {
        analyzeNode(nodep);
    }

    // Pop the stack at the end of a list
    if (!singletonListStart && !nodep->nextp()) m_stack.pop_back();
}
// CONSTRUCTOR
CodeMotionAnalysisVisitor(AstNode* nodep, StmtPropertiesAllocator& stmtProperties)
: m_stmtProperties(stmtProperties) {
iterateAndNextConstNull(nodep);
}
public:
CheckMergeableVisitor() = default;
// Return false if this node should not be merged at all because:
// - It contains an impure expression
// - It contains an LValue referencing the condition
Mergeable operator()(const AstNode* node) {
m_condAssign = false;
m_impure = false;
iterateChildrenConst(const_cast<AstNode*>(node));
if (m_impure) { // Impure is stronger than cond assign
return Mergeable::NO_IMPURE;
} else if (m_condAssign) {
return Mergeable::NO_COND_ASSIGN;
} else {
return Mergeable::YES;
}
// Analyse the statement list starting at nodep, filling in stmtProperties.
static void analyze(AstNode* nodep, StmtPropertiesAllocator& stmtProperties) {
CodeMotionAnalysisVisitor{nodep, stmtProperties};
}
};
class CodeMotionOptimizeVisitor final : public VNVisitor {
    // Do not move a node more than this many statements.
    // This bounds complexity at O(N), rather than O(N^2).
    static constexpr unsigned MAX_DISTANCE = 500;

    // NODE STATE
    // AstNodeStmt::user3 -> StmtProperties (accessed via m_stmtProperties, managed externally,
    //                       see MergeCondVisitor::process)
    // AstNodeStmt::user4 -> bool: Already processed this node
    VNUser4InUse m_user4InUse;

    const StmtPropertiesAllocator& m_stmtProperties;  // Properties from prior analysis pass

    // MEMBERS
    // Predicate that checks if the order of two adjacent statements can be swapped
    // without changing program semantics
    bool areSwappable(const AstNodeStmt* ap, const AstNodeStmt* bp) const {
        const StmtProperties& aProps = m_stmtProperties(ap);
        const StmtProperties& bProps = m_stmtProperties(bp);
        // Don't move across fences (statements containing impure nodes)
        if (aProps.m_isFence) return false;
        if (bProps.m_isFence) return false;
        // If either statement writes a variable that the other reads, they are not swappable
        if (!areDisjoint(aProps.m_rdVars, bProps.m_wrVars)) return false;
        if (!areDisjoint(bProps.m_rdVars, aProps.m_wrVars)) return false;
        // If they both write to the same variable, they are not swappable
        if (!areDisjoint(aProps.m_wrVars, bProps.m_wrVars)) return false;
        // Otherwise good to go
        return true;
    }

    // VISITORS
    void visit(AstNodeStmt* nodep) override {
        // Process only on first encounter
        if (nodep->user4SetOnce()) return;
        // First re-order children
        iterateChildren(nodep);
        // Grab hold of previous node with same condition
        AstNodeStmt* prevp = m_stmtProperties(nodep).m_prevWithSameCondp;
        // If no previous node with same condition, we are done
        if (!prevp) return;
#ifdef VL_DEBUG
        {  // Sanity check, only in debug build, otherwise expensive
            const AstNode* currp = prevp;
            while (currp && currp != nodep) currp = currp->nextp();
            // Report 'prevp': 'currp' is always nullptr when this assertion fires
            UASSERT_OBJ(currp, nodep, "Predecessor not in same list as " << prevp);
        }
#endif
        // Otherwise try to move this node backwards, as close as we can to the previous node
        // with the same condition
        if (AstNodeStmt* predp = VN_CAST(nodep->backp(), NodeStmt)) {
            // 'predp' is the newly computed predecessor node of 'nodep', which is initially
            // (without movement) the 'backp' of the node.
            for (unsigned i = MAX_DISTANCE; i; --i) {
                // If the predecessor is the previous node with the same condition, job done
                if (predp == prevp) break;
                // Don't move past a non-statement (e.g.: AstVar), or end of list
                AstNodeStmt* const backp = VN_CAST(predp->backp(), NodeStmt);
                if (!backp) break;
                // Don't swap statements if doing so would change program semantics
                if (!areSwappable(predp, nodep)) break;
                // Otherwise move 'nodep' back
                predp = backp;
            }
            // If we decided that 'nodep' should be moved back
            if (nodep->backp() != predp) {
                // Move the current node to directly follow the computed predecessor
                nodep->unlinkFrBack();
                predp->addNextHere(nodep);
                // If the predecessor is the previous node with the same condition, job done
                if (predp == prevp) return;
            }
        }
        // If we reach here, it means we were unable to move the current node all the way back
        // such that it immediately follows the previous statement with the same condition. Now
        // try to move all previous statements with the same condition forward, in the hope of
        // compacting the list further.
        for (AstNodeStmt* currp = nodep; prevp;
             currp = prevp, prevp = m_stmtProperties(currp).m_prevWithSameCondp) {
            // Move prevp (previous statement with same condition) towards currp
            if (AstNodeStmt* succp = VN_CAST(prevp->nextp(), NodeStmt)) {
                // 'succp' is the newly computed successor node of 'prevp', which is initially
                // (without movement) the 'nextp' of the node.
                // Same iteration bound as the backwards pass above, for consistency.
                for (unsigned i = MAX_DISTANCE; i; --i) {
                    // If the successor of the previous statement with same condition is the
                    // target node, we are done with this predecessor
                    if (succp == currp) break;
                    // Don't move past a non-statement (e.g.: AstVar), or end of list
                    AstNodeStmt* const nextp = VN_CAST(succp->nextp(), NodeStmt);
                    if (!nextp) break;
                    // Don't swap statements if doing so would change program semantics
                    if (!areSwappable(prevp, succp)) break;
                    // Otherwise move further forward
                    succp = nextp;
                }
                // If we decided that 'prevp' should be moved forward
                if (prevp->nextp() != succp) {
                    // Move the current node to directly before the computed successor
                    prevp->unlinkFrBack();
                    succp->addHereThisAsNext(prevp);
                }
            }
        }
    }
    void visit(AstNode* nodep) override {}  // Ignore all non-statements

    // CONSTRUCTOR
    CodeMotionOptimizeVisitor(AstNode* nodep, const StmtPropertiesAllocator& stmtProperties)
        : m_stmtProperties(stmtProperties) {
        // We assert the given node is at the head of the list otherwise we might move a node
        // before the given node. This is easy to fix in the above iteration with a check on a
        // boundary node we should not move past, if we ever need to do so.
        // Note: we will do iterateAndNextNull which requires nodep->backp() != nullptr anyway
        UASSERT_OBJ(nodep->backp()->nextp() != nodep, nodep, "Must be at head of list");
        // Optimize the list
        iterateAndNextNull(nodep);
    }

public:
    // Given an AstNode list (held via AstNode::nextp()), move conditional statements as close
    // together as possible
    static AstNode* optimize(AstNode* nodep, const StmtPropertiesAllocator& stmtProperties) {
        CodeMotionOptimizeVisitor{nodep, stmtProperties};
        // It is possible for the head of the list to be moved later such that it is no longer
        // in head position. If so, rewind the list and return the new head.
        while (nodep->backp()->nextp() == nodep) nodep = nodep->backp();
        return nodep;
    }
};
//######################################################################
// Conditional merging
class MergeCondVisitor final : public VNVisitor {
private:
// NODE STATE
// AstVar::user1 -> Flag set for variables referenced by m_mgCondp
// AstNode::user2 -> Flag marking node as included in merge because cheap to duplicate
const VNUser1InUse m_user1InUse;
const VNUser2InUse m_user2InUse;
// AstVar::user1 -> bool: Set for variables referenced by m_mgCondp
// (Only below MergeCondVisitor::process).
// AstNode::user2 -> bool: Marking node as included in merge because cheap to
// duplicate
// (Only below MergeCondVisitor::process).
// AstNodeStmt::user3 -> StmtProperties
// (Only below MergeCondVisitor::process).
// AstNode::user4 -> See CodeMotionAnalysisVisitor/CodeMotionOptimizeVisitor
// AstNode::user5 -> See CodeMotionAnalysisVisitor
// STATE
VDouble0 m_statMerges; // Statistic tracking
@ -128,24 +449,84 @@ private:
const AstNode* m_mgNextp = nullptr; // Next node in list being examined
uint32_t m_listLenght = 0; // Length of current list
CheckMergeableVisitor m_checkMergeable; // Sub visitor for encapsulation & speed
std::queue<AstNode*>* m_workQueuep = nullptr; // Node lists (via AstNode::nextp()) to merge
// Statement properties for code motion and merging
StmtPropertiesAllocator* m_stmtPropertiesp = nullptr;
// METHODS
VL_DEBUG_FUNC; // Declare debug()
// This function extracts the Cond node from the RHS, if there is one and
// it is in a supported position, which are:
// - RHS is the Cond
// - RHS is And(Const, Cond). This And is inserted often by V3Clean.
static AstNodeCond* extractCond(AstNode* rhsp) {
if (AstNodeCond* const condp = VN_CAST(rhsp, NodeCond)) {
return condp;
} else if (const AstAnd* const andp = VN_CAST(rhsp, And)) {
if (AstNodeCond* const condp = VN_CAST(andp->rhsp(), NodeCond)) {
if (VN_IS(andp->lhsp(), Const)) return condp;
}
// Function that processes a whole sub-tree
void process(AstNode* nodep) {
// Set up work queue
std::queue<AstNode*> workQueue;
m_workQueuep = &workQueue;
m_workQueuep->push(nodep);
do {
// Set up user* for this iteration
const VNUser1InUse user1InUse;
const VNUser2InUse user2InUse;
const VNUser3InUse user3InUse;
// Statement properties only preserved for this iteration,
// then memory is released immediately.
StmtPropertiesAllocator stmtProperties;
m_stmtPropertiesp = &stmtProperties;
// Pop off current work item
AstNode* currp = m_workQueuep->front();
m_workQueuep->pop();
// Analyse sub-tree list for code motion
CodeMotionAnalysisVisitor::analyze(currp, stmtProperties);
// Perform the code motion within the whole sub-tree list
currp = CodeMotionOptimizeVisitor::optimize(currp, stmtProperties);
// Merge conditionals in the whole sub-tree list (this might create new work items)
iterateAndNextNull(currp);
// Close pending merge, if there is one at the end of the whole sub-tree list
if (m_mgFirstp) mergeEnd();
} while (!m_workQueuep->empty());
}
// Skip past AstArraySel and AstWordSel nodes with constant indices, returning the
// expression they select from. Stops early (returning the current node) at the first
// select with a non-constant index, as such a select might be expensive to duplicate.
static AstNode* skipConstSels(AstNode* nodep) {
    while (const AstArraySel* const aselp = VN_CAST(nodep, ArraySel)) {
        // ArraySel index is not constant, so might be expensive
        if (!VN_IS(aselp->bitp(), Const)) return nodep;
        nodep = aselp->fromp();
    }
    while (const AstWordSel* const wselp = VN_CAST(nodep, WordSel)) {
        // WordSel index is not constant, so might be expensive
        if (!VN_IS(wselp->bitp(), Const)) return nodep;
        nodep = wselp->fromp();
    }
    return nodep;
}
// Check if this node is cheap enough that duplicating it in two branches of an
// AstIf is not likely to cause a performance degradation.
static bool isCheapNode(AstNode* nodep) {
    // Comments cost nothing to duplicate
    if (VN_IS(nodep, Comment)) return true;
    // Of other statements, only simple assignments are considered cheap:
    // both sides must reduce (through constant-index selects) to trivial terms.
    const AstNodeAssign* const assignp = VN_CAST(nodep, NodeAssign);
    if (!assignp) return false;
    // LHS must reduce to a plain variable reference
    if (!VN_IS(skipConstSels(assignp->lhsp()), VarRef)) return false;
    // RHS must reduce to a variable reference or a constant
    AstNode* const rhsp = skipConstSels(assignp->rhsp());
    return VN_IS(rhsp, VarRef) || VN_IS(rhsp, Const);
}
// Predicate to check if an expression yields only 0 or 1 (i.e.: a 1-bit value)
@ -196,23 +577,21 @@ private:
static AstNode* maskLsb(AstNode* nodep) {
if (yieldsOneOrZero(nodep)) return nodep;
// Otherwise apply masking
AstNode* const maskp = new AstConst(nodep->fileline(), AstConst::BitTrue());
AstNode* const maskp = new AstConst{nodep->fileline(), AstConst::BitTrue()};
// Mask on left, as conventional
return new AstAnd(nodep->fileline(), maskp, nodep);
return new AstAnd{nodep->fileline(), maskp, nodep};
}
// Fold the RHS expression assuming the given condition state. Unlink bits
// from the RHS which is only used once, and can be reused. What remains
// of the RHS is expected to be deleted by the caller.
// Fold the RHS expression of an assignment assuming the given condition state.
// Unlink bits from the RHS which is only used once, and can be reused (is an unmodified
// sub-tree). What remains of the RHS is expected to be deleted by the caller.
AstNode* foldAndUnlink(AstNode* rhsp, bool condTrue) {
if (rhsp->sameTree(m_mgCondp)) {
return new AstConst(rhsp->fileline(), AstConst::BitTrue{}, condTrue);
} else if (const AstNodeCond* const condp = extractCond(rhsp)) {
return new AstConst{rhsp->fileline(), AstConst::BitTrue{}, condTrue};
} else if (const AstNodeCond* const condp = extractCondFromRhs(rhsp)) {
AstNode* const resp
= condTrue ? condp->expr1p()->unlinkFrBack() : condp->expr2p()->unlinkFrBack();
if (condp == rhsp) { //
return resp;
}
if (condp == rhsp) return resp;
if (const AstAnd* const andp = VN_CAST(rhsp, And)) {
UASSERT_OBJ(andp->rhsp() == condp, rhsp, "Should not try to fold this");
return new AstAnd{andp->fileline(), andp->lhsp()->cloneTree(false), resp};
@ -227,17 +606,18 @@ private:
return condTrue ? maskLsb(andp->lhsp()->unlinkFrBack())
: new AstConst{rhsp->fileline(), AstConst::BitFalse()};
}
} else if (VN_IS(rhsp, WordSel) || VN_IS(rhsp, VarRef) || VN_IS(rhsp, Const)) {
} else if (VN_IS(rhsp, ArraySel) || VN_IS(rhsp, WordSel) || VN_IS(rhsp, VarRef)
|| VN_IS(rhsp, Const)) {
return rhsp->cloneTree(false);
}
rhsp->dumpTree("Don't know how to fold expression: ");
rhsp->v3fatalSrc("Don't know how to fold expression");
// LCOV_EXCL_START
if (debug()) rhsp->dumpTree("Don't know how to fold expression: ");
rhsp->v3fatalSrc("Should not try to fold this during conditional merging");
// LCOV_EXCL_STOP
}
void mergeEnd(int lineno) {
UASSERT(m_mgFirstp, "mergeEnd without list " << lineno);
// We might want to recursively merge an AstIf. We stash it in this variable.
const AstNodeIf* recursivep = nullptr;
void mergeEnd() {
UASSERT(m_mgFirstp, "mergeEnd without list");
// Drop leading cheap nodes. These were only added in the hope of finding
// an earlier reduced form, but we failed to do so.
while (m_mgFirstp->user2() && m_mgFirstp != m_mgLastp) {
@ -254,8 +634,11 @@ private:
m_mgLastp = m_mgLastp->backp();
--m_listLenght;
UASSERT_OBJ(m_mgLastp && m_mgLastp->nextp() == nextp, m_mgFirstp,
"Cheap assignment should not be at the front of the list");
"Cheap statement should not be at the front of the list");
}
// If the list contains a single AstNodeIf, we will want to merge its branches.
// If so, keep hold of the AstNodeIf in this variable.
AstNodeIf* recursivep = nullptr;
// Merge if list is longer than one node
if (m_mgFirstp != m_mgLastp) {
UINFO(6, "MergeCond - First: " << m_mgFirstp << " Last: " << m_mgLastp << endl);
@ -266,7 +649,7 @@ private:
// and we also need to keep track of it for comparisons later.
m_mgCondp = m_mgCondp->cloneTree(false);
// Create equivalent 'if' statement and insert it before the first node
AstIf* const resultp = new AstIf(m_mgCondp->fileline(), m_mgCondp);
AstIf* const resultp = new AstIf{m_mgCondp->fileline(), m_mgCondp};
m_mgFirstp->addHereThisAsNext(resultp);
// Unzip the list and insert under branches
AstNode* nextp = m_mgFirstp;
@ -308,10 +691,12 @@ private:
VL_DO_DANGLING(ifp->deleteTree(), ifp);
}
} while (nextp);
// Recursively merge the resulting AstIf
recursivep = resultp;
} else if (const AstNodeIf* const ifp = VN_CAST(m_mgFirstp, NodeIf)) {
// There was nothing to merge this AstNodeIf with, but try to merge it's branches
// Merge the branches of the resulting AstIf after re-analysis
if (resultp->ifsp()) m_workQueuep->push(resultp->ifsp());
if (resultp->elsesp()) m_workQueuep->push(resultp->elsesp());
} else if (AstNodeIf* const ifp = VN_CAST(m_mgFirstp, NodeIf)) {
// There was nothing to merge this AstNodeIf with, so try to merge its branches.
// No re-analysis is required for this, so do it directly below
recursivep = ifp;
}
// Reset state
@ -321,14 +706,13 @@ private:
m_mgNextp = nullptr;
AstNode::user1ClearTree(); // Clear marked variables
AstNode::user2ClearTree();
// Merge recursively within the branches
// Merge recursively within the branches of an un-merged AstNodeIf
if (recursivep) {
iterateAndNextNull(recursivep->ifsp());
// Close list, if there is one at the end of the then branch
if (m_mgFirstp) mergeEnd(__LINE__);
iterateAndNextNull(recursivep->elsesp());
// Close list, if there is one at the end of the else branch
if (m_mgFirstp) mergeEnd(__LINE__);
// Close a pending merge to ensure merge state is
// reset as expected at the end of this function
if (m_mgFirstp) mergeEnd();
}
}
@ -351,47 +735,16 @@ private:
return false;
}
// Check if this node is cheap enough that duplicating it in two branches of an
// AstIf and is hence not likely to cause a performance degradation if doing so.
bool isCheapNode(AstNode* nodep) const {
if (VN_IS(nodep, Comment)) return true;
if (const AstNodeAssign* const assignp = VN_CAST(nodep, NodeAssign)) {
// Check LHS
AstNode* lhsp = assignp->lhsp();
while (AstWordSel* const wselp = VN_CAST(lhsp, WordSel)) {
// WordSel index is not constant, so might be expensive
if (!VN_IS(wselp->bitp(), Const)) return false;
lhsp = wselp->fromp();
}
// LHS is not a VarRef, so might be expensive
if (!VN_IS(lhsp, VarRef)) return false;
// Check RHS
AstNode* rhsp = assignp->rhsp();
while (AstWordSel* const wselp = VN_CAST(rhsp, WordSel)) {
// WordSel index is not constant, so might be expensive
if (!VN_IS(wselp->bitp(), Const)) return false;
rhsp = wselp->fromp();
}
// RHS is not a VarRef or Constant so might be expensive
if (!VN_IS(rhsp, VarRef) && !VN_IS(rhsp, Const)) return false;
// Otherwise it is a cheap assignment
return true;
}
return false;
}
bool addToList(AstNode* nodep, AstNode* condp, int line) {
bool addToList(AstNodeStmt* nodep, AstNode* condp) {
// Set up head of new list if node is first in list
if (!m_mgFirstp) {
UASSERT_OBJ(condp, nodep, "Cannot start new list without condition " << line);
UASSERT_OBJ(condp, nodep, "Cannot start new list without condition");
// Mark variable references in the condition
condp->foreach<AstVarRef>([](const AstVarRef* nodep) { nodep->varp()->user1(1); });
// Now check again if mergeable. We need this to pick up assignments to conditions,
// e.g.: 'c = c ? a : b' at the beginning of the list, which is in fact not mergeable
// because it updates the condition. We simply bail on these.
if (m_checkMergeable(nodep) != Mergeable::YES) {
if ((*m_stmtPropertiesp)(nodep).writesConditionVar()) {
// Clear marked variables
AstNode::user1ClearTree();
// We did not add to the list
@ -400,11 +753,13 @@ private:
m_mgFirstp = nodep;
m_mgCondp = condp;
m_listLenght = 0;
// Add any preceding nodes to the list that would allow us to extend the merge range
for (;;) {
AstNode* const backp = m_mgFirstp->backp();
// Add any preceding nodes to the list that would allow us to extend the merge
// range
while (true) {
AstNodeStmt* const backp = VN_CAST(m_mgFirstp->backp(), NodeStmt);
if (!backp || backp->nextp() != m_mgFirstp) break; // Don't move up the tree
if (m_checkMergeable(backp) != Mergeable::YES) break;
const StmtProperties& props = (*m_stmtPropertiesp)(backp);
if (props.m_isFence || props.writesConditionVar()) break;
if (isSimplifiableNode(backp)) {
++m_listLenght;
m_mgFirstp = backp;
@ -424,59 +779,53 @@ private:
// Set up expected next node in list.
m_mgNextp = nodep->nextp();
// If last under parent, done with current list
if (!m_mgNextp) mergeEnd(__LINE__);
if (!m_mgNextp) mergeEnd();
// We did add to the list
return true;
}
// If this node is the next expected node and is helpful to add to the list, do so,
// otherwise end the current merge. Return true if added, false if ended merge.
bool addIfHelpfulElseEndMerge(AstNode* nodep) {
bool addIfHelpfulElseEndMerge(AstNodeStmt* nodep) {
UASSERT_OBJ(m_mgFirstp, nodep, "List must be open");
if (m_mgNextp == nodep) {
if (isSimplifiableNode(nodep)) {
if (addToList(nodep, nullptr, __LINE__)) return true;
if (addToList(nodep, nullptr)) return true;
} else if (isCheapNode(nodep)) {
nodep->user2(1);
if (addToList(nodep, nullptr, __LINE__)) return true;
if (addToList(nodep, nullptr)) return true;
}
}
// Not added to list, so we are done with the current list
mergeEnd(__LINE__);
mergeEnd();
return false;
}
bool checkOrMakeMergeable(AstNode* nodep) {
const Mergeable reason = m_checkMergeable(nodep);
// If meregeable, we are done
if (reason == Mergeable::YES) return true;
// Node not mergeable.
// If no current list, then this node is just special, move on.
if (!m_mgFirstp) return false;
// Otherwise finish current list
mergeEnd(__LINE__);
// If a tree was not mergeable due to an assignment to a condition,
// then finishing the current list makes it mergeable again.
return reason == Mergeable::NO_COND_ASSIGN;
bool checkOrMakeMergeable(const AstNodeStmt* nodep) {
const StmtProperties& props = (*m_stmtPropertiesp)(nodep);
if (props.m_isFence) return false; // Fence node never mergeable
// If the statement writes a condition variable of a pending merge,
// we must end the pending merge
if (m_mgFirstp && props.writesConditionVar()) mergeEnd();
return true; // Now surely mergeable
}
void mergeEndIfIncompatible(AstNode* nodep, AstNode* condp) {
void mergeEndIfIncompatible(const AstNode* nodep, const AstNode* condp) {
if (m_mgFirstp && (m_mgNextp != nodep || !condp->sameTree(m_mgCondp))) {
// Node in different list, or has different condition. Finish current list.
mergeEnd(__LINE__);
mergeEnd();
}
}
// VISITORS
virtual void visit(AstNodeAssign* nodep) override {
AstNode* const rhsp = nodep->rhsp();
if (const AstNodeCond* const condp = extractCond(rhsp)) {
if (AstNode* const condp = (*m_stmtPropertiesp)(nodep).m_condp) {
// Check if mergeable
if (!checkOrMakeMergeable(nodep)) return;
// Close potentially incompatible pending merge
mergeEndIfIncompatible(nodep, condp->condp());
mergeEndIfIncompatible(nodep, condp);
// Add current node
addToList(nodep, condp->condp(), __LINE__);
addToList(nodep, condp);
} else if (m_mgFirstp) {
addIfHelpfulElseEndMerge(nodep);
}
@ -493,21 +842,22 @@ private:
// Close potentially incompatible pending merge
mergeEndIfIncompatible(nodep, nodep->condp());
// Add current node
addToList(nodep, nodep->condp(), __LINE__);
addToList(nodep, nodep->condp());
}
virtual void visit(AstNodeStmt* nodep) override {
if (m_mgFirstp && addIfHelpfulElseEndMerge(nodep)) return;
iterateChildren(nodep);
}
virtual void visit(AstCFunc* nodep) override {
// Merge function body
if (nodep->stmtsp()) process(nodep->stmtsp());
}
// For speed, only iterate what is necessary.
virtual void visit(AstNetlist* nodep) override { iterateAndNextNull(nodep->modulesp()); }
virtual void visit(AstNodeModule* nodep) override { iterateAndNextNull(nodep->stmtsp()); }
virtual void visit(AstCFunc* nodep) override {
iterateChildren(nodep);
// Close list, if there is one at the end of the function
if (m_mgFirstp) mergeEnd(__LINE__);
}
virtual void visit(AstNodeStmt* nodep) override {
if (m_mgFirstp && addIfHelpfulElseEndMerge(nodep)) return;
iterateChildren(nodep);
}
virtual void visit(AstNode* nodep) override {}
public:
@ -520,6 +870,8 @@ public:
}
};
} // namespace
//######################################################################
// MergeConditionals class functions

View File

@ -30,6 +30,7 @@ struct V3OptionParser::Impl {
// Setting for isOnOffAllowed() and isPartialMatchAllowed()
enum class en : uint8_t {
NONE, // "-opt"
FONOFF, // "-fopt" and "-fno-opt"
ONOFF, // "-opt" and "-no-opt"
VALUE // "-opt val"
};
@ -39,6 +40,7 @@ struct V3OptionParser::Impl {
bool m_undocumented = false; // This option is not documented
public:
virtual bool isValueNeeded() const override final { return MODE == en::VALUE; }
virtual bool isFOnOffAllowed() const override final { return MODE == en::FONOFF; }
virtual bool isOnOffAllowed() const override final { return MODE == en::ONOFF; }
virtual bool isPartialMatchAllowed() const override final { return ALLOW_PARTIAL_MATCH; }
virtual bool isUndocumented() const override { return m_undocumented; }
@ -47,6 +49,7 @@ struct V3OptionParser::Impl {
// Actual action classes
template <typename T> class ActionSet; // "-opt" for bool-ish, "-opt val" for int and string
template <typename BOOL> class ActionFOnOff; // "-fopt" and "-fno-opt" for bool-ish
template <typename BOOL> class ActionOnOff; // "-opt" and "-no-opt" for bool-ish
class ActionCbCall; // Callback without argument for "-opt"
class ActionCbOnOff; // Callback for "-opt" and "-no-opt"
@ -80,6 +83,7 @@ V3OPTION_PARSER_DEF_ACT_CLASS(ActionSet, VOptionBool, m_valp->setTrueOrFalse(tru
V3OPTION_PARSER_DEF_ACT_CLASS(ActionSet, int, *m_valp = std::atoi(argp), en::VALUE);
V3OPTION_PARSER_DEF_ACT_CLASS(ActionSet, string, *m_valp = argp, en::VALUE);
V3OPTION_PARSER_DEF_ACT_CLASS(ActionFOnOff, bool, *m_valp = !hasPrefixFNo(optp), en::FONOFF);
V3OPTION_PARSER_DEF_ACT_CLASS(ActionOnOff, bool, *m_valp = !hasPrefixNo(optp), en::ONOFF);
#ifndef V3OPTION_PARSER_NO_VOPTION_BOOL
V3OPTION_PARSER_DEF_ACT_CLASS(ActionOnOff, VOptionBool, m_valp->setTrueOrFalse(!hasPrefixNo(optp)),
@ -117,12 +121,23 @@ V3OPTION_PARSER_DEF_ACT_CB_CLASS(ActionCbPartialMatchVal, void(const char*, cons
V3OptionParser::ActionIfs* V3OptionParser::find(const char* optp) {
const auto it = m_pimpl->m_options.find(optp);
if (it != m_pimpl->m_options.end()) return it->second.get();
if (it != m_pimpl->m_options.end()) return it->second.get(); // Exact match
for (auto&& act : m_pimpl->m_options) {
if (act.second->isFOnOffAllowed()) { // Find starts with "-fno"
if (const char* const nop
= VString::startsWith(optp, "-fno-") ? (optp + strlen("-fno-")) : nullptr) {
if (act.first.substr(strlen("-f"), std::string::npos)
== nop) { // [-f]opt = [-fno-]opt
return act.second.get();
}
}
}
if (act.second->isOnOffAllowed()) { // Find starts with "-no"
const char* const nop = VString::startsWith(optp, "-no") ? (optp + 3) : nullptr;
if (nop && (act.first == nop || act.first == (string{"-"} + nop))) {
return act.second.get();
if (const char* const nop
= VString::startsWith(optp, "-no") ? (optp + strlen("-no")) : nullptr) {
if (act.first == nop || act.first == (string{"-"} + nop)) {
return act.second.get();
}
}
} else if (act.second->isPartialMatchAllowed()) {
if (VString::startsWith(optp, act.first)) return act.second.get();
@ -143,6 +158,12 @@ V3OptionParser::ActionIfs& V3OptionParser::add(const std::string& opt, ARG arg)
return *insertedResult.first->second;
}
bool V3OptionParser::hasPrefixFNo(const char* strp) {
    // True when the option begins with "-fno" (a leading "--" is treated as "-")
    UASSERT(strp[0] == '-', strp << " does not start with '-'");
    const char* const onedashp = (strp[1] == '-') ? strp + 1 : strp;
    return VString::startsWith(onedashp, "-fno");
}
bool V3OptionParser::hasPrefixNo(const char* strp) {
UASSERT(strp[0] == '-', strp << " does not start with '-'");
if (strp[1] == '-') ++strp;
@ -178,6 +199,10 @@ void V3OptionParser::finalize() {
for (auto&& opt : m_pimpl->m_options) {
if (opt.second->isUndocumented()) continue;
m_pimpl->m_spellCheck.pushCandidate(opt.first);
if (opt.second->isFOnOffAllowed()) {
m_pimpl->m_spellCheck.pushCandidate(
"-fno-" + opt.first.substr(strlen("-f"), std::string::npos));
}
if (opt.second->isOnOffAllowed()) m_pimpl->m_spellCheck.pushCandidate("-no" + opt.first);
}
m_pimpl->m_isFinalized = true;
@ -202,6 +227,7 @@ V3OPTION_PARSER_DEF_OP(Set, VOptionBool*, ActionSet<VOptionBool>)
#endif
V3OPTION_PARSER_DEF_OP(Set, int*, ActionSet<int>)
V3OPTION_PARSER_DEF_OP(Set, string*, ActionSet<string>)
V3OPTION_PARSER_DEF_OP(FOnOff, bool*, ActionFOnOff<bool>)
V3OPTION_PARSER_DEF_OP(OnOff, bool*, ActionOnOff<bool>)
#ifndef V3OPTION_PARSER_NO_VOPTION_BOOL
V3OPTION_PARSER_DEF_OP(OnOff, VOptionBool*, ActionOnOff<VOptionBool>)

View File

@ -66,6 +66,7 @@ private:
// METHODS
ActionIfs* find(const char* optp);
template <class ACT, class ARG> ActionIfs& add(const string& opt, ARG arg);
static bool hasPrefixFNo(const char* strp); // Returns true if strp starts with "-fno"
static bool hasPrefixNo(const char* strp); // Returns true if strp starts with "-no"
public:
@ -87,6 +88,7 @@ class V3OptionParser::ActionIfs VL_NOT_FINAL {
public:
virtual ~ActionIfs() = default;
virtual bool isValueNeeded() const = 0; // Need val of "-opt val"
virtual bool isFOnOffAllowed() const = 0; // true if "-fno-opt" is allowd
virtual bool isOnOffAllowed() const = 0; // true if "-no-opt" is allowd
virtual bool isPartialMatchAllowed() const = 0; // true if "-Wno-" matches "-Wno-fatal"
virtual bool isUndocumented() const = 0; // Will not be suggested in typo
@ -101,13 +103,15 @@ class V3OptionParser::AppendHelper final {
public:
// TYPES
// Tag to specify which operator() to call
struct Set {}; // For ActionSet
struct FOnOff {}; // For ActionFOnOff
struct OnOff {}; // For ActionOnOff
struct Set {}; // For ActionSet
struct CbCall {}; // For ActionCbCall
struct CbOnOff {}; // For ActionOnOff
struct CbVal {}; // For ActionCbVal
struct CbOnOff {}; // For ActionOnOff of ActionFOnOff
struct CbPartialMatch {}; // For ActionCbPartialMatch
struct CbPartialMatchVal {}; // For ActionCbPartialMatchVal
struct CbVal {}; // For ActionCbVal
private:
// MEMBERS
@ -122,6 +126,7 @@ public:
ActionIfs& operator()(const char* optp, Set, int*) const;
ActionIfs& operator()(const char* optp, Set, string*) const;
ActionIfs& operator()(const char* optp, FOnOff, bool*) const;
ActionIfs& operator()(const char* optp, OnOff, bool*) const;
#ifndef V3OPTION_PARSER_NO_VOPTION_BOOL
ActionIfs& operator()(const char* optp, OnOff, VOptionBool*) const;
@ -144,13 +149,14 @@ public:
#define V3OPTION_PARSER_DECL_TAGS \
const auto Set VL_ATTR_UNUSED = V3OptionParser::AppendHelper::Set{}; \
const auto FOnOff VL_ATTR_UNUSED = V3OptionParser::AppendHelper::FOnOff{}; \
const auto OnOff VL_ATTR_UNUSED = V3OptionParser::AppendHelper::OnOff{}; \
const auto CbCall VL_ATTR_UNUSED = V3OptionParser::AppendHelper::CbCall{}; \
const auto CbOnOff VL_ATTR_UNUSED = V3OptionParser::AppendHelper::CbOnOff{}; \
const auto CbVal VL_ATTR_UNUSED = V3OptionParser::AppendHelper::CbVal{}; \
const auto CbPartialMatch VL_ATTR_UNUSED = V3OptionParser::AppendHelper::CbPartialMatch{}; \
const auto CbPartialMatchVal VL_ATTR_UNUSED \
= V3OptionParser::AppendHelper::CbPartialMatchVal {}
= V3OptionParser::AppendHelper::CbPartialMatchVal{}; \
const auto CbVal VL_ATTR_UNUSED = V3OptionParser::AppendHelper::CbVal{};
//######################################################################

View File

@ -775,8 +775,16 @@ void V3Options::notify() {
&& !v3Global.opt.xmlOnly());
}
// --trace-threads implies --threads 1 unless explicitly specified
if (traceThreads() && !threads()) m_threads = 1;
if (trace()) {
// With --trace-fst, --trace-threads implies --threads 1 unless explicitly specified
if (traceFormat().fst() && traceThreads() && !threads()) m_threads = 1;
// With --trace, --trace-threads is ignored
if (traceFormat().vcd()) m_traceThreads = threads() ? 1 : 0;
}
UASSERT(!(useTraceParallel() && useTraceOffload()),
"Cannot use both parallel and offloaded tracing");
// Default split limits if not specified
if (m_outputSplitCFuncs < 0) m_outputSplitCFuncs = m_outputSplit;
@ -1075,6 +1083,28 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
});
DECL_OPTION("-flatten", OnOff, &m_flatten);
DECL_OPTION("-facyc-simp", FOnOff, &m_fAcycSimp);
DECL_OPTION("-fassemble", FOnOff, &m_fAssemble);
DECL_OPTION("-fcase", FOnOff, &m_fCase);
DECL_OPTION("-fcombine", FOnOff, &m_fCombine);
DECL_OPTION("-fconst", FOnOff, &m_fConst);
DECL_OPTION("-fconst-bit-op-tree", FOnOff, &m_fConstBitOpTree);
DECL_OPTION("-fdedup", FOnOff, &m_fDedupe);
DECL_OPTION("-fexpand", FOnOff, &m_fExpand);
DECL_OPTION("-fgate", FOnOff, &m_fGate);
DECL_OPTION("-finline", FOnOff, &m_fInline);
DECL_OPTION("-flife", FOnOff, &m_fLife);
DECL_OPTION("-flife-post", FOnOff, &m_fLifePost);
DECL_OPTION("-flocalize", FOnOff, &m_fLocalize);
DECL_OPTION("-fmerge-cond", FOnOff, &m_fMergeCond);
DECL_OPTION("-fmerge-const-pool", FOnOff, &m_fMergeConstPool);
DECL_OPTION("-freloop", FOnOff, &m_fReloop);
DECL_OPTION("-freorder", FOnOff, &m_fReorder);
DECL_OPTION("-fsplit", FOnOff, &m_fSplit);
DECL_OPTION("-fsubst", FOnOff, &m_fSubst);
DECL_OPTION("-fsubst-const", FOnOff, &m_fSubstConst);
DECL_OPTION("-ftable", FOnOff, &m_fTable);
DECL_OPTION("-G", CbPartialMatch, [this](const char* optp) { addParameter(optp, false); });
DECL_OPTION("-gate-stmts", Set, &m_gateStmts);
DECL_OPTION("-gdb", CbCall, []() {}); // Processed only in bin/verilator shell
@ -1144,50 +1174,51 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
}
});
DECL_OPTION("-max-num-width", Set, &m_maxNumWidth);
DECL_OPTION("-merge-const-pool", OnOff, &m_mergeConstPool);
DECL_OPTION("-mod-prefix", Set, &m_modPrefix);
DECL_OPTION("-O", CbPartialMatch, [this](const char* optp) {
// Optimization
DECL_OPTION("-O0", CbCall, [this]() { optimize(0); });
DECL_OPTION("-O1", CbCall, [this]() { optimize(1); });
DECL_OPTION("-O2", CbCall, [this]() { optimize(2); });
DECL_OPTION("-O3", CbCall, [this]() { optimize(3); });
DECL_OPTION("-O", CbPartialMatch, [this, fl](const char* optp) {
// Optimization, e.g. -O1rX
// LCOV_EXCL_START
fl->v3warn(DEPRECATED, "Option -O<letter> is deprecated. "
"Use -f<optimization> or -fno-<optimization> instead.");
for (const char* cp = optp; *cp; ++cp) {
const bool flag = isupper(*cp);
switch (tolower(*cp)) {
case '0': optimize(0); break; // 0=all off
case '1': optimize(1); break; // 1=all on
case '2': optimize(2); break; // 2=not used
case '3': optimize(3); break; // 3=high
case 'a': m_oTable = flag; break;
case 'b': m_oCombine = flag; break;
case 'c': m_oConst = flag; break;
case 'd': m_oDedupe = flag; break;
case 'e': m_oCase = flag; break;
// f
case 'g': m_oGate = flag; break;
// h
case 'i': m_oInline = flag; break;
// j
case 'k': m_oSubstConst = flag; break;
case 'l': m_oLife = flag; break;
case 'm': m_oAssemble = flag; break;
// n
case 'o':
m_oConstBitOpTree = flag;
break; // Can remove ~2022-01 when stable
// o will be used as an escape for a second character of optimization disables
case '0': optimize(0); break;
case '1': optimize(1); break;
case '2': optimize(2); break;
case '3': optimize(3); break;
case 'a': m_fTable = flag; break; // == -fno-table
case 'b': m_fCombine = flag; break; // == -fno-combine
case 'c': m_fConst = flag; break; // == -fno-const
case 'd': m_fDedupe = flag; break; // == -fno-dedup
case 'e': m_fCase = flag; break; // == -fno-case
case 'g': m_fGate = flag; break; // == -fno-gate
case 'i': m_fInline = flag; break; // == -fno-inline
case 'k': m_fSubstConst = flag; break; // == -fno-subst-const
case 'l': m_fLife = flag; break; // == -fno-life
case 'm': m_fAssemble = flag; break; // == -fno-assemble
case 'o': m_fConstBitOpTree = flag; break; // == -fno-const-bit-op-tree
case 'p':
m_public = !flag;
break; // With -Op so flag=0, we want public on so few optimizations done
// q
case 'r': m_oReorder = flag; break;
case 's': m_oSplit = flag; break;
case 't': m_oLifePost = flag; break;
case 'u': m_oSubst = flag; break;
case 'v': m_oReloop = flag; break;
case 'w': m_oMergeCond = flag; break;
case 'x': m_oExpand = flag; break;
case 'y': m_oAcycSimp = flag; break;
case 'z': m_oLocalize = flag; break;
default: break; // No error, just ignore
case 'r': m_fReorder = flag; break; // == -fno-reorder
case 's': m_fSplit = flag; break; // == -fno-split
case 't': m_fLifePost = flag; break; // == -fno-life-post
case 'u': m_fSubst = flag; break; // == -fno-subst
case 'v': m_fReloop = flag; break; // == -fno-reloop
case 'w': m_fMergeCond = flag; break; // == -fno-merge-cond
case 'x': m_fExpand = flag; break; // == -fno-expand
case 'y': m_fAcycSimp = flag; break; // == -fno-acyc-simp
case 'z': m_fLocalize = flag; break; // == -fno-localize
default:
break; // No error, just ignore
// LCOV_EXCL_STOP
}
}
});
@ -1352,7 +1383,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
DECL_OPTION("-trace-threads", CbVal, [this, fl](const char* valp) {
m_trace = true;
m_traceThreads = std::atoi(valp);
if (m_traceThreads < 0) fl->v3fatal("--trace-threads must be >= 0: " << valp);
if (m_traceThreads < 1) fl->v3fatal("--trace-threads must be >= 1: " << valp);
});
DECL_OPTION("-trace-underscore", OnOff, &m_traceUnderscore);
@ -1781,26 +1812,26 @@ int V3Options::dumpTreeLevel(const string& srcfile_path) {
void V3Options::optimize(int level) {
// Set all optimizations to on/off
const bool flag = level > 0;
m_oAcycSimp = flag;
m_oAssemble = flag;
m_oCase = flag;
m_oCombine = flag;
m_oConst = flag;
m_oConstBitOpTree = flag;
m_oDedupe = flag;
m_oExpand = flag;
m_oGate = flag;
m_oInline = flag;
m_oLife = flag;
m_oLifePost = flag;
m_oLocalize = flag;
m_oMergeCond = flag;
m_oReloop = flag;
m_oReorder = flag;
m_oSplit = flag;
m_oSubst = flag;
m_oSubstConst = flag;
m_oTable = flag;
m_fAcycSimp = flag;
m_fAssemble = flag;
m_fCase = flag;
m_fCombine = flag;
m_fConst = flag;
m_fConstBitOpTree = flag;
m_fDedupe = flag;
m_fExpand = flag;
m_fGate = flag;
m_fInline = flag;
m_fLife = flag;
m_fLifePost = flag;
m_fLocalize = flag;
m_fMergeCond = flag;
m_fReloop = flag;
m_fReorder = flag;
m_fSplit = flag;
m_fSubst = flag;
m_fSubstConst = flag;
m_fTable = flag;
// And set specific optimization levels
if (level >= 3) {
m_inlineMult = -1; // Maximum inlining

View File

@ -246,7 +246,6 @@ private:
bool m_lintOnly = false; // main switch: --lint-only
bool m_gmake = false; // main switch: --make gmake
bool m_main = false; // main swithc: --main
bool m_mergeConstPool = true; // main switch: --merge-const-pool
bool m_outFormatOk = false; // main switch: --cc, --sc or --sp was specified
bool m_pedantic = false; // main switch: --Wpedantic
bool m_pinsScUint = false; // main switch: --pins-sc-uint
@ -340,27 +339,27 @@ private:
V3LangCode m_defaultLanguage; // main switch: --language
// MEMBERS (optimizations)
// // main switch: -Op: --public
bool m_oAcycSimp; // main switch: -Oy: acyclic pre-optimizations
bool m_oAssemble; // main switch: -Om: assign assemble
bool m_oCase; // main switch: -Oe: case tree conversion
bool m_oCombine; // main switch: -Ob: common icode packing
bool m_oConst; // main switch: -Oc: constant folding
bool m_oConstBitOpTree; // main switch: -Oo: constant bit op tree
bool m_oDedupe; // main switch: -Od: logic deduplication
bool m_oExpand; // main switch: -Ox: expansion of C macros
bool m_oGate; // main switch: -Og: gate wire elimination
bool m_oInline; // main switch: -Oi: module inlining
bool m_oLife; // main switch: -Ol: variable lifetime
bool m_oLifePost; // main switch: -Ot: delayed assignment elimination
bool m_oLocalize; // main switch: -Oz: convert temps to local variables
bool m_oMergeCond; // main switch: -Ob: merge conditionals
bool m_oReloop; // main switch: -Ov: reform loops
bool m_oReorder; // main switch: -Or: reorder assignments in blocks
bool m_oSplit; // main switch: -Os: always assignment splitting
bool m_oSubst; // main switch: -Ou: substitute expression temp values
bool m_oSubstConst; // main switch: -Ok: final constant substitution
bool m_oTable; // main switch: -Oa: lookup table creation
bool m_fAcycSimp; // main switch: -fno-acyc-simp: acyclic pre-optimizations
bool m_fAssemble; // main switch: -fno-assemble: assign assemble
bool m_fCase; // main switch: -fno-case: case tree conversion
bool m_fCombine; // main switch: -fno-combine: common icode packing
bool m_fConst; // main switch: -fno-const: constant folding
bool m_fConstBitOpTree; // main switch: -fno-const-bit-op-tree constant bit op tree
bool m_fDedupe; // main switch: -fno-dedupe: logic deduplication
bool m_fExpand; // main switch: -fno-expand: expansion of C macros
bool m_fGate; // main switch: -fno-gate: gate wire elimination
bool m_fInline; // main switch: -fno-inline: module inlining
bool m_fLife; // main switch: -fno-life: variable lifetime
bool m_fLifePost; // main switch: -fno-life-post: delayed assignment elimination
bool m_fLocalize; // main switch: -fno-localize: convert temps to local variables
bool m_fMergeCond; // main switch: -fno-merge-cond: merge conditionals
bool m_fMergeConstPool = true; // main switch: --fmerge-const-pool
bool m_fReloop; // main switch: -fno-reloop: reform loops
bool m_fReorder; // main switch: -fno-reorder: reorder assignments in blocks
bool m_fSplit; // main switch: -fno-split: always assignment splitting
bool m_fSubst; // main switch: -fno-subst: substitute expression temp values
bool m_fSubstConst; // main switch: -fno-subst-const: final constant substitution
bool m_fTable; // main switch: -fno-table: lookup table creation
// clang-format on
bool m_available = false; // Set to true at the end of option parsing
@ -458,7 +457,6 @@ public:
bool traceStructs() const { return m_traceStructs; }
bool traceUnderscore() const { return m_traceUnderscore; }
bool main() const { return m_main; }
bool mergeConstPool() const { return m_mergeConstPool; }
bool outFormatOk() const { return m_outFormatOk; }
bool keepTempFiles() const { return (V3Error::debugDefault() != 0); }
bool pedantic() const { return m_pedantic; }
@ -516,8 +514,10 @@ public:
int traceMaxArray() const { return m_traceMaxArray; }
int traceMaxWidth() const { return m_traceMaxWidth; }
int traceThreads() const { return m_traceThreads; }
bool useTraceOffloadThread() const {
return traceThreads() == 0 ? 0 : traceThreads() - traceFormat().fst();
bool useTraceOffload() const { return trace() && traceFormat().fst() && traceThreads() > 1; }
bool useTraceParallel() const { return trace() && traceFormat().vcd() && threads() > 1; }
unsigned vmTraceThreads() const {
return useTraceParallel() ? threads() : useTraceOffload() ? 1 : 0;
}
int unrollCount() const { return m_unrollCount; }
int unrollStmts() const { return m_unrollStmts; }
@ -571,26 +571,27 @@ public:
bool isNoClocker(const string& signame) const;
// ACCESSORS (optimization options)
bool oAcycSimp() const { return m_oAcycSimp; }
bool oAssemble() const { return m_oAssemble; }
bool oCase() const { return m_oCase; }
bool oCombine() const { return m_oCombine; }
bool oConst() const { return m_oConst; }
bool oConstBitOpTree() const { return m_oConstBitOpTree; }
bool oDedupe() const { return m_oDedupe; }
bool oExpand() const { return m_oExpand; }
bool oGate() const { return m_oGate; }
bool oInline() const { return m_oInline; }
bool oLife() const { return m_oLife; }
bool oLifePost() const { return m_oLifePost; }
bool oLocalize() const { return m_oLocalize; }
bool oMergeCond() const { return m_oMergeCond; }
bool oReloop() const { return m_oReloop; }
bool oReorder() const { return m_oReorder; }
bool oSplit() const { return m_oSplit; }
bool oSubst() const { return m_oSubst; }
bool oSubstConst() const { return m_oSubstConst; }
bool oTable() const { return m_oTable; }
bool fAcycSimp() const { return m_fAcycSimp; }
bool fAssemble() const { return m_fAssemble; }
bool fCase() const { return m_fCase; }
bool fCombine() const { return m_fCombine; }
bool fConst() const { return m_fConst; }
bool fConstBitOpTree() const { return m_fConstBitOpTree; }
bool fDedupe() const { return m_fDedupe; }
bool fExpand() const { return m_fExpand; }
bool fGate() const { return m_fGate; }
bool fInline() const { return m_fInline; }
bool fLife() const { return m_fLife; }
bool fLifePost() const { return m_fLifePost; }
bool fLocalize() const { return m_fLocalize; }
bool fMergeCond() const { return m_fMergeCond; }
bool fMergeConstPool() const { return m_fMergeConstPool; }
bool fReloop() const { return m_fReloop; }
bool fReorder() const { return m_fReorder; }
bool fSplit() const { return m_fSplit; }
bool fSubst() const { return m_fSubst; }
bool fSubstConst() const { return m_fSubstConst; }
bool fTable() const { return m_fTable; }
string traceClassBase() const { return m_traceFormat.classBase(); }
string traceClassLang() const { return m_traceFormat.classBase() + (systemC() ? "Sc" : "C"); }

View File

@ -133,7 +133,7 @@ private:
&& !constp->num().isString(); // Not a string
if (useConstPool) {
// Extract into constant pool.
const bool merge = v3Global.opt.mergeConstPool();
const bool merge = v3Global.opt.fMergeConstPool();
varp = v3Global.rootp()->constPoolp()->findConst(constp, merge)->varp();
nodep->deleteTree();
++m_extractedToConstPool;

View File

@ -180,6 +180,10 @@ private:
TraceActivityVertex* const m_alwaysVtxp; // "Always trace" vertex
bool m_finding = false; // Pass one of algorithm?
// Trace parallelism. Only VCD tracing can be parallelized at this time.
const uint32_t m_parallelism
= v3Global.opt.useTraceParallel() ? static_cast<uint32_t>(v3Global.opt.threads()) : 1;
VDouble0 m_statUniqSigs; // Statistic tracking
VDouble0 m_statUniqCodes; // Statistic tracking
@ -388,7 +392,7 @@ private:
if (!it->second->duplicatep()) {
uint32_t cost = 0;
const AstTraceDecl* const declp = it->second->nodep();
// The number of comparisons required by tracep->chg*
// The number of comparisons required by bufp->chg*
cost += declp->isWide() ? declp->codeInc() : 1;
// Arrays are traced by element
cost *= declp->arrayRange().ranged() ? declp->arrayRange().elements() : 1;
@ -494,7 +498,7 @@ private:
};
if (isTopFunc) {
// Top functions
funcp->argTypes("void* voidSelf, " + v3Global.opt.traceClassBase() + "* tracep");
funcp->argTypes("void* voidSelf, " + v3Global.opt.traceClassBase() + "::Buffer* bufp");
addInitStr(voidSelfAssign(m_topModp));
addInitStr(symClassAssign());
// Add global activity check to change dump functions
@ -508,32 +512,33 @@ private:
m_regFuncp->addStmtsp(new AstText(flp, "tracep->addChgCb(", true));
}
m_regFuncp->addStmtsp(new AstAddrOfCFunc(flp, funcp));
m_regFuncp->addStmtsp(new AstText(flp, ", vlSelf);\n", true));
const string threadPool{m_parallelism > 1 ? "vlSymsp->__Vm_threadPoolp" : "nullptr"};
m_regFuncp->addStmtsp(new AstText(flp, ", vlSelf, " + threadPool + ");\n", true));
} else {
// Sub functions
funcp->argTypes(v3Global.opt.traceClassBase() + "* tracep");
funcp->argTypes(v3Global.opt.traceClassBase() + "::Buffer* bufp");
// Setup base references. Note in rare occasions we can end up with an empty trace
// sub function, hence the VL_ATTR_UNUSED attributes.
if (full) {
// Full dump sub function
addInitStr("uint32_t* const oldp VL_ATTR_UNUSED = "
"tracep->oldp(vlSymsp->__Vm_baseCode);\n");
"bufp->oldp(vlSymsp->__Vm_baseCode);\n");
} else {
// Change dump sub function
if (v3Global.opt.useTraceOffloadThread()) {
if (v3Global.opt.useTraceOffload()) {
addInitStr("const uint32_t base VL_ATTR_UNUSED = "
"vlSymsp->__Vm_baseCode + "
+ cvtToStr(baseCode) + ";\n");
addInitStr("if (false && tracep) {} // Prevent unused\n");
addInitStr("if (false && bufp) {} // Prevent unused\n");
} else {
addInitStr("uint32_t* const oldp VL_ATTR_UNUSED = "
"tracep->oldp(vlSymsp->__Vm_baseCode + "
"bufp->oldp(vlSymsp->__Vm_baseCode + "
+ cvtToStr(baseCode) + ");\n");
}
}
// Add call to top function
AstCCall* const callp = new AstCCall(funcp->fileline(), funcp);
callp->argTypes("tracep");
callp->argTypes("bufp");
topFuncp->addStmtsp(callp);
}
// Done
@ -728,7 +733,7 @@ private:
// We will split functions such that each have to dump roughly the same amount of data
// for this we need to keep tack of the number of codes used by the trace functions.
uint32_t nFullCodes = 0; // Number of non-duplicate codes (need to go into full* dump)
uint32_t nChgCodes = 0; // Number of non-consant codes (need to go in to chg* dump)
uint32_t nChgCodes = 0; // Number of non-constant codes (need to go in to chg* dump)
sortTraces(traces, nFullCodes, nChgCodes);
UINFO(5, "nFullCodes: " << nFullCodes << " nChgCodes: " << nChgCodes << endl);
@ -747,13 +752,11 @@ private:
m_regFuncp->isLoose(true);
m_topScopep->addActivep(m_regFuncp);
const int parallelism = 1; // Note: will bump this later, code below works for any value
// Create the full dump functions, also allocates signal numbers
createFullTraceFunction(traces, nFullCodes, parallelism);
createFullTraceFunction(traces, nFullCodes, m_parallelism);
// Create the incremental dump functions
createChgTraceFunctions(traces, nChgCodes, parallelism);
createChgTraceFunctions(traces, nChgCodes, m_parallelism);
// Remove refs to traced values from TraceDecl nodes, these have now moved under
// TraceInc

View File

@ -504,6 +504,7 @@ private:
// width: LHS + RHS
AstNodeDType* const vdtypep = m_vup->dtypeNullSkipRefp();
userIterate(vdtypep, WidthVP(SELF, BOTH).p());
// Conversions
if (VN_IS(vdtypep, QueueDType)) {
// Queue "element 0" is lhsp, so we need to swap arguments
auto* const newp = new AstConsQueue(nodep->fileline(), nodep->rhsp()->unlinkFrBack(),
@ -521,6 +522,16 @@ private:
userIterateChildren(newp, m_vup);
return;
}
if (VN_IS(vdtypep, UnpackArrayDType)) {
auto* const newp = new AstPattern{nodep->fileline(), nullptr};
patConcatConvertRecurse(newp, nodep);
nodep->replaceWith(newp);
VL_DO_DANGLING(pushDeletep(nodep), nodep);
userIterate(newp, m_vup);
return;
}
// Concat handling
if (m_vup->prelim()) {
if (VN_IS(vdtypep, AssocArrayDType) //
|| VN_IS(vdtypep, DynArrayDType) //
@ -662,7 +673,8 @@ private:
}
AstNodeDType* const vdtypep = m_vup->dtypeNullSkipRefp();
if (VN_IS(vdtypep, QueueDType) || VN_IS(vdtypep, DynArrayDType)) {
if (VN_IS(vdtypep, QueueDType) || VN_IS(vdtypep, DynArrayDType)
|| VN_IS(vdtypep, UnpackArrayDType)) {
if (times != 1)
nodep->v3warn(E_UNSUPPORTED, "Unsupported: Non-1 replication to form "
<< vdtypep->prettyDTypeNameQ()
@ -674,7 +686,7 @@ private:
VL_DO_DANGLING(pushDeletep(nodep), nodep);
return;
}
if (VN_IS(vdtypep, AssocArrayDType) || VN_IS(vdtypep, UnpackArrayDType)) {
if (VN_IS(vdtypep, AssocArrayDType)) {
nodep->v3warn(E_UNSUPPORTED, "Unsupported: Replication to form "
<< vdtypep->prettyDTypeNameQ() << " data type");
}
@ -6236,6 +6248,21 @@ private:
return patmap;
}
void patConcatConvertRecurse(AstPattern* patternp, AstConcat* nodep) {
if (AstConcat* lhsp = VN_CAST(nodep->lhsp(), Concat)) {
patConcatConvertRecurse(patternp, lhsp);
} else {
patternp->addItemsp(new AstPatMember{nodep->lhsp()->fileline(),
nodep->lhsp()->unlinkFrBack(), nullptr, nullptr});
}
if (AstConcat* rhsp = VN_CAST(nodep->rhsp(), Concat)) {
patConcatConvertRecurse(patternp, rhsp);
} else {
patternp->addItemsp(new AstPatMember{nodep->rhsp()->fileline(),
nodep->rhsp()->unlinkFrBack(), nullptr, nullptr});
}
}
void makeOpenArrayShell(AstNodeFTaskRef* nodep) {
UINFO(4, "Replicate openarray function " << nodep->taskp() << endl);
AstNodeFTask* const oldTaskp = nodep->taskp();

View File

@ -237,7 +237,7 @@ static void process() {
// Module inlining
// Cannot remove dead variables after this, as alias information for final
// V3Scope's V3LinkDot is in the AstVar.
if (v3Global.opt.oInline()) {
if (v3Global.opt.fInline()) {
V3Inline::inlineAll(v3Global.rootp());
V3LinkDot::linkDotArrayed(v3Global.rootp()); // Cleanup as made new modules
}
@ -308,11 +308,11 @@ static void process() {
// Push constants across variables and remove redundant assignments
V3Const::constifyAll(v3Global.rootp());
if (v3Global.opt.oLife()) V3Life::lifeAll(v3Global.rootp());
if (v3Global.opt.fLife()) V3Life::lifeAll(v3Global.rootp());
// Make large low-fanin logic blocks into lookup tables
// This should probably be done much later, once we have common logic elimination.
if (!v3Global.opt.lintOnly() && v3Global.opt.oTable()) {
if (!v3Global.opt.lintOnly() && v3Global.opt.fTable()) {
V3Table::tableAll(v3Global.rootp());
}
@ -326,7 +326,7 @@ static void process() {
V3Active::activeAll(v3Global.rootp());
// Split single ALWAYS blocks into multiple blocks for better ordering chances
if (v3Global.opt.oSplit()) V3Split::splitAlwaysAll(v3Global.rootp());
if (v3Global.opt.fSplit()) V3Split::splitAlwaysAll(v3Global.rootp());
V3SplitAs::splitAsAll(v3Global.rootp());
// Create tracing sample points, before we start eliminating signals
@ -338,11 +338,11 @@ static void process() {
// Gate-based logic elimination; eliminate signals and push constant across cell boundaries
// Instant propagation makes lots-o-constant reduction possibilities.
if (v3Global.opt.oGate()) {
if (v3Global.opt.fGate()) {
V3Gate::gateAll(v3Global.rootp());
// V3Gate calls constant propagation itself.
} else {
v3info("Command Line disabled gate optimization with -Og/-O0. "
v3info("Command Line disabled gate optimization with -fno-gate. "
"This may cause ordering problems.");
}
@ -361,7 +361,7 @@ static void process() {
}
// Reorder assignments in pipelined blocks
if (v3Global.opt.oReorder()) V3Split::splitReorderAll(v3Global.rootp());
if (v3Global.opt.fReorder()) V3Split::splitReorderAll(v3Global.rootp());
// Create delayed assignments
// This creates lots of duplicate ACTIVES so ActiveTop needs to be after this step
@ -383,12 +383,12 @@ static void process() {
// Cleanup any dly vars or other temps that are simple assignments
// Life must be done before Subst, as it assumes each CFunc under
// _eval is called only once.
if (v3Global.opt.oLife()) {
if (v3Global.opt.fLife()) {
V3Const::constifyAll(v3Global.rootp());
V3Life::lifeAll(v3Global.rootp());
}
if (v3Global.opt.oLifePost()) V3LifePost::lifepostAll(v3Global.rootp());
if (v3Global.opt.fLifePost()) V3LifePost::lifepostAll(v3Global.rootp());
// Remove unused vars
V3Const::constifyAll(v3Global.rootp());
@ -415,13 +415,13 @@ static void process() {
v3Global.assertScoped(false);
// Move variables from modules to function local variables where possible
if (v3Global.opt.oLocalize()) V3Localize::localizeAll(v3Global.rootp());
if (v3Global.opt.fLocalize()) V3Localize::localizeAll(v3Global.rootp());
// Remove remaining scopes; make varrefs/funccalls relative to current module
V3Descope::descopeAll(v3Global.rootp());
// Icache packing; combine common code in each module's functions into subroutines
if (v3Global.opt.oCombine()) V3Combine::combineAll(v3Global.rootp());
if (v3Global.opt.fCombine()) V3Combine::combineAll(v3Global.rootp());
}
V3Error::abortIfErrors();
@ -445,30 +445,30 @@ static void process() {
}
// Expand macros and wide operators into C++ primitives
if (!v3Global.opt.lintOnly() && !v3Global.opt.xmlOnly() && v3Global.opt.oExpand()) {
if (!v3Global.opt.lintOnly() && !v3Global.opt.xmlOnly() && v3Global.opt.fExpand()) {
V3Expand::expandAll(v3Global.rootp());
}
// Propagate constants across WORDSEL arrayed temporaries
if (!v3Global.opt.xmlOnly() && v3Global.opt.oSubst()) {
if (!v3Global.opt.xmlOnly() && v3Global.opt.fSubst()) {
// Constant folding of expanded stuff
V3Const::constifyCpp(v3Global.rootp());
V3Subst::substituteAll(v3Global.rootp());
}
if (!v3Global.opt.xmlOnly() && v3Global.opt.oSubstConst()) {
if (!v3Global.opt.xmlOnly() && v3Global.opt.fSubstConst()) {
// Constant folding of substitutions
V3Const::constifyCpp(v3Global.rootp());
V3Dead::deadifyAll(v3Global.rootp());
}
if (!v3Global.opt.lintOnly() && !v3Global.opt.xmlOnly()) {
if (v3Global.opt.oMergeCond()) {
if (v3Global.opt.fMergeCond()) {
// Merge conditionals
V3MergeCond::mergeAll(v3Global.rootp());
}
if (v3Global.opt.oReloop()) {
if (v3Global.opt.fReloop()) {
// Reform loops to reduce code size
// Must be after all Sel/array index based optimizations
V3Reloop::reloopAll(v3Global.rootp());

View File

@ -77,7 +77,6 @@ my $opt_gdbbt;
my $opt_gdbsim;
my $opt_hashset;
my $opt_jobs = 1;
my $opt_optimize;
my $opt_quiet;
my $opt_rerun;
my $opt_rrsim;
@ -104,7 +103,6 @@ if (! GetOptions(
"hashset=s" => \$opt_hashset,
"help" => \&usage,
"j=i" => \$opt_jobs,
"optimize:s" => \$opt_optimize,
"quiet!" => \$opt_quiet,
"rerun!" => \$opt_rerun,
"rr!" => \$opt_rr,
@ -661,7 +659,7 @@ sub new {
verilator_define => 'VERILATOR',
verilator_flags => ["-cc",
"-Mdir $self->{obj_dir}",
"-OD", # As currently disabled unless -O3
"--fdedup", # As currently disabled unless -O3
"--debug-check",
"--comp-limit-members 10", ],
verilator_flags2 => [],
@ -924,7 +922,6 @@ sub compile_vlt_flags {
unshift @verilator_flags, "--trace" if $opt_trace;
my $threads = ::calc_threads($Vltmt_threads);
unshift @verilator_flags, "--threads $threads" if $param{vltmt} && $checkflags !~ /-threads /;
unshift @verilator_flags, "--trace-threads 1" if $param{vltmt} && $checkflags =~ /-trace /;
unshift @verilator_flags, "--trace-threads 2" if $param{vltmt} && $checkflags =~ /-trace-fst /;
unshift @verilator_flags, "--debug-partition" if $param{vltmt};
unshift @verilator_flags, "-CFLAGS -ggdb -LDFLAGS -ggdb" if $opt_gdbsim;
@ -935,19 +932,6 @@ sub compile_vlt_flags {
$param{make_main} && $param{verilator_make_gmake};
unshift @verilator_flags, "../" . $self->{main_filename} if
$param{make_main} && $param{verilator_make_gmake};
if (defined $opt_optimize) {
my $letters = "";
if ($opt_optimize =~ /[a-zA-Z]/) {
$letters = $opt_optimize;
} else { # Randomly turn on/off different optimizations
foreach my $l ('a' .. 'z') {
$letters .= ((rand() > 0.5) ? $l : uc $l);
}
unshift @verilator_flags, "--trace" if rand() > 0.5;
unshift @verilator_flags, "--coverage" if rand() > 0.5;
}
unshift @verilator_flags, "--O" . $letters;
}
my @cmdargs = (
"--prefix " . $param{VM_PREFIX},
@ -2907,11 +2891,6 @@ Displays this message and program version and exits.
Run number of parallel tests, or 0 to determine the count based on the
number of cores installed. Requires Perl's Parallel::Forker package.
=item --optimize
Randomly turn on/off different optimizations. With specific flags,
use those optimization settings
=item --quiet
Suppress all output except for failures and progress messages every 15

View File

@ -15,7 +15,7 @@ top_filename("t/t_altera_lpm.v");
$module =~ s/_noinl//;
compile(
verilator_flags2 => ["--top-module ${module}", "-Oi"]
verilator_flags2 => ["--top-module ${module}", "-fno-inline"]
);
ok(1);

View File

@ -12,7 +12,7 @@ scenarios(vlt_all => 1);
top_filename("t/t_alw_reorder.v");
compile(
verilator_flags2 => ["--stats -Or"],
verilator_flags2 => ["--stats -fno-reorder"],
);
file_grep($Self->{stats}, qr/Optimizations, Split always\s+(\d+)/i, 0);

View File

@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
scenarios(simulator => 1);
compile(
verilator_flags2 => ["-O0 -OG"],
verilator_flags2 => ["-O0 -fgate"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t_assign_slice_overflow.v");
compile(
verilator_flags2 => ["-Ox"],
verilator_flags2 => ["-fno-expand"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(vlt => 1);
top_filename("t/t_case_66bits.v");
compile(
verilator_flags2 => ['-Ox'],
verilator_flags2 => ['-fno-expand'],
);
execute(

View File

@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
scenarios(simulator => 1);
compile(
verilator_flags2 => ["--trace --Os -x-assign 0"],
verilator_flags2 => ["--trace --fno-split -x-assign 0"],
);
execute(

View File

@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
scenarios(simulator => 1);
compile(
verilator_flags2 => ["--stats --O3 -x-assign fast"],
verilator_flags2 => ["--stats -O3 -x-assign fast"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(vlt => 1);
top_filename("t/t_case_write1.v");
compile(
verilator_flags2 => ['-Ox'],
verilator_flags2 => ['-fno-expand'],
);
execute(

View File

@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
scenarios(simulator => 1);
compile(
verilator_flags2 => ["--stats --O3 -x-assign fast"],
verilator_flags2 => ["--stats -O3 -x-assign fast"],
);
execute(

View File

@ -2,29 +2,20 @@
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2003-2013 by Wilson Snyder. This program is free software; you
# Copyright 2022 by Wilson Snyder. This program is free software; you
# can redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
scenarios(vlt => 1);
scenarios(simulator => 1);
compile(
make_top_shell => 0,
make_main => 0,
v_flags2 => ["--trace --exe $Self->{t_dir}/t_trace_c_api.cpp",
"-CFLAGS -DVERILATED_VCD_TEST",
"-CFLAGS -DVL_TRACE_VCD_OLD_API"],
);
execute(
check_finished => 1,
);
# vcddiff bug crashes
#vcd_identical("$Self->{obj_dir}/simx.vcd",
# $Self->{golden_filename});
ok(1);
1;

View File

@ -0,0 +1,36 @@
// DESCRIPTION: Verilator: Verilog Test module
//
// This file ONLY is placed under the Creative Commons Public Domain, for
// any use, without warranty, 2022 by Wilson Snyder.
// SPDX-License-Identifier: CC0-1.0
module t(/*AUTOARG*/
// Inputs
clk
);
input clk;
wire [31:0] arr [0:7];
assign arr[0:7] = {
{16'hffff, 16'h0000},
{16'h0000, 16'h0000},
{16'h0a0a, 16'h0000},
{16'ha0a0, 16'h0000},
{16'hffff, 16'h0000},
{16'h0000, 16'h0000},
{16'h0a0a, 16'h0000},
{16'ha0a0, 16'h0000}
};
int cyc = 0;
always @(posedge clk) begin
cyc <= cyc + 1;
if (cyc == 9) begin
if (arr[0] !== 32'hffff0000) $stop;
if (arr[7] !== 32'ha0a00000) $stop;
$write("*-* All Finished *-*\n");
$finish;
end
end
endmodule

View File

@ -13,7 +13,7 @@ top_filename("t/t_const_opt.v");
# Run the same design as t_const_opt.pl without bitopt tree optimization to make sure that the result is same.
compile(
verilator_flags2 => ["-Wno-UNOPTTHREADS", "--stats", "-Oo", "$Self->{t_dir}/t_const_opt.cpp"],
verilator_flags2 => ["-Wno-UNOPTTHREADS", "--stats", "-fno-const-bit-op-tree", "$Self->{t_dir}/t_const_opt.cpp"],
);
execute(

View File

@ -18,5 +18,8 @@ execute(
check_finished => 1,
);
if ($Self->{vlt}) {
file_grep($Self->{stats}, qr/Optimizations, Const bit op reduction\s+(\d+)/i, 11);
}
ok(1);
1;

View File

@ -4,6 +4,11 @@
// any use, without warranty, 2021 Yutetsu TAKATSUKASA.
// SPDX-License-Identifier: CC0-1.0
// This function always returns 0, so safe to take bitwise OR with any value.
// Calling this function stops constant folding as Verialtor does not know
// what this function returns.
import "DPI-C" context function int fake_dependency();
module t(/*AUTOARG*/
// Inputs
clk
@ -57,7 +62,8 @@ module t(/*AUTOARG*/
$write("[%0t] cyc==%0d crc=%x sum=%x\n", $time, cyc, crc, sum);
if (crc !== 64'hc77bb9b3784ea091) $stop;
// What checksum will we end up with (above print should match)
`define EXPECTED_SUM 64'hcae926ece668f35d
`define EXPECTED_SUM 64'hdccb9e7b8b638233
if (sum !== `EXPECTED_SUM) $stop;
$write("*-* All Finished *-*\n");
$finish;
@ -79,10 +85,11 @@ module Test(/*AUTOARG*/
logic d0, d1, d2, d3, d4, d5, d6, d7;
logic bug3182_out;
logic bug3197_out;
logic bug3445_out;
output logic o;
logic [6:0] tmp;
logic [7:0] tmp;
assign o = ^tmp;
always_ff @(posedge clk) begin
@ -105,10 +112,12 @@ module Test(/*AUTOARG*/
tmp[4] <= i[0] & (i[1] & (i[2] & (i[3] | d[4]))); // ConstBitOpTreeVisitor::m_frozenNodes
tmp[5] <= bug3182_out;
tmp[6] <= bug3197_out;
tmp[7] <= bug3445_out;
end
bug3182 i_bug3182(.in(d[4:0]), .out(bug3182_out));
bug3197 i_bug3197(.clk(clk), .in(d), .out(bug3197_out));
bug3445 i_bug3445(.clk(clk), .in(d), .out(bug3445_out));
endmodule
@ -116,11 +125,6 @@ module bug3182(in, out);
input wire [4:0] in;
output wire out;
// This function always returns 0, so safe to take bitwise OR with any value.
// Calling this function stops constant folding as Verialtor does not know
// what this function returns.
import "DPI-C" context function int fake_dependency();
logic [4:0] bit_source;
/* verilator lint_off WIDTH */
@ -140,3 +144,62 @@ module bug3197(input wire clk, input wire [31:0] in, output out);
wire tmp0 = (|d[38:0]);
assign out = (d[39] | tmp0);
endmodule
// Bug #3445
// An unoptimized node is kept as frozen node, but its LSB and polarity were not saved.
// AST of RHS of result0 looks as below:
// AND(SHIFTR(AND(WORDSEL(ARRAYSEL(VARREF)), WORDSEL(ARRAYSEL(VARREF)))), 32'd11)
// ~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~
// Two of WORDSELs are frozen nodes. They are under SHIFTR of 11 bits.
//
// Fixing #3445 needs to
// 1. Take AstShiftR and AstNot into op count when diciding optimizable or not
// (result0 and result2 in the test)
// 2. Insert AstShiftR if LSB of the frozen node is not 0 (result1 in the test)
// 3. Insert AstNot if polarity of the frozen node is false (resutl3 in the
// test)
// Regression test for Verilator issue #3445: exercises frozen-node handling in
// the bit-op-tree optimizer.  The exact expression shapes below are deliberate;
// do not "simplify" them, or the test no longer covers the bug.
module bug3445(input wire clk, input wire [31:0] in, output wire out);
// 128-bit shift register fed by the 32-bit input each cycle; supplies data the
// constant folder cannot evaluate at compile time.
logic [127:0] d;
always_ff @(posedge clk)
d <= {d[95:0], in};
// Packed struct totalling 128 bits (1+3+3+2+8+32+4+32+1+42), matching the
// width of d so st[0] <= d below is a plain 128-bit assignment.  Members g and
// h straddle word boundaries, so the st[n].g[0]/st[n].h[0] selects become the
// WORDSEL(ARRAYSEL(...)) nodes described in the header comment above.
typedef struct packed {
logic a;
logic [ 2:0] b;
logic [ 2:0] c;
logic [ 1:0] d;
logic [ 7:0] e;
logic [31:0] f;
logic [ 3:0] g;
logic [31:0] h;
logic i;
logic [41:0] j;
} packed_struct;
// Four-deep pipeline of struct values; each result below taps a different stage.
packed_struct st[4];
// This is always 1'b0, but Verilator cannot notice it.
// This signal helps to reveal wrong optimization of result2 and result3.
logic zero;
always_ff @(posedge clk) begin
st[0] <= d;
st[1] <= st[0];
st[2] <= st[1];
st[3] <= st[2];
// fake_dependency() is a DPI import (declared elsewhere in this file) that
// always returns 0, so zero stays 1'b0 while defeating constant folding.
zero <= fake_dependency() > 0;
end
logic result0, result1, result2, result3;
always_ff @(posedge clk) begin
// Cannot optimize further.
result0 <= (st[0].g[0] & st[0].h[0]) & (in[0] == 1'b0);
// There are redundant !in[0] terms. They should be simplified.
result1 <= (!in[0] & (st[1].g[0] & st[1].h[0])) & ((in[0] == 1'b0) & !in[0]);
// Cannot optimize further.
result2 <= !(st[2].g[0] & st[2].h[0]) & (zero == 1'b0);
// There are redundant zero terms. They should be simplified.
result3 <= (!zero & !(st[3].g[0] & st[3].h[0])) & ((zero == 1'b0) & !zero);
end
// Combine all results into the single output so none of them is discarded as
// unused and each optimization path is actually checked.
assign out = result0 ^ result1 ^ (result2 | result3);
endmodule

View File

@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
scenarios(simulator => 1);
compile(
verilator_flags2 => ['--Ox'],
verilator_flags2 => ['--fno-expand'],
);
execute(

View File

@ -14,7 +14,7 @@ top_filename("t/t_extract_static_const.v");
golden_filename("t/t_extract_static_const.out");
compile(
verilator_flags2 => ["--stats", "--no-merge-const-pool"],
verilator_flags2 => ["--stats", "--fno-merge-const-pool"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(vlt => 1);
top_filename("t/t_func_twocall.v");
compile(
verilator_flags2 => ['-Ox'],
verilator_flags2 => ['-fno-expand'],
);
execute(

View File

@ -16,7 +16,7 @@ scenarios(simulator => 1);
$Self->{sim_time} = 11000;
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
scenarios(simulator => 1);
compile(
verilator_flags2 => ["--Os -x-assign 0"],
verilator_flags2 => ["--fno-split -x-assign 0"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_inst_slice.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface1_modport.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface1.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface2.v");
compile(
verilator_flags2 => ["--top-module t -Oi"],
verilator_flags2 => ["--top-module t -fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface_array2.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface_array.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface_down.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface_gen10.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface_gen11.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface_gen12.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface_gen2.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface_gen3.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface_gen4.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface_gen5.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface_gen6.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface_gen7.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface_gen8.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface_gen9.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface_gen.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -14,7 +14,7 @@ top_filename("t/t_interface.v");
compile(
# Avoid inlining so we find bugs in the non-inliner connection code
verilator_flags2 => ["-Oi"],
verilator_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface_modport_import.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -14,7 +14,7 @@ top_filename("t/t_interface_modport.v");
compile(
# Avoid inlining so we find bugs in the non-inliner connection code
verilator_flags2 => ["-Oi"],
verilator_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface_modport.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface_mp_func.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface_nest.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(simulator => 1);
top_filename("t/t_interface_twod.v");
compile(
v_flags2 => ["-Oi"],
v_flags2 => ["-fno-inline"],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(linter => 1);
top_filename("t/t_lint_setout_bad.v");
lint(
verilator_flags2 => ["--lint-only -Oi"],
verilator_flags2 => ["--lint-only -fno-inline"],
fails => 1,
expect_filename => $Self->{golden_filename},
);

View File

@ -13,7 +13,7 @@ scenarios(vlt => 1);
top_filename("t/t_math_cond_huge.v");
compile(
verilator_flags2 => ['-Ox'],
verilator_flags2 => ['-fno-expand'],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(vlt => 1);
top_filename("t/t_math_div.v");
compile(
verilator_flags2 => ['-Ox'],
verilator_flags2 => ['-fno-expand'],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(vlt => 1);
top_filename("t/t_math_eq.v");
compile(
verilator_flags2 => ['-Ox'],
verilator_flags2 => ['-fno-expand'],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(vlt => 1);
top_filename("t/t_math_red.v");
compile(
verilator_flags2 => ['-Ox'],
verilator_flags2 => ['-fno-expand'],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(vlt => 1);
top_filename("t/t_math_shift.v");
compile(
verilator_flags2 => ['-Ox'],
verilator_flags2 => ['-fno-expand'],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(vlt => 1);
top_filename("t/t_math_signed.v");
compile(
verilator_flags2 => ['-Ox'],
verilator_flags2 => ['-fno-expand'],
);
execute(

View File

@ -13,7 +13,7 @@ scenarios(vlt => 1);
top_filename("t/t_math_vliw.v");
compile(
verilator_flags2 => ['-Ox'],
verilator_flags2 => ['-fno-expand'],
);
execute(

View File

@ -12,7 +12,7 @@ scenarios(simulator => 1);
compile(
# Disable inlining, this test is trivial without it
verilator_flags2 => ["-Oi --trace"],
verilator_flags2 => ["-fno-inline --trace"],
verilator_flags3 => [],
);

Some files were not shown because too many files have changed in this diff Show More