From f4038e36743bfb5d5860293520df781deedfe598 Mon Sep 17 00:00:00 2001
From: Geza Lore <gezalore@gmail.com>
Date: Tue, 12 Jul 2022 11:41:15 +0100
Subject: [PATCH] Move thread pool and execution profiler into the context.
 (#3477)

Fixes #3454
---
 include/verilated.cpp                      |  58 +++++++++
 include/verilated.h                        |  48 +++++++-
 include/verilated_profiler.cpp             |  43 +++++--
 include/verilated_profiler.h               |  17 +--
 include/verilated_threads.cpp              |  54 ++++-----
 include/verilated_threads.h                |  32 ++---
 include/verilated_trace.h                  |  22 ++--
 include/verilated_trace_imp.h              | 133 ++++++++++-----------
 src/V3EmitCHeaders.cpp                     |   6 +-
 src/V3EmitCMake.cpp                        |   2 +-
 src/V3EmitCModel.cpp                       |  31 +++--
 src/V3EmitCSyms.cpp                        |  24 ++--
 src/V3EmitMk.cpp                           |   2 +-
 src/V3Trace.cpp                            |   9 +-
 test_regress/driver.pl                     |   8 +-
 test_regress/t/t_embed1.pl                 |   3 +-
 test_regress/t/t_gantt_two.cpp             |  43 +++++++
 test_regress/t/t_gantt_two.pl              |  61 ++++++++++
 test_regress/t/t_hier_block_cmake/main.cpp |   5 +-
 test_regress/t/t_lib_prot_shared.pl        |   3 +-
 test_regress/t/t_threads_crazy.pl          |  12 +-
 test_regress/t/t_threads_crazy_context.pl  |  36 ++++++
 test_regress/t/t_wrapper_context.cpp       |   2 +
 23 files changed, 470 insertions(+), 184 deletions(-)
 create mode 100644 test_regress/t/t_gantt_two.cpp
 create mode 100755 test_regress/t/t_gantt_two.pl
 create mode 100755 test_regress/t/t_threads_crazy_context.pl

diff --git a/include/verilated.cpp b/include/verilated.cpp
index da8599a42..cf1d76d8f 100644
--- a/include/verilated.cpp
+++ b/include/verilated.cpp
@@ -66,6 +66,10 @@
 #if defined(_WIN32) || defined(__MINGW32__)
 # include <direct.h>  // mkdir
 #endif
+
+#ifdef VL_THREADED
+# include "verilated_threads.h"
+#endif
 // clang-format on
 
 // Max characters in static char string for VL_VALUE_STRING
@@ -2428,6 +2432,33 @@ const char* VerilatedContext::timeprecisionString() const VL_MT_SAFE {
     return vl_time_str(timeprecision());
 }
 
+void VerilatedContext::threads(unsigned n) {
+    if (n == 0) VL_FATAL_MT(__FILE__, __LINE__, "", "%Error: Simulation threads must be >= 1");
+
+    if (m_threadPool) {
+        VL_FATAL_MT(
+            __FILE__, __LINE__, "",
+            "%Error: Cannot set simulation threads after the thread pool has been created.");
+    }
+
+#if VL_THREADED
+    if (m_threads == n) return;  // To avoid unnecessary warnings
+    m_threads = n;
+    const unsigned hardwareThreadsAvailable = std::thread::hardware_concurrency();
+    if (m_threads > hardwareThreadsAvailable) {
+        VL_PRINTF_MT("%%Warning: System has %u hardware threads but simulation thread count set "
+                     "to %u. This will likely cause significant slowdown.\n",
+                     hardwareThreadsAvailable, m_threads);
+    }
+#else
+    if (n > 1) {
+        VL_PRINTF_MT("%%Warning: Verilator run-time library built without VL_THREADS. Ignoring "
+                     "call to 'VerilatedContext::threads' with argument %u.\n",
+                     n);
+    }
+#endif
+}
+
 void VerilatedContext::commandArgs(int argc, const char** argv) VL_MT_SAFE_EXCLUDES(m_argMutex) {
     const VerilatedLockGuard lock{m_argMutex};
     m_args.m_argVec.clear();  // Empty first, then add
@@ -2458,6 +2489,33 @@ void VerilatedContext::internalsDump() const VL_MT_SAFE {
     VerilatedImp::userDump();
 }
 
+void VerilatedContext::addModel(VerilatedModel* modelp) {
+    threadPoolp();  // Ensure thread pool is created, so m_threads cannot change any more
+
+    if (modelp->threads() > m_threads) {
+        std::ostringstream msg;
+        msg << "VerilatedContext has " << m_threads << " threads but model '"
+            << modelp->modelName() << "' (instantiated as '" << modelp->hierName()
+            << "') was Verilated with --threads " << modelp->threads() << ".\n";
+        const std::string str = msg.str();
+        VL_FATAL_MT(__FILE__, __LINE__, modelp->hierName(), str.c_str());
+    }
+}
+
+VerilatedVirtualBase* VerilatedContext::threadPoolp() {
+    if (m_threads == 1) return nullptr;
+#if VL_THREADED
+    if (!m_threadPool) m_threadPool.reset(new VlThreadPool{this, m_threads - 1});
+#endif
+    return m_threadPool.get();
+}
+
+VerilatedVirtualBase*
+VerilatedContext::enableExecutionProfiler(VerilatedVirtualBase* (*construct)(VerilatedContext&)) {
+    if (!m_executionProfiler) m_executionProfiler.reset(construct(*this));
+    return m_executionProfiler.get();
+}
+
 //======================================================================
 // VerilatedContextImp:: Methods - command line
 
diff --git a/include/verilated.h b/include/verilated.h
index f9cf79601..bc1d5a3f2 100644
--- a/include/verilated.h
+++ b/include/verilated.h
@@ -252,6 +252,28 @@ public:
 #endif
 };
 
+//=========================================================================
+/// Base class of a Verilator generated (Verilated) model.
+///
+/// VerilatedModel is a base class of the user facing primary class generated
+/// by Verilator.
+
+class VerilatedModel VL_NOT_FINAL {
+    VL_UNCOPYABLE(VerilatedModel);
+
+protected:
+    explicit VerilatedModel() = default;
+    virtual ~VerilatedModel() = default;
+
+public:
+    /// Returns the hierarchical name of this module instance.
+    virtual const char* hierName() = 0;
+    /// Returns the name of this model (the name of the generated model class).
+    virtual const char* modelName() = 0;
+    /// Returns the thread level parallelism, this model was Verilated with. Always 1 or higher.
+    virtual unsigned threads() = 0;
+};
+
 //=========================================================================
 /// Base class for all Verilated module classes.
 
@@ -266,10 +288,6 @@ public:
     const char* name() const { return m_namep; }  ///< Return name of module
 };
 
-/// Declare a module, ala SC_MODULE
-#define VL_MODULE(modname) class modname VL_NOT_FINAL : public VerilatedModule
-// Not class final in VL_MODULE, as users might be abstracting our models (--hierarchical)
-
 //=========================================================================
 // Functions overridable by user defines
 // (Internals however must use VL_PRINTF_MT, which calls these.)
@@ -362,6 +380,16 @@ protected:
 
     // Implementation details
     const std::unique_ptr<VerilatedContextImpData> m_impdatap;
+    // Number of threads to use for simulation (size of m_threadPool + 1 for main thread)
+#ifdef VL_THREADED
+    unsigned m_threads = std::thread::hardware_concurrency();
+#else
+    const unsigned m_threads = 1;
+#endif
+    // The thread pool shared by all models added to this context
+    std::unique_ptr<VerilatedVirtualBase> m_threadPool;
+    // The execution profiler shared by all models added to this context
+    std::unique_ptr<VerilatedVirtualBase> m_executionProfiler;
     // Coverage access
     std::unique_ptr<VerilatedVirtualBase> m_coveragep;  // Pointer for coveragep()
 
@@ -495,6 +523,12 @@ public:
     /// Get time precision as IEEE-standard text
     const char* timeprecisionString() const VL_MT_SAFE;
 
+    /// Get number of threads used for simulation (including the main thread)
+    unsigned threads() const { return m_threads; }
+    /// Set number of threads used for simulation (including the main thread)
+    /// Can only be called before the thread pool is created (before first model is added).
+    void threads(unsigned n);
+
     /// Allow traces to at some point be enabled (disables some optimizations)
     void traceEverOn(bool flag) VL_MT_SAFE {
         if (flag) calcUnusedSigs(true);
@@ -517,6 +551,12 @@ public:  // But for internal use only
         return reinterpret_cast<const VerilatedContextImp*>(this);
     }
 
+    void addModel(VerilatedModel*);
+
+    VerilatedVirtualBase* threadPoolp();
+    VerilatedVirtualBase*
+    enableExecutionProfiler(VerilatedVirtualBase* (*construct)(VerilatedContext&));
+
     // Internal: $dumpfile
     void dumpfile(const std::string& flag) VL_MT_SAFE_EXCLUDES(m_timeDumpMutex);
     std::string dumpfile() const VL_MT_SAFE_EXCLUDES(m_timeDumpMutex);
diff --git a/include/verilated_profiler.cpp b/include/verilated_profiler.cpp
index 21246827a..d65442f44 100644
--- a/include/verilated_profiler.cpp
+++ b/include/verilated_profiler.cpp
@@ -66,41 +66,66 @@ template <size_t N> static size_t roundUptoMultipleOf(size_t value) {
     return (value + mask) & ~mask;
 }
 
-VlExecutionProfiler::VlExecutionProfiler() {
+VlExecutionProfiler::VlExecutionProfiler(VerilatedContext& context)
+    : m_context{context} {
     // Setup profiling on main thread
     setupThread(0);
 }
 
-void VlExecutionProfiler::configure(const VerilatedContext& context) {
+void VlExecutionProfiler::configure() {
+
     if (VL_UNLIKELY(m_enabled)) {
         --m_windowCount;
-        if (VL_UNLIKELY(m_windowCount == context.profExecWindow())) {
+        if (VL_UNLIKELY(m_windowCount == m_context.profExecWindow())) {
             VL_DEBUG_IF(VL_DBG_MSGF("+ profile start collection\n"););
             clear();  // Clear the profile after the cache warm-up cycles.
             m_tickBegin = VL_CPU_TICK();
         } else if (VL_UNLIKELY(m_windowCount == 0)) {
             const uint64_t tickEnd = VL_CPU_TICK();
             VL_DEBUG_IF(VL_DBG_MSGF("+ profile end\n"););
-            const std::string& fileName = context.profExecFilename();
+            const std::string& fileName = m_context.profExecFilename();
             dump(fileName.c_str(), tickEnd);
             m_enabled = false;
         }
         return;
     }
 
-    const uint64_t startReq = context.profExecStart() + 1;  // + 1, so we can start at time 0
+    const uint64_t startReq = m_context.profExecStart() + 1;  // + 1, so we can start at time 0
 
-    if (VL_UNLIKELY(m_lastStartReq < startReq && VL_TIME_Q() >= context.profExecStart())) {
+    if (VL_UNLIKELY(m_lastStartReq < startReq && VL_TIME_Q() >= m_context.profExecStart())) {
         VL_DEBUG_IF(VL_DBG_MSGF("+ profile start warmup\n"););
         VL_DEBUG_IF(assert(m_windowCount == 0););
         m_enabled = true;
-        m_windowCount = context.profExecWindow() * 2;
+        m_windowCount = m_context.profExecWindow() * 2;
         m_lastStartReq = startReq;
     }
 }
 
-void VlExecutionProfiler::startWorkerSetup(VlExecutionProfiler* profilep, uint32_t threadId) {
-    profilep->setupThread(threadId);
+VerilatedVirtualBase* VlExecutionProfiler::construct(VerilatedContext& context) {
+    VlExecutionProfiler* const selfp = new VlExecutionProfiler{context};
+#if VL_THREADED
+    if (VlThreadPool* const threadPoolp = static_cast<VlThreadPool*>(context.threadPoolp())) {
+        for (int i = 0; i < threadPoolp->numThreads(); ++i) {
+            // Data to pass to worker thread initialization
+            struct Data {
+                VlExecutionProfiler* const selfp;
+                const uint32_t threadId;
+            } data{selfp, static_cast<uint32_t>(i + 1)};
+
+            // Initialize worker thread
+            threadPoolp->workerp(i)->addTask(
+                [](void* userp, bool) {
+                    Data* const datap = static_cast<Data*>(userp);
+                    datap->selfp->setupThread(datap->threadId);
+                },
+                &data);
+
+            // Wait until initializationis complete
+            threadPoolp->workerp(i)->wait();
+        }
+    }
+#endif
+    return selfp;
 }
 
 void VlExecutionProfiler::setupThread(uint32_t threadId) {
diff --git a/include/verilated_profiler.h b/include/verilated_profiler.h
index f85c95528..61f2813d3 100644
--- a/include/verilated_profiler.h
+++ b/include/verilated_profiler.h
@@ -33,13 +33,14 @@
 #include <vector>
 
 class VlExecutionProfiler;
+class VlThreadPool;
 
 //=============================================================================
 // Macros to simplify generated code
 
 #define VL_EXEC_TRACE_ADD_RECORD(vlSymsp) \
-    if (VL_UNLIKELY((vlSymsp)->__Vm_executionProfiler.enabled())) \
-    (vlSymsp)->__Vm_executionProfiler.addRecord()
+    if (VL_UNLIKELY((vlSymsp)->__Vm_executionProfilerp->enabled())) \
+    (vlSymsp)->__Vm_executionProfilerp->addRecord()
 
 //=============================================================================
 // Return high-precision counter for profiling, or 0x0 if not available
@@ -131,7 +132,7 @@ static_assert(std::is_trivially_destructible<VlExecutionRecord>::value,
 //=============================================================================
 // VlExecutionProfiler is for collecting profiling data about model execution
 
-class VlExecutionProfiler final {
+class VlExecutionProfiler final : public VerilatedVirtualBase {
     // CONSTANTS
 
     // In order to try to avoid dynamic memory allocations during the actual profiling phase,
@@ -149,6 +150,7 @@ class VlExecutionProfiler final {
     using ExecutionTrace = std::vector<VlExecutionRecord>;
 
     // STATE
+    VerilatedContext& m_context;  // The context this profiler is under
     static VL_THREAD_LOCAL ExecutionTrace t_trace;  // thread-local trace buffers
     mutable VerilatedMutex m_mutex;
     // Map from thread id to &t_trace of given thread
@@ -162,7 +164,8 @@ class VlExecutionProfiler final {
 
 public:
     // CONSTRUCTOR
-    VlExecutionProfiler();
+    explicit VlExecutionProfiler(VerilatedContext& context);
+    virtual ~VlExecutionProfiler() = default;
 
     // METHODS
 
@@ -174,7 +177,7 @@ public:
         return t_trace.back();
     }
     // Configure profiler (called in beginning of 'eval')
-    void configure(const VerilatedContext&);
+    void configure();
     // Setup profiling on a particular thread;
     void setupThread(uint32_t threadId);
     // Clear all profiling data
@@ -182,8 +185,8 @@ public:
     // Write profiling data into file
     void dump(const char* filenamep, uint64_t tickEnd) VL_MT_SAFE_EXCLUDES(m_mutex);
 
-    // Called via VlStartWorkerCb in VlWorkerThread::startWorker
-    static void startWorkerSetup(VlExecutionProfiler* profilep, uint32_t threadId);
+    // Passed to VerilatedContext to create the VlExecutionProfiler profiler instance
+    static VerilatedVirtualBase* construct(VerilatedContext& context);
 };
 
 //=============================================================================
diff --git a/include/verilated_threads.cpp b/include/verilated_threads.cpp
index a78ea9ae6..6696d738d 100644
--- a/include/verilated_threads.cpp
+++ b/include/verilated_threads.cpp
@@ -47,11 +47,9 @@ VlMTaskVertex::VlMTaskVertex(uint32_t upstreamDepCount)
 //=============================================================================
 // VlWorkerThread
 
-VlWorkerThread::VlWorkerThread(uint32_t threadId, VerilatedContext* contextp,
-                               VlExecutionProfiler* profilerp, VlStartWorkerCb startCb)
+VlWorkerThread::VlWorkerThread(VerilatedContext* contextp)
     : m_ready_size{0}
-    , m_cthread{startWorker, this, threadId, profilerp, startCb}
-    , m_contextp{contextp} {}
+    , m_cthread{startWorker, this, contextp} {}
 
 VlWorkerThread::~VlWorkerThread() {
     shutdown();
@@ -59,47 +57,49 @@ VlWorkerThread::~VlWorkerThread() {
     m_cthread.join();
 }
 
-void VlWorkerThread::shutdownTask(void*, bool) {
+static void shutdownTask(void*, bool) {
     // Deliberately empty, we use the address of this function as a magic number
 }
 
+void VlWorkerThread::shutdown() { addTask(shutdownTask, nullptr); }
+
+void VlWorkerThread::wait() {
+    // Enqueue a task that sets this flag. Execution is in-order so this ensures completion.
+    std::atomic<bool> flag{false};
+    addTask([](void* flagp, bool) { static_cast<std::atomic<bool>*>(flagp)->store(true); }, &flag);
+    // Spin wait
+    for (unsigned i = 0; i < VL_LOCK_SPINS; ++i) {
+        if (flag.load()) return;
+        VL_CPU_RELAX();
+    }
+    // Yield wait
+    while (!flag.load()) std::this_thread::yield();
+}
+
 void VlWorkerThread::workerLoop() {
     ExecRec work;
 
+    // Wait for the first task without spinning, in case the thread is never actually used.
+    dequeWork</* SpinWait: */ false>(&work);
+
     while (true) {
-        dequeWork(&work);
         if (VL_UNLIKELY(work.m_fnp == shutdownTask)) break;
         work.m_fnp(work.m_selfp, work.m_evenCycle);
+        // Wait for next task with spinning.
+        dequeWork</* SpinWait: */ true>(&work);
     }
 }
 
-void VlWorkerThread::startWorker(VlWorkerThread* workerp, uint32_t threadId,
-                                 VlExecutionProfiler* profilerp, VlStartWorkerCb startCb) {
-    Verilated::threadContextp(workerp->m_contextp);
-    if (VL_UNLIKELY(startCb)) startCb(profilerp, threadId);
+void VlWorkerThread::startWorker(VlWorkerThread* workerp, VerilatedContext* contextp) {
+    Verilated::threadContextp(contextp);
     workerp->workerLoop();
 }
 
 //=============================================================================
 // VlThreadPool
 
-VlThreadPool::VlThreadPool(VerilatedContext* contextp, int nThreads,
-                           VlExecutionProfiler* profilerp, VlStartWorkerCb startCb) {
-    // --threads N passes nThreads=N-1, as the "main" threads counts as 1
-    ++nThreads;
-    const unsigned cpus = std::thread::hardware_concurrency();
-    if (cpus < nThreads) {
-        static int warnedOnce = 0;
-        if (!warnedOnce++) {
-            VL_PRINTF_MT("%%Warning: System has %u CPUs but model Verilated with"
-                         " --threads %d; may run slow.\n",
-                         cpus, nThreads);
-        }
-    }
-    // Create worker threads
-    for (uint32_t threadId = 1; threadId < nThreads; ++threadId) {
-        m_workers.push_back(new VlWorkerThread{threadId, contextp, profilerp, startCb});
-    }
+VlThreadPool::VlThreadPool(VerilatedContext* contextp, unsigned nThreads) {
+    for (unsigned i = 0; i < nThreads; ++i) m_workers.push_back(new VlWorkerThread{contextp});
 }
 
 VlThreadPool::~VlThreadPool() {
diff --git a/include/verilated_threads.h b/include/verilated_threads.h
index eeb8f9342..fdb45580e 100644
--- a/include/verilated_threads.h
+++ b/include/verilated_threads.h
@@ -60,9 +60,6 @@ using VlSelfP = void*;
 
 using VlExecFnp = void (*)(VlSelfP, bool);
 
-// VlWorkerThread::startWorker callback, used to hook in VlExecutionProfiler
-using VlStartWorkerCb = void (*)(VlExecutionProfiler*, uint32_t threadId);
-
 // Track dependencies for a single MTask.
 class VlMTaskVertex final {
     // MEMBERS
@@ -166,24 +163,23 @@ private:
     std::atomic<size_t> m_ready_size;
 
     std::thread m_cthread;  // Underlying C++ thread record
-    VerilatedContext* const m_contextp;  // Context for spawned thread
 
     VL_UNCOPYABLE(VlWorkerThread);
 
 public:
     // CONSTRUCTORS
-    explicit VlWorkerThread(uint32_t threadId, VerilatedContext* contextp,
-                            VlExecutionProfiler* profilerp, VlStartWorkerCb startCb);
+    explicit VlWorkerThread(VerilatedContext* contextp);
     ~VlWorkerThread();
 
     // METHODS
+    template <bool SpinWait>  //
     inline void dequeWork(ExecRec* workp) VL_MT_SAFE_EXCLUDES(m_mutex) {
         // Spin for a while, waiting for new data
-        for (int i = 0; i < VL_LOCK_SPINS; ++i) {
-            if (VL_LIKELY(m_ready_size.load(std::memory_order_relaxed))) {  //
-                break;
+        if VL_CONSTEXPR_CXX17 (SpinWait) {
+            for (unsigned i = 0; i < VL_LOCK_SPINS; ++i) {
+                if (VL_LIKELY(m_ready_size.load(std::memory_order_relaxed))) break;
+                VL_CPU_RELAX();
             }
-            VL_CPU_RELAX();
         }
         VerilatedLockGuard lock{m_mutex};
         while (m_ready.empty()) {
@@ -197,7 +193,7 @@ public:
         m_ready.erase(m_ready.begin());
         m_ready_size.fetch_sub(1, std::memory_order_relaxed);
     }
-    inline void addTask(VlExecFnp fnp, VlSelfP selfp, bool evenCycle)
+    inline void addTask(VlExecFnp fnp, VlSelfP selfp, bool evenCycle = false)
         VL_MT_SAFE_EXCLUDES(m_mutex) {
         bool notify;
         {
@@ -209,15 +205,14 @@ public:
         if (notify) m_cv.notify_one();
     }
 
-    inline void shutdown() { addTask(shutdownTask, nullptr, false); }
-    static void shutdownTask(void*, bool);
+    void shutdown();  // Finish current tasks, then terminate thread
+    void wait();  // Blocks calling thread until all tasks complete in this thread
 
     void workerLoop();
-    static void startWorker(VlWorkerThread* workerp, uint32_t threadId,
-                            VlExecutionProfiler* profilerp, VlStartWorkerCb startCb);
+    static void startWorker(VlWorkerThread* workerp, VerilatedContext* contextp);
 };
 
-class VlThreadPool final {
+class VlThreadPool final : public VerilatedVirtualBase {
     // MEMBERS
     std::vector<VlWorkerThread*> m_workers;  // our workers
 
@@ -226,9 +221,8 @@ public:
     // Construct a thread pool with 'nThreads' dedicated threads. The thread
     // pool will create these threads and make them available to execute tasks
     // via this->workerp(index)->addTask(...)
-    VlThreadPool(VerilatedContext* contextp, int nThreads, VlExecutionProfiler* profilerp,
-                 VlStartWorkerCb startCb);
-    ~VlThreadPool();
+    VlThreadPool(VerilatedContext* contextp, unsigned nThreads);
+    virtual ~VlThreadPool();
 
     // METHODS
     inline int numThreads() const { return m_workers.size(); }
diff --git a/include/verilated_trace.h b/include/verilated_trace.h
index 7915c3645..0d0f7c0f6 100644
--- a/include/verilated_trace.h
+++ b/include/verilated_trace.h
@@ -194,8 +194,6 @@ private:
     static void parallelWorkerTask(void*, bool);
 #endif
 
-    using ParallelCallbackMap = std::unordered_map<VlThreadPool*, std::vector<CallbackRecord>>;
-
 protected:
     uint32_t* m_sigs_oldvalp = nullptr;  // Previous value store
     EData* m_sigs_enabledp = nullptr;  // Bit vector of enabled codes (nullptr = all on)
@@ -203,10 +201,10 @@ private:
     uint64_t m_timeLastDump = 0;  // Last time we did a dump
     std::vector<bool> m_sigs_enabledVec;  // Staging for m_sigs_enabledp
     std::vector<CallbackRecord> m_initCbs;  // Routines to initialize tracing
-    ParallelCallbackMap m_fullCbs;  // Routines to perform full dump
-    ParallelCallbackMap m_chgCbs;  // Routines to perform incremental dump
+    std::vector<CallbackRecord> m_fullCbs;  // Routines to perform full dump
+    std::vector<CallbackRecord> m_chgCbs;  // Routines to perform incremental dump
     std::vector<CallbackRecord> m_cleanupCbs;  // Routines to call at the end of dump
-    std::vector<VlThreadPool*> m_threadPoolps;  // All thread pools, in insertion order
+    VerilatedContext* m_contextp = nullptr;  // The context used by the traced models
     bool m_fullDump = true;  // Whether a full dump is required on the next call to 'dump'
     uint32_t m_nextCode = 0;  // Next code number to assign
     uint32_t m_numSignals = 0;  // Number of distinct signals
@@ -217,16 +215,16 @@ private:
     double m_timeRes = 1e-9;  // Time resolution (ns/ms etc)
     double m_timeUnit = 1e-0;  // Time units (ns/ms etc)
 
-    void addThreadPool(VlThreadPool* threadPoolp) VL_MT_SAFE_EXCLUDES(m_mutex);
+    void addContext(VerilatedContext*) VL_MT_SAFE_EXCLUDES(m_mutex);
 
-    void addCallbackRecord(std::vector<CallbackRecord>& cbVec, CallbackRecord& cbRec)
+    void addCallbackRecord(std::vector<CallbackRecord>& cbVec, CallbackRecord&& cbRec)
         VL_MT_SAFE_EXCLUDES(m_mutex);
 
     // Equivalent to 'this' but is of the sub-type 'T_Trace*'. Use 'self()->'
     // to access duck-typed functions to avoid a virtual function call.
     T_Trace* self() { return static_cast<T_Trace*>(this); }
 
-    void runParallelCallbacks(const ParallelCallbackMap& cbMap);
+    void runCallbacks(const std::vector<CallbackRecord>& cbVec);
 
     // Flush any remaining data for this file
     static void onFlush(void* selfp) VL_MT_UNSAFE_ONE;
@@ -341,10 +339,10 @@ public:
     //=========================================================================
     // Non-hot path internal interface to Verilator generated code
 
-    void addInitCb(initCb_t cb, void* userp) VL_MT_SAFE;
-    void addFullCb(dumpCb_t cb, void* userp, VlThreadPool* = nullptr) VL_MT_SAFE;
-    void addChgCb(dumpCb_t cb, void* userp, VlThreadPool* = nullptr) VL_MT_SAFE;
-    void addCleanupCb(cleanupCb_t cb, void* userp) VL_MT_SAFE;
+    void addInitCb(initCb_t cb, void* userp, VerilatedContext*) VL_MT_SAFE;
+    void addFullCb(dumpCb_t cb, void* userp, VerilatedContext*) VL_MT_SAFE;
+    void addChgCb(dumpCb_t cb, void* userp, VerilatedContext*) VL_MT_SAFE;
+    void addCleanupCb(cleanupCb_t cb, void* userp, VerilatedContext*) VL_MT_SAFE;
 
     void scopeEscape(char flag) { m_scopeEscape = flag; }
 
diff --git a/include/verilated_trace_imp.h b/include/verilated_trace_imp.h
index d2ffa965c..a09ac0f43 100644
--- a/include/verilated_trace_imp.h
+++ b/include/verilated_trace_imp.h
@@ -478,55 +478,52 @@ template <> VL_ATTR_NOINLINE void VerilatedTrace<VL_SUB_T, VL_BUF_T>::ParallelWo
 #endif
 
 template <>
-void VerilatedTrace<VL_SUB_T, VL_BUF_T>::runParallelCallbacks(const ParallelCallbackMap& cbMap) {
-    for (VlThreadPool* threadPoolp : m_threadPoolps) {
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::runCallbacks(const std::vector<CallbackRecord>& cbVec) {
 #ifdef VL_TRACE_PARALLEL
-        // If tracing in parallel, dispatch to the thread pool (if exists)
-        if (threadPoolp && threadPoolp->numThreads()) {
-            // List of work items for thread (std::list, as ParallelWorkerData is not movable)
-            std::list<ParallelWorkerData> workerData;
-            // We use the whole pool + the main thread
-            const unsigned threads = threadPoolp->numThreads() + 1;
-            // Main thread executes all jobs with index % threads == 0
-            std::vector<ParallelWorkerData*> mainThreadWorkerData;
-            // The tracing callbacks to execute on this thread-pool
-            const auto& cbVec = cbMap.at(threadPoolp);
-            // Enuque all the jobs
-            for (unsigned i = 0; i < cbVec.size(); ++i) {
-                const CallbackRecord& cbr = cbVec[i];
-                // Always get the trace buffer on the main thread
-                Buffer* const bufp = getTraceBuffer();
-                // Create new work item
-                workerData.emplace_back(cbr.m_dumpCb, cbr.m_userp, bufp);
-                // Grab the new work item
-                ParallelWorkerData* const itemp = &workerData.back();
-                // Enqueue task to thread pool, or main thread
-                if (unsigned rem = i % threads) {
-                    threadPoolp->workerp(rem - 1)->addTask(parallelWorkerTask, itemp, false);
-                } else {
-                    mainThreadWorkerData.push_back(itemp);
-                }
-            }
-            // Execute main thead jobs
-            for (ParallelWorkerData* const itemp : mainThreadWorkerData) {
-                parallelWorkerTask(itemp, false);
-            }
-            // Commit all trace buffers in order
-            for (ParallelWorkerData& item : workerData) {
-                // Wait until ready
-                item.wait();
-                // Commit the buffer
-                commitTraceBuffer(item.m_bufp);
-            }
-            continue;
+    // If tracing in parallel, dispatch to the thread pool
+    VlThreadPool* threadPoolp = static_cast<VlThreadPool*>(m_contextp->threadPoolp());
+    // List of work items for thread (std::list, as ParallelWorkerData is not movable)
+    std::list<ParallelWorkerData> workerData;
+    // We use the whole pool + the main thread
+    const unsigned threads = threadPoolp->numThreads() + 1;
+    // Main thread executes all jobs with index % threads == 0
+    std::vector<ParallelWorkerData*> mainThreadWorkerData;
+    // Enuque all the jobs
+    for (unsigned i = 0; i < cbVec.size(); ++i) {
+        const CallbackRecord& cbr = cbVec[i];
+        // Always get the trace buffer on the main thread
+        Buffer* const bufp = getTraceBuffer();
+        // Create new work item
+        workerData.emplace_back(cbr.m_dumpCb, cbr.m_userp, bufp);
+        // Grab the new work item
+        ParallelWorkerData* const itemp = &workerData.back();
+        // Enqueue task to thread pool, or main thread
+        if (unsigned rem = i % threads) {
+            threadPoolp->workerp(rem - 1)->addTask(parallelWorkerTask, itemp);
+        } else {
+            mainThreadWorkerData.push_back(itemp);
         }
+    }
+    // Execute main thead jobs
+    for (ParallelWorkerData* const itemp : mainThreadWorkerData) {
+        parallelWorkerTask(itemp, false);
+    }
+    // Commit all trace buffers in order
+    for (ParallelWorkerData& item : workerData) {
+        // Wait until ready
+        item.wait();
+        // Commit the buffer
+        commitTraceBuffer(item.m_bufp);
+    }
+
+    // Done
+    return;
 #endif
-        // Fall back on sequential execution
-        for (const CallbackRecord& cbr : cbMap.at(threadPoolp)) {
-            Buffer* const traceBufferp = getTraceBuffer();
-            cbr.m_dumpCb(cbr.m_userp, traceBufferp);
-            commitTraceBuffer(traceBufferp);
-        }
+    // Fall back on sequential execution
+    for (const CallbackRecord& cbr : cbVec) {
+        Buffer* const traceBufferp = getTraceBuffer();
+        cbr.m_dumpCb(cbr.m_userp, traceBufferp);
+        commitTraceBuffer(traceBufferp);
     }
 }
 
@@ -579,9 +576,9 @@ void VerilatedTrace<VL_SUB_T, VL_BUF_T>::dump(uint64_t timeui) VL_MT_SAFE_EXCLUD
     // Run the callbacks
     if (VL_UNLIKELY(m_fullDump)) {
         m_fullDump = false;  // No more need for next dump to be full
-        runParallelCallbacks(m_fullCbs);
+        runCallbacks(m_fullCbs);
     } else {
-        runParallelCallbacks(m_chgCbs);
+        runCallbacks(m_chgCbs);
     }
 
     for (uint32_t i = 0; i < m_cleanupCbs.size(); ++i) {
@@ -607,18 +604,20 @@ void VerilatedTrace<VL_SUB_T, VL_BUF_T>::dump(uint64_t timeui) VL_MT_SAFE_EXCLUD
 // Non-hot path internal interface to Verilator generated code
 
 template <>
-void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addThreadPool(VlThreadPool* threadPoolp)
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addContext(VerilatedContext* contextp)
     VL_MT_SAFE_EXCLUDES(m_mutex) {
     const VerilatedLockGuard lock{m_mutex};
-    for (VlThreadPool* const poolp : m_threadPoolps) {
-        if (poolp == threadPoolp) return;
+    if (m_contextp && contextp != m_contextp) {
+        VL_FATAL_MT(
+            __FILE__, __LINE__, "",
+            "A trace file instance can only handle models from the same simulation context");
     }
-    m_threadPoolps.push_back(threadPoolp);
+    m_contextp = contextp;
 }
 
 template <>
 void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addCallbackRecord(std::vector<CallbackRecord>& cbVec,
-                                                           CallbackRecord& cbRec)
+                                                           CallbackRecord&& cbRec)
     VL_MT_SAFE_EXCLUDES(m_mutex) {
     const VerilatedLockGuard lock{m_mutex};
     if (VL_UNCOVERABLE(timeLastDump() != 0)) {  // LCOV_EXCL_START
@@ -630,28 +629,28 @@ void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addCallbackRecord(std::vector<CallbackR
 }
 
 template <>
-void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addInitCb(initCb_t cb, void* userp) VL_MT_SAFE {
-    CallbackRecord cbr{cb, userp};
-    addCallbackRecord(m_initCbs, cbr);
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addInitCb(initCb_t cb, void* userp,
+                                                   VerilatedContext* contextp) VL_MT_SAFE {
+    addContext(contextp);
+    addCallbackRecord(m_initCbs, CallbackRecord{cb, userp});
 }
 template <>
 void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addFullCb(dumpCb_t cb, void* userp,
-                                                   VlThreadPool* threadPoolp) VL_MT_SAFE {
-    CallbackRecord cbr{cb, userp};
-    addThreadPool(threadPoolp);
-    addCallbackRecord(m_fullCbs[threadPoolp], cbr);
+                                                   VerilatedContext* contextp) VL_MT_SAFE {
+    addContext(contextp);
+    addCallbackRecord(m_fullCbs, CallbackRecord{cb, userp});
 }
 template <>
 void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addChgCb(dumpCb_t cb, void* userp,
-                                                  VlThreadPool* threadPoolp) VL_MT_SAFE {
-    CallbackRecord cbr{cb, userp};
-    addThreadPool(threadPoolp);
-    addCallbackRecord(m_chgCbs[threadPoolp], cbr);
+                                                  VerilatedContext* contextp) VL_MT_SAFE {
+    addContext(contextp);
+    addCallbackRecord(m_chgCbs, CallbackRecord{cb, userp});
 }
 template <>
-void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addCleanupCb(cleanupCb_t cb, void* userp) VL_MT_SAFE {
-    CallbackRecord cbr{cb, userp};
-    addCallbackRecord(m_cleanupCbs, cbr);
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addCleanupCb(cleanupCb_t cb, void* userp,
+                                                      VerilatedContext* contextp) VL_MT_SAFE {
+    addContext(contextp);
+    addCallbackRecord(m_cleanupCbs, CallbackRecord{cb, userp});
 }
 
 template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::pushNamePrefix(const std::string& prefix) {
diff --git a/src/V3EmitCHeaders.cpp b/src/V3EmitCHeaders.cpp
index ef53dfa16..77835212b 100644
--- a/src/V3EmitCHeaders.cpp
+++ b/src/V3EmitCHeaders.cpp
@@ -251,15 +251,15 @@ class EmitCHeader final : public EmitCConstInit {
         emitTextSection(modp, VNType::atScHdr);
 
         // Open class body {{{
+        puts("\nclass ");
+        puts(prefixNameProtect(modp));
         if (const AstClass* const classp = VN_CAST(modp, Class)) {
-            puts("class ");
-            puts(prefixNameProtect(modp));
             if (classp->extendsp()) {
                 puts(" : public ");
                 puts(prefixNameProtect(classp->extendsp()->classp()));
             }
         } else {
-            puts("VL_MODULE(" + prefixNameProtect(modp) + ")");
+            puts(" final : public VerilatedModule");
         }
         puts(" {\n");
         ofp()->resetPrivate();
diff --git a/src/V3EmitCMake.cpp b/src/V3EmitCMake.cpp
index 7df71dfeb..710829eaf 100644
--- a/src/V3EmitCMake.cpp
+++ b/src/V3EmitCMake.cpp
@@ -173,7 +173,7 @@ class CMakeEmitter final {
                                     + ".cpp");
             }
         }
-        if (v3Global.opt.mtasks()) {
+        if (v3Global.opt.threads()) {
             global.emplace_back("${VERILATOR_ROOT}/include/verilated_threads.cpp");
         }
         if (v3Global.opt.usesProfiler()) {
diff --git a/src/V3EmitCModel.cpp b/src/V3EmitCModel.cpp
index e04c79f7e..203582609 100644
--- a/src/V3EmitCModel.cpp
+++ b/src/V3EmitCModel.cpp
@@ -89,11 +89,12 @@ class EmitCModel final : public EmitCFunc {
         puts("\n");
 
         puts("// This class is the main interface to the Verilated model\n");
+        puts("class " + topClassName() + " VL_NOT_FINAL : ");
         if (optSystemC()) {
-            puts("SC_MODULE(" + topClassName() + ") {\n");
-        } else {
-            puts("class " + topClassName() + " VL_NOT_FINAL {\n");
+            // SC_MODULE, but with multiple-inheritance of VerilatedModel
+            puts("public ::sc_core::sc_module, ");
         }
+        puts("public VerilatedModel {\n");
         ofp()->resetPrivate();
         ofp()->putsPrivate(true);  // private:
 
@@ -221,6 +222,11 @@ class EmitCModel final : public EmitCFunc {
                  + topClassName() + "& rhs);\n");
         }
 
+        puts("\n// Abstract methods from VerilatedModel\n");
+        puts("const char* hierName() override;\n");
+        puts("const char* modelName() override;\n");
+        puts("unsigned threads() override;\n");
+
         puts("} VL_ATTR_ALIGNED(VL_CACHE_LINE_BYTES);\n");
 
         ofp()->putsEndGuard();
@@ -235,7 +241,8 @@ class EmitCModel final : public EmitCFunc {
         puts(topClassName() + "::" + topClassName());
         if (optSystemC()) {
             puts("(sc_module_name /* unused */)\n");
-            puts("    : vlSymsp{new " + symClassName() + "(nullptr, name(), this)}\n");
+            puts("    : vlSymsp{new " + symClassName()
+                 + "(Verilated::threadContextp(), name(), this)}\n");
         } else {
             puts(+"(VerilatedContext* _vcontextp__, const char* _vcname__)\n");
             puts("    : vlSymsp{new " + symClassName() + "(_vcontextp__, _vcname__, this)}\n");
@@ -263,6 +270,8 @@ class EmitCModel final : public EmitCFunc {
         puts("    , rootp{&(vlSymsp->TOP)}\n");
 
         puts("{\n");
+        puts("// Register model with the context\n");
+        puts("vlSymsp->_vm_contextp__->addModel(this);\n");
 
         if (optSystemC()) {
             // Create sensitivity list for when to evaluate the model.
@@ -301,7 +310,7 @@ class EmitCModel final : public EmitCFunc {
         if (!optSystemC()) {
             puts("\n");
             puts(topClassName() + "::" + topClassName() + "(const char* _vcname__)\n");
-            puts("    : " + topClassName() + "(nullptr, _vcname__)\n{\n}\n");
+            puts("    : " + topClassName() + "(Verilated::threadContextp(), _vcname__)\n{\n}\n");
         }
     }
 
@@ -428,7 +437,7 @@ class EmitCModel final : public EmitCFunc {
         }
 
         if (v3Global.opt.profExec()) {
-            puts("vlSymsp->__Vm_executionProfiler.configure(*(vlSymsp->_vm_contextp__));\n");
+            puts("vlSymsp->__Vm_executionProfilerp->configure();\n");
             puts("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalBegin();\n");
         }
 
@@ -477,6 +486,13 @@ class EmitCModel final : public EmitCFunc {
         puts("\nVL_ATTR_COLD void " + topClassName() + "::final() {\n");
         puts(/**/ topModNameProtected + "__" + protect("_final") + "(&(vlSymsp->TOP));\n");
         puts("}\n");
+
+        putSectionDelimiter("Implementations of abstract methods from VerilatedModel\n");
+        puts("const char* " + topClassName() + "::hierName() { return vlSymsp->name(); }\n");
+        puts("const char* " + topClassName() + "::modelName() { return \"" + topClassName()
+             + "\"; }\n");
+        puts("unsigned " + topClassName() + "::threads() { return "
+             + cvtToStr(std::max(1, v3Global.opt.threads())) + "; }\n");
     }
 
     void emitTraceMethods(AstNodeModule* modp) {
@@ -529,7 +545,8 @@ class EmitCModel final : public EmitCFunc {
             puts(/**/ "}");
         }
         puts(/**/ "if (false && levels && options) {}  // Prevent unused\n");
-        puts(/**/ "tfp->spTrace()->addInitCb(&" + protect("trace_init") + ", &(vlSymsp->TOP));\n");
+        puts(/**/ "tfp->spTrace()->addInitCb(&" + protect("trace_init")
+             + ", &(vlSymsp->TOP), contextp());\n");
         puts(/**/ topModNameProtected + "__" + protect("trace_register")
              + "(&(vlSymsp->TOP), tfp->spTrace());\n");
 
diff --git a/src/V3EmitCSyms.cpp b/src/V3EmitCSyms.cpp
index c66f346b0..e325aa79e 100644
--- a/src/V3EmitCSyms.cpp
+++ b/src/V3EmitCSyms.cpp
@@ -445,17 +445,17 @@ void EmitCSyms::emitSymHdr() {
     }
     puts("bool __Vm_didInit = false;\n");
 
-    if (v3Global.opt.profExec()) {
-        puts("\n// EXECUTION PROFILING\n");
-        puts("VlExecutionProfiler __Vm_executionProfiler;\n");
-    }
-
     if (v3Global.opt.mtasks()) {
         puts("\n// MULTI-THREADING\n");
         puts("VlThreadPool* const __Vm_threadPoolp;\n");
         puts("bool __Vm_even_cycle = false;\n");
     }
 
+    if (v3Global.opt.profExec()) {
+        puts("\n// EXECUTION PROFILING\n");
+        puts("VlExecutionProfiler* const __Vm_executionProfilerp;\n");
+    }
+
     puts("\n// MODULE INSTANCE STATE\n");
     for (const auto& i : m_scopes) {
         const AstScope* const scopep = i.first;
@@ -673,7 +673,6 @@ void EmitCSyms::emitSymImp() {
         puts("_vm_pgoProfiler.write(\"" + topClassName()
              + "\", _vm_contextp__->profVltFilename());\n");
     }
-    if (v3Global.opt.mtasks()) puts("delete __Vm_threadPoolp;\n");
     puts("}\n\n");
 
     // Constructor
@@ -705,12 +704,13 @@ void EmitCSyms::emitSymImp() {
         // Note we create N-1 threads in the thread pool. The thread
         // that calls eval() becomes the final Nth thread for the
         // duration of the eval call.
-        puts("    , __Vm_threadPoolp{new VlThreadPool{_vm_contextp__, "
-             + cvtToStr(v3Global.opt.threads() - 1) + ", "
-             + (v3Global.opt.profExec()
-                    ? "&__Vm_executionProfiler, &VlExecutionProfiler::startWorkerSetup"
-                    : "nullptr, nullptr")
-             + "}}\n");
+        puts("    , __Vm_threadPoolp{static_cast<VlThreadPool*>(contextp->threadPoolp())}\n");
+    }
+
+    if (v3Global.opt.profExec()) {
+        puts("    , "
+             "__Vm_executionProfilerp{static_cast<VlExecutionProfiler*>(contextp->"
+             "enableExecutionProfiler(&VlExecutionProfiler::construct))}\n");
     }
 
     puts("    // Setup module instances\n");
diff --git a/src/V3EmitMk.cpp b/src/V3EmitMk.cpp
index 3fb3907be..800cf589c 100644
--- a/src/V3EmitMk.cpp
+++ b/src/V3EmitMk.cpp
@@ -116,7 +116,7 @@ public:
                             putMakeClassEntry(of, v3Global.opt.traceSourceLang() + ".cpp");
                         }
                     }
-                    if (v3Global.opt.mtasks()) putMakeClassEntry(of, "verilated_threads.cpp");
+                    if (v3Global.opt.threads()) putMakeClassEntry(of, "verilated_threads.cpp");
                     if (v3Global.opt.usesProfiler()) {
                         putMakeClassEntry(of, "verilated_profiler.cpp");
                     }
diff --git a/src/V3Trace.cpp b/src/V3Trace.cpp
index 9fa1b099a..7113c5e0e 100644
--- a/src/V3Trace.cpp
+++ b/src/V3Trace.cpp
@@ -512,8 +512,10 @@ private:
                 m_regFuncp->addStmtsp(new AstText(flp, "tracep->addChgCb(", true));
             }
             m_regFuncp->addStmtsp(new AstAddrOfCFunc(flp, funcp));
-            const string threadPool{m_parallelism > 1 ? "vlSymsp->__Vm_threadPoolp" : "nullptr"};
-            m_regFuncp->addStmtsp(new AstText(flp, ", vlSelf, " + threadPool + ");\n", true));
+            m_regFuncp->addStmtsp(new AstText(flp, ", vlSelf", true));
+            m_regFuncp->addStmtsp(
+                new AstText(flp, ", vlSelf->vlSymsp->__Vm_modelp->contextp()", true));
+            m_regFuncp->addStmtsp(new AstText(flp, ");\n", true));
         } else {
             // Sub functions
             funcp->argTypes(v3Global.opt.traceClassBase() + "::Buffer* bufp");
@@ -700,7 +702,8 @@ private:
         // Register it
         m_regFuncp->addStmtsp(new AstText(fl, "tracep->addCleanupCb(", true));
         m_regFuncp->addStmtsp(new AstAddrOfCFunc(fl, cleanupFuncp));
-        m_regFuncp->addStmtsp(new AstText(fl, ", vlSelf);\n", true));
+        m_regFuncp->addStmtsp(
+            new AstText(fl, ", vlSelf, vlSelf->vlSymsp->__Vm_modelp->contextp());\n", true));
 
         // Clear global activity flag
         cleanupFuncp->addStmtsp(
diff --git a/test_regress/driver.pl b/test_regress/driver.pl
index cbd9ba9ea..ae0ed4f36 100755
--- a/test_regress/driver.pl
+++ b/test_regress/driver.pl
@@ -579,6 +579,7 @@ sub new {
         sc_time_resolution => "SC_PS",  # Keep - PS is SystemC default
         sim_time => 1100,
         threads => -1,          # --threads (negative means auto based on scenario)
+        context_threads => 0,   # Number of threads to allocate in the context
         benchmark => $opt_benchmark,
         verbose => $opt_verbose,
         run_env => '',
@@ -974,7 +975,11 @@ sub compile {
     $self->oprint("Compile\n") if $self->{verbose};
 
     die "%Error: 'threads =>' argument must be <= 1 for vlt scenario" if $param{vlt} && $param{threads} > 1;
-    $param{threads} = ::calc_threads($Vltmt_threads) if ($param{threads} < 0 && $param{vltmt});
+    # Compute automatic parameter values
+    $param{threads} = ::calc_threads($Vltmt_threads) if $param{threads} < 0 && $param{vltmt};
+    $param{context_threads} = $param{threads} >= 1 ? $param{threads} : 1 if !$param{context_threads};
+    $self->{threads} = $param{threads};
+    $self->{context_threads} = $param{context_threads};
 
     compile_vlt_cmd(%param);
 
@@ -1795,6 +1800,7 @@ sub _make_main {
     }
 
     print $fh "    const std::unique_ptr<VerilatedContext> contextp{new VerilatedContext};\n";
+    print $fh "    contextp->threads($self->{context_threads});\n";
     print $fh "    contextp->commandArgs(argc, argv);\n";
     print $fh "    contextp->debug(" . ($self->{verilated_debug} ? 1 : 0) . ");\n";
     print $fh "    srand48(5);\n";  # Ensure determinism
diff --git a/test_regress/t/t_embed1.pl b/test_regress/t/t_embed1.pl
index 08e4c042c..2c5b8a918 100755
--- a/test_regress/t/t_embed1.pl
+++ b/test_regress/t/t_embed1.pl
@@ -22,7 +22,8 @@ mkdir $child_dir;
         (VM_PREFIX => "$Self->{VM_PREFIX}_child",
          top_filename => "$Self->{name}_child.v",
          verilator_flags => ["-cc", "-Mdir", "${child_dir}", "--debug-check"],
-         threads => $Self->{vltmt} ? $Self->get_default_vltmt_threads() : 0
+         # Can't use multi threading (like hier blocks), but needs to be thread safe
+         threads => $Self->{vltmt} ? 1 : 0,
         );
 
     run(logfile => "${child_dir}/vlt_compile.log",
diff --git a/test_regress/t/t_gantt_two.cpp b/test_regress/t/t_gantt_two.cpp
new file mode 100644
index 000000000..da253fab7
--- /dev/null
+++ b/test_regress/t/t_gantt_two.cpp
@@ -0,0 +1,43 @@
+//
+// DESCRIPTION: Verilator: Verilog Multiple Model Test Module
+//
+// This file ONLY is placed under the Creative Commons Public Domain, for
+// any use, without warranty, 2022 by Geza Lore.
+// SPDX-License-Identifier: CC0-1.0
+//
+
+#include <memory>
+#include "verilated.h"
+#include "Vt_gantt_two.h"
+
+int main(int argc, char** argv, char** env) {
+    srand48(5);
+
+    const std::unique_ptr<VerilatedContext> contextp{new VerilatedContext};
+#ifdef VL_THREADED
+    contextp->threads(2);
+#endif
+    contextp->commandArgs(argc, argv);
+    contextp->debug(0);
+
+    std::unique_ptr<Vt_gantt_two> topap{new Vt_gantt_two{contextp.get(), "topa"}};
+    std::unique_ptr<Vt_gantt_two> topbp{new Vt_gantt_two{contextp.get(), "topb"}};
+
+    topap->clk = false;
+    topap->eval();
+    topbp->clk = false;
+    topbp->eval();
+
+    contextp->timeInc(10);
+    while ((contextp->time() < 1100) && !contextp->gotFinish()) {
+        topap->clk = !topap->clk;
+        topap->eval();
+        topbp->clk = !topbp->clk;
+        topbp->eval();
+        contextp->timeInc(5);
+    }
+    if (!contextp->gotFinish()) {
+        vl_fatal(__FILE__, __LINE__, "main", "%Error: Timeout; never got a $finish");
+    }
+    return 0;
+}
diff --git a/test_regress/t/t_gantt_two.pl b/test_regress/t/t_gantt_two.pl
new file mode 100755
index 000000000..768f55440
--- /dev/null
+++ b/test_regress/t/t_gantt_two.pl
@@ -0,0 +1,61 @@
+#!/usr/bin/env perl
+if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2003 by Wilson Snyder. This program is free software; you
+# can redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+
+# Test for bin/verilator_gantt,
+
+scenarios(vlt_all => 1);
+
+# It doesn't really matter what test
+# we use, so long as it runs several cycles,
+# enough for the profiling to happen:
+top_filename("t/t_gen_alw.v");
+
+compile(
+    make_top_shell => 0,
+    make_main => 0,
+    v_flags2 => ["--prof-exec --exe $Self->{t_dir}/$Self->{name}.cpp"],
+    # Checks below care about thread count, so use 2 (minimum reasonable)
+    threads => $Self->{vltmt} ? 2 : 0,
+    make_flags => 'CPPFLAGS_ADD=-DVL_NO_LEGACY',
+    );
+
+execute(
+    all_run_flags => ["+verilator+prof+exec+start+4",
+                      " +verilator+prof+exec+window+4",
+                      " +verilator+prof+exec+file+$Self->{obj_dir}/profile_exec.dat",
+                      " +verilator+prof+vlt+file+$Self->{obj_dir}/profile.vlt",
+                      ],
+    check_finished => 1,
+    );
+
+# For now, verilator_gantt still reads from STDIN
+#  (probably it should take a file, gantt.dat like verilator_profcfunc)
+# The profiling data still goes direct to the runtime's STDOUT
+#  (maybe that should go to a separate file - gantt.dat?)
+run(cmd => ["$ENV{VERILATOR_ROOT}/bin/verilator_gantt",
+            "$Self->{obj_dir}/profile_exec.dat",
+            "--vcd $Self->{obj_dir}/profile_exec.vcd",
+            "| tee $Self->{obj_dir}/gantt.log"],
+    );
+
+if ($Self->{vltmt}) {
+    file_grep("$Self->{obj_dir}/gantt.log", qr/Total threads += 2/i);
+    file_grep("$Self->{obj_dir}/gantt.log", qr/Total mtasks += 7/i);
+} else {
+    file_grep("$Self->{obj_dir}/gantt.log", qr/Total threads += 1/i);
+    file_grep("$Self->{obj_dir}/gantt.log", qr/Total mtasks += 0/i);
+}
+file_grep("$Self->{obj_dir}/gantt.log", qr/Total evals += 4/i);
+
+# Diff to itself, just to check parsing
+vcd_identical("$Self->{obj_dir}/profile_exec.vcd", "$Self->{obj_dir}/profile_exec.vcd");
+
+ok(1);
+1;
diff --git a/test_regress/t/t_hier_block_cmake/main.cpp b/test_regress/t/t_hier_block_cmake/main.cpp
index e49101162..58bc8d5ad 100644
--- a/test_regress/t/t_hier_block_cmake/main.cpp
+++ b/test_regress/t/t_hier_block_cmake/main.cpp
@@ -14,8 +14,11 @@
 
 int main(int argc, char *argv[]) {
     const std::unique_ptr<VerilatedContext> contextp{new VerilatedContext};
-    std::unique_ptr<Vt_hier_block> top{new Vt_hier_block{contextp.get(), "top"}};
+#if VL_THREADED
+    contextp->threads(6);
+#endif
     contextp->commandArgs(argc, argv);
+    std::unique_ptr<Vt_hier_block> top{new Vt_hier_block{contextp.get(), "top"}};
     for (int i = 0; i < 100 && !contextp->gotFinish(); ++i) {
         top->eval();
         top->clk ^= 1;
diff --git a/test_regress/t/t_lib_prot_shared.pl b/test_regress/t/t_lib_prot_shared.pl
index 1a3f8af5f..cc0c2f977 100755
--- a/test_regress/t/t_lib_prot_shared.pl
+++ b/test_regress/t/t_lib_prot_shared.pl
@@ -59,7 +59,8 @@ while (1) {
                              "-LDFLAGS",
                              "'-Wl,-rpath,$abs_secret_dir -L$abs_secret_dir -l$secret_prefix'"],
         xsim_flags2 => ["$secret_dir/secret.sv"],
-        threads => $Self->{vltmt} ? 1 : 0
+        threads => $Self->{vltmt} ? 1 : 0,
+        context_threads => $Self->{vltmt} ? 6 : 1
         );
 
     execute(
diff --git a/test_regress/t/t_threads_crazy.pl b/test_regress/t/t_threads_crazy.pl
index 6bb21acb0..c72858f2c 100755
--- a/test_regress/t/t_threads_crazy.pl
+++ b/test_regress/t/t_threads_crazy.pl
@@ -10,20 +10,16 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
 
 scenarios(vltmt => 1);
 
-if ($Self->cfg_with_m32) {
-    skip("Does not work with -m32 (resource unavailable)");
-}
-
 compile(
     verilator_flags2 => ['--cc'],
-    threads => 1024
+    threads => 4,
+    context_threads => 2
     );
 
 execute(
-    check_finished => 1,
+    fails => 1
     );
 
-file_grep($Self->{run_log_filename}, qr/System has .* CPUs but.*--threads 1024/);
-
+file_grep($Self->{run_log_filename}, qr/%Error: .*\/verilated\.cpp:\d+: VerilatedContext has 2 threads but model 'Vt_threads_crazy' \(instantiated as 'top'\) was Verilated with --threads 4\./);
 ok(1);
 1;
diff --git a/test_regress/t/t_threads_crazy_context.pl b/test_regress/t/t_threads_crazy_context.pl
new file mode 100755
index 000000000..8e28bb87a
--- /dev/null
+++ b/test_regress/t/t_threads_crazy_context.pl
@@ -0,0 +1,36 @@
+#!/usr/bin/env perl
+if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2003-2009 by Wilson Snyder. This program is free software; you
+# can redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+
+scenarios(vlt_all => 1);
+
+if ($Self->cfg_with_m32) {
+    skip("Does not work with -m32 (resource unavailable)");
+}
+
+top_filename("t/t_threads_crazy.v");
+
+compile(
+    verilator_flags2 => ['--cc'],
+    threads => $Self->{vltmt} ? 2 : 0,
+    context_threads => 1024
+    );
+
+execute(
+    check_finished => 1,
+    );
+
+if ($Self->{vltmt}) {
+    file_grep($Self->{run_log_filename}, qr/System has \d+ hardware threads but simulation thread count set to 1024\. This will likely cause significant slowdown\./);
+} else {
+    file_grep($Self->{run_log_filename}, qr/Verilator run-time library built without VL_THREADS\. Ignoring call to 'VerilatedContext::threads' with argument 1024\./);
+}
+
+ok(1);
+1;
diff --git a/test_regress/t/t_wrapper_context.cpp b/test_regress/t/t_wrapper_context.cpp
index 87332eb8c..31a9334f0 100644
--- a/test_regress/t/t_wrapper_context.cpp
+++ b/test_regress/t/t_wrapper_context.cpp
@@ -92,6 +92,8 @@ int main(int argc, char** argv, char** env) {
     std::unique_ptr<VerilatedContext> context1p{new VerilatedContext};
 
     // configuration
+    context0p->threads(1);
+    context1p->threads(1);
     context0p->fatalOnError(false);
     context1p->fatalOnError(false);
     context0p->traceEverOn(true);