verilator/include/verilated_trace_imp.h
Geza Lore b51f887567
Perform VCD tracing in parallel when using --threads (#3449)
VCD tracing is now parallelized using the same thread pool as the model.
We achieve this by breaking the top-level trace functions into multiple
top-level functions (as many as --threads). After emitting the time
stamp to the VCD file on the main thread, we execute the tracing
functions in parallel on the same thread pool as the model (which we
pass to the trace file during registration), each tracing into a
secondary per-thread buffer. The main thread then stitches (memcpy) the
buffers together into the output file (see runParallelCallbacks below).

This makes the `--trace-threads` option redundant with `--threads`; it
now only affects `--trace-fst`. FST tracing continues to use the
previous offloading scheme.

This helps VCD tracing performance a lot, and I have seen
better-than-Amdahl speedups, namely 3.9x on XiangShan with 4 threads
(2.7x on OpenTitan with 4 threads).
2022-05-29 19:08:39 +01:00

// -*- mode: C++; c-file-style: "cc-mode" -*-
//=============================================================================
//
// Code available from: https://verilator.org
//
// Copyright 2001-2022 by Wilson Snyder. This program is free software; you
// can redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License
// Version 2.0.
// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
//
//=============================================================================
//
// Verilated tracing implementation code template common to all formats.
// This file is included by the format specific implementations and
// should not be used otherwise.
//
//=============================================================================
// clang-format off
#ifndef VL_CPPCHECK
#if !defined(VL_SUB_T) || !defined(VL_BUF_T)
# error "This file should be included in trace format implementations"
#endif
#include "verilated_intrinsics.h"
#include "verilated_trace.h"
#ifdef VL_TRACE_PARALLEL
# include "verilated_threads.h"
# include <list>
#endif
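// Note: VL_TRACE_PARALLEL runs the trace callbacks themselves in parallel on
// the model's thread pool (used for VCD tracing), while VL_TRACE_OFFLOAD
// serializes value changes into command buffers consumed by a dedicated
// worker thread (used for FST tracing). The format specific implementation
// including this file selects the scheme by defining one of these.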
#if 0
# include <iostream>
# define VL_TRACE_OFFLOAD_DEBUG(msg) std::cout << "TRACE OFFLOAD THREAD: " << msg << std::endl
#else
# define VL_TRACE_OFFLOAD_DEBUG(msg)
#endif
// clang-format on
//=============================================================================
// Static utility functions
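// Parse a timescale string into a multiplier in units of seconds. A few
// examples, following directly from the code below:
//   timescaleToDouble("1ns");  // == 1e-9
//   timescaleToDouble("10 us");  // == 1e-5 (whitespace before the unit is ok)
//   timescaleToDouble("ms");  // == 1e-3 (a missing number defaults to 1)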
static double timescaleToDouble(const char* unitp) {
char* endp = nullptr;
double value = std::strtod(unitp, &endp);
// On strtod error, value == 0.0 and endp == unitp; default the number to 1
// so a bare unit such as "ns" returns 1e-9.
if (value == 0.0 && endp == unitp) value = 1;
unitp = endp;
for (; *unitp && std::isspace(*unitp); unitp++) {}
switch (*unitp) {
case 's': value *= 1e0; break;
case 'm': value *= 1e-3; break;
case 'u': value *= 1e-6; break;
case 'n': value *= 1e-9; break;
case 'p': value *= 1e-12; break;
case 'f': value *= 1e-15; break;
case 'a': value *= 1e-18; break;
}
return value;
}
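// Inverse of timescaleToDouble: render a multiplier in seconds as the nearest
// timescale string, e.g. 1e-9 yields "1ns" and 1e-5 yields "10us".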
static std::string doubleToTimescale(double value) {
const char* suffixp = "s";
// clang-format off
if (value >= 1e0) { suffixp = "s"; value *= 1e0; }
else if (value >= 1e-3 ) { suffixp = "ms"; value *= 1e3; }
else if (value >= 1e-6 ) { suffixp = "us"; value *= 1e6; }
else if (value >= 1e-9 ) { suffixp = "ns"; value *= 1e9; }
else if (value >= 1e-12) { suffixp = "ps"; value *= 1e12; }
else if (value >= 1e-15) { suffixp = "fs"; value *= 1e15; }
else if (value >= 1e-18) { suffixp = "as"; value *= 1e18; }
// clang-format on
char valuestr[100];
VL_SNPRINTF(valuestr, 100, "%0.0f%s", value, suffixp);
return valuestr; // Gets converted to string, so no ref to stack
}
#ifdef VL_TRACE_OFFLOAD
//=========================================================================
// Buffer management
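// Offload buffers are recycled between the main thread and the worker thread
// through a pair of queues: the main thread fills a buffer and enqueues it on
// m_offloadBuffersToWorker, and the worker drains it and hands it back via
// m_offloadBuffersFromWorker. getOffloadBuffer below reuses a returned buffer
// when one is available, and otherwise allocates a new one, up to a fixed cap.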
template <> uint32_t* VerilatedTrace<VL_SUB_T, VL_BUF_T>::getOffloadBuffer() {
uint32_t* bufferp;
// Some jitter between threads is expected, so keep a small number of
// alternative offload buffers in flight, but don't allocate more than 8.
if (m_numOffloadBuffers < 8) {
// Allocate a new buffer if none is available
if (!m_offloadBuffersFromWorker.tryGet(bufferp)) {
++m_numOffloadBuffers;
// Note: over-allocate a bit so pointer comparisons stay well defined
// if we overflow by only a small amount
bufferp = new uint32_t[m_offloadBufferSize + 16];
}
} else {
// Block until a buffer becomes available
bufferp = m_offloadBuffersFromWorker.get();
}
return bufferp;
}
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::waitForOffloadBuffer(const uint32_t* buffp) {
// Slow path code only called on flush/shutdown, so use a simple algorithm.
// Collect buffers from worker and stash them until we get the one we want.
std::deque<uint32_t*> stash;
do { stash.push_back(m_offloadBuffersFromWorker.get()); } while (stash.back() != buffp);
// Now put them back in the queue, in the original order.
while (!stash.empty()) {
m_offloadBuffersFromWorker.put_front(stash.back());
stash.pop_back();
}
}
//=========================================================================
// Worker thread
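// The offload buffer holds a stream of variable-length commands, decoded by
// the loop below. Each command starts with two words:
//   word 0: bits [3:0] hold the VerilatedTraceOffloadCommand, bits [31:4]
//           hold 'top' (the signal width in bits, where applicable)
//   word 1: offset of the signal's entry in the old-value buffer
// followed by zero or more payload words holding the new value. The rare
// TIME_CHANGE and TRACE_BUFFER commands carry no code, so for those word 1
// is instead the start of a 64-bit payload.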
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::offloadWorkerThreadMain() {
bool shutdown = false;
do {
uint32_t* const bufferp = m_offloadBuffersToWorker.get();
VL_TRACE_OFFLOAD_DEBUG("");
VL_TRACE_OFFLOAD_DEBUG("Got buffer: " << bufferp);
const uint32_t* readp = bufferp;
std::unique_ptr<VL_BUF_T> traceBufp; // We own the passed trace buffer
while (true) {
const uint32_t cmd = readp[0];
const uint32_t top = cmd >> 4;
// Always set this up, as it is almost always needed
uint32_t* const oldp = m_sigs_oldvalp + readp[1];
// Note this increment needs to be undone on commands which do not
// actually contain a code, but those are the rare cases.
readp += 2;
switch (cmd & 0xF) {
//===
// CHG_* commands
case VerilatedTraceOffloadCommand::CHG_BIT_0:
VL_TRACE_OFFLOAD_DEBUG("Command CHG_BIT_0 " << top);
traceBufp->chgBitImpl(oldp, 0);
continue;
case VerilatedTraceOffloadCommand::CHG_BIT_1:
VL_TRACE_OFFLOAD_DEBUG("Command CHG_BIT_1 " << top);
traceBufp->chgBitImpl(oldp, 1);
continue;
case VerilatedTraceOffloadCommand::CHG_CDATA:
VL_TRACE_OFFLOAD_DEBUG("Command CHG_CDATA " << top);
// Bits stored in bottom byte of command
traceBufp->chgCDataImpl(oldp, *readp, top);
readp += 1;
continue;
case VerilatedTraceOffloadCommand::CHG_SDATA:
VL_TRACE_OFFLOAD_DEBUG("Command CHG_SDATA " << top);
// Bits stored in bottom byte of command
traceBufp->chgSDataImpl(oldp, *readp, top);
readp += 1;
continue;
case VerilatedTraceOffloadCommand::CHG_IDATA:
VL_TRACE_OFFLOAD_DEBUG("Command CHG_IDATA " << top);
// Bits stored in bottom byte of command
traceBufp->chgIDataImpl(oldp, *readp, top);
readp += 1;
continue;
case VerilatedTraceOffloadCommand::CHG_QDATA:
VL_TRACE_OFFLOAD_DEBUG("Command CHG_QDATA " << top);
// Bits stored in bottom byte of command
traceBufp->chgQDataImpl(oldp, *reinterpret_cast<const QData*>(readp), top);
readp += 2;
continue;
case VerilatedTraceOffloadCommand::CHG_WDATA:
VL_TRACE_OFFLOAD_DEBUG("Command CHG_WDATA " << top);
traceBufp->chgWDataImpl(oldp, readp, top);
readp += VL_WORDS_I(top);
continue;
case VerilatedTraceOffloadCommand::CHG_DOUBLE:
VL_TRACE_OFFLOAD_DEBUG("Command CHG_DOUBLE " << top);
traceBufp->chgDoubleImpl(oldp, *reinterpret_cast<const double*>(readp));
readp += 2;
continue;
//===
// Rare commands
case VerilatedTraceOffloadCommand::TIME_CHANGE:
VL_TRACE_OFFLOAD_DEBUG("Command TIME_CHANGE " << top);
readp -= 1; // No code in this command, undo increment
emitTimeChange(*reinterpret_cast<const uint64_t*>(readp));
readp += 2;
continue;
case VerilatedTraceOffloadCommand::TRACE_BUFFER:
VL_TRACE_OFFLOAD_DEBUG("Command TRACE_BUFFER " << top);
readp -= 1; // No code in this command, undo increment
traceBufp.reset(*reinterpret_cast<VL_BUF_T* const*>(readp));
readp += 2;
continue;
//===
// Commands ending this buffer
case VerilatedTraceOffloadCommand::END: //
VL_TRACE_OFFLOAD_DEBUG("Command END");
break;
case VerilatedTraceOffloadCommand::SHUTDOWN:
VL_TRACE_OFFLOAD_DEBUG("Command SHUTDOWN");
shutdown = true;
break;
//===
// Unknown command
default: { // LCOV_EXCL_START
VL_TRACE_OFFLOAD_DEBUG("Command UNKNOWN " << cmd);
VL_FATAL_MT(__FILE__, __LINE__, "", "Unknown trace command");
break;
} // LCOV_EXCL_STOP
}
// The above switch will execute 'continue' when necessary,
// so if we ever reach here, we are done with the buffer.
break;
}
VL_TRACE_OFFLOAD_DEBUG("Returning buffer");
// Return buffer
m_offloadBuffersFromWorker.put(bufferp);
} while (VL_LIKELY(!shutdown));
}
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::shutdownOffloadWorker() {
// If the worker thread is not running, there is nothing to do
if (!m_workerThread) return;
// Hand a buffer with a shutdown command to the worker thread
uint32_t* const bufferp = getOffloadBuffer();
bufferp[0] = VerilatedTraceOffloadCommand::SHUTDOWN;
m_offloadBuffersToWorker.put(bufferp);
// Wait for it to return
waitForOffloadBuffer(bufferp);
// Join the thread and delete it
m_workerThread->join();
m_workerThread.reset(nullptr);
}
#endif
//=============================================================================
// Life cycle
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::closeBase() {
#ifdef VL_TRACE_OFFLOAD
shutdownOffloadWorker();
while (m_numOffloadBuffers) {
delete[] m_offloadBuffersFromWorker.get();
--m_numOffloadBuffers;
}
#endif
}
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::flushBase() {
#ifdef VL_TRACE_OFFLOAD
// Hand an empty buffer to the worker thread
uint32_t* const bufferp = getOffloadBuffer();
*bufferp = VerilatedTraceOffloadCommand::END;
m_offloadBuffersToWorker.put(bufferp);
// Wait for it to be returned. As the processing is in-order,
// this ensures all previous buffers have been processed.
waitForOffloadBuffer(bufferp);
#endif
}
//=============================================================================
// Callbacks to run on global events
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::onFlush(void* selfp) {
// This calls 'flush' on the derived class (which must then get any mutex)
reinterpret_cast<VL_SUB_T*>(selfp)->flush();
}
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::onExit(void* selfp) {
// This calls 'close' on the derived class (which must then get any mutex)
reinterpret_cast<VL_SUB_T*>(selfp)->close();
}
//=============================================================================
// VerilatedTrace
template <> VerilatedTrace<VL_SUB_T, VL_BUF_T>::VerilatedTrace() {
set_time_unit(Verilated::threadContextp()->timeunitString());
set_time_resolution(Verilated::threadContextp()->timeprecisionString());
}
template <> VerilatedTrace<VL_SUB_T, VL_BUF_T>::~VerilatedTrace() {
if (m_sigs_oldvalp) VL_DO_CLEAR(delete[] m_sigs_oldvalp, m_sigs_oldvalp = nullptr);
if (m_sigs_enabledp) VL_DO_CLEAR(delete[] m_sigs_enabledp, m_sigs_enabledp = nullptr);
Verilated::removeFlushCb(VerilatedTrace<VL_SUB_T, VL_BUF_T>::onFlush, this);
Verilated::removeExitCb(VerilatedTrace<VL_SUB_T, VL_BUF_T>::onExit, this);
#ifdef VL_TRACE_OFFLOAD
closeBase();
#endif
}
//=========================================================================
// Internals available to format specific implementations
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::traceInit() VL_MT_UNSAFE {
// Note: It is possible to re-open a trace file (VCD in particular),
// so we must reset the next code here, but it must have the same number
// of codes on re-open
const uint32_t expectedCodes = nextCode();
m_nextCode = 1;
m_numSignals = 0;
m_maxBits = 0;
m_sigs_enabledVec.clear();
// Call all initialize callbacks, which will:
// - Call decl* for each signal (these eventually call ::declCode)
// - Store the base code
for (uint32_t i = 0; i < m_initCbs.size(); ++i) {
const CallbackRecord& cbr = m_initCbs[i];
cbr.m_initCb(cbr.m_userp, self(), nextCode());
}
if (expectedCodes && nextCode() != expectedCodes) {
VL_FATAL_MT(__FILE__, __LINE__, "",
"Reopening trace file with different number of signals");
}
// Now that we know the number of codes, allocate space for the buffer
// holding previous signal values.
if (!m_sigs_oldvalp) m_sigs_oldvalp = new uint32_t[nextCode()];
// Apply enables
if (m_sigs_enabledp) VL_DO_CLEAR(delete[] m_sigs_enabledp, m_sigs_enabledp = nullptr);
if (!m_sigs_enabledVec.empty()) {
// If m_sigs_enabledVec was empty, m_sigs_enabledp stays nullptr to short
// circuit the tests below. It isn't, so allocate one bit for each code to
// indicate enablement. We don't keep using m_sigs_enabledVec itself, as
// std::vector<bool> is not guaranteed to be fast.
m_sigs_enabledp = new uint32_t[1 + VL_WORDS_I(nextCode())]{0};
// Must 'resize' (not 'reserve'): reading an element beyond size() is
// undefined behavior, and declCode may not have grown the vector to cover
// every code. New elements default to false.
m_sigs_enabledVec.resize(nextCode());
for (size_t code = 0; code < nextCode(); ++code) {
if (m_sigs_enabledVec[code]) {
m_sigs_enabledp[VL_BITWORD_I(code)] |= 1U << VL_BITBIT_I(code);
}
}
m_sigs_enabledVec.clear();
}
// Set callback so flush/abort will flush this file
Verilated::addFlushCb(VerilatedTrace<VL_SUB_T, VL_BUF_T>::onFlush, this);
Verilated::addExitCb(VerilatedTrace<VL_SUB_T, VL_BUF_T>::onExit, this);
#ifdef VL_TRACE_OFFLOAD
// Compute offload buffer size. We need to be able to store a new value for
// each signal, which is 'nextCode()' entries after the init callbacks
// above have been run, plus up to 2 more words of metadata per signal,
// plus fixed overhead of 1 for a termination flag and 3 for a time stamp
// update.
m_offloadBufferSize = nextCode() + numSignals() * 2 + 4;
// Start the worker thread
m_workerThread.reset(
new std::thread{&VerilatedTrace<VL_SUB_T, VL_BUF_T>::offloadWorkerThreadMain, this});
#endif
}
template <>
bool VerilatedTrace<VL_SUB_T, VL_BUF_T>::declCode(uint32_t code, const char* namep, uint32_t bits,
bool tri) {
if (VL_UNCOVERABLE(!code)) {
VL_FATAL_MT(__FILE__, __LINE__, "", "Internal: trace problem, code 0 is illegal");
}
// To keep it simple, this is O(enables * signals), but we expect few enables
std::string declName = namePrefix() + namep;
bool enabled = false;
if (m_dumpvars.empty()) enabled = true;
for (const auto& item : m_dumpvars) {
const int dumpvarsLevel = item.first;
const char* dvp = item.second.c_str();
const char* np = declName.c_str();
while (*dvp && *dvp == *np) {
++dvp;
++np;
}
if (*dvp) continue; // Didn't match dumpvar item
if (*np && *np != ' ') continue; // e.g. "t" isn't a match for "top"
int levels = 0;
while (*np) {
if (*np++ == ' ') ++levels;
}
if (levels > dumpvarsLevel) continue; // Too deep
// We only need to set first code word if it's a multicode signal
// as that's all we'll check for later
if (m_sigs_enabledVec.size() <= code) m_sigs_enabledVec.resize((code + 1024) * 2);
m_sigs_enabledVec[code] = true;
enabled = true;
break;
}
// Note: The tri-state flag is not used by Verilator, but is here for
// compatibility with some foreign code.
int codesNeeded = VL_WORDS_I(bits);
if (tri) codesNeeded *= 2;
m_nextCode = std::max(m_nextCode, code + codesNeeded);
++m_numSignals;
m_maxBits = std::max(m_maxBits, bits);
return enabled;
}
//=========================================================================
// Internals available to format specific implementations
template <> std::string VerilatedTrace<VL_SUB_T, VL_BUF_T>::timeResStr() const {
return doubleToTimescale(m_timeRes);
}
//=========================================================================
// External interface to client code
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::set_time_unit(const char* unitp) VL_MT_SAFE {
m_timeUnit = timescaleToDouble(unitp);
}
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::set_time_unit(const std::string& unit) VL_MT_SAFE {
set_time_unit(unit.c_str());
}
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::set_time_resolution(const char* unitp) VL_MT_SAFE {
m_timeRes = timescaleToDouble(unitp);
}
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::set_time_resolution(const std::string& unit) VL_MT_SAFE {
set_time_resolution(unit.c_str());
}
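// Restrict tracing to part of the hierarchy, analogous to Verilog's
// $dumpvars. A usage sketch (the 'tfp' trace file object and the instance
// names are hypothetical):
//   tfp->dumpvars(1, "top.sub");  // top.sub and signals one level below it
//   tfp->dumpvars(0, "");  // level 0 clears the list: trace everything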
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::dumpvars(int level, const std::string& hier) VL_MT_SAFE {
if (level == 0) {
m_dumpvars.clear(); // empty = everything on
} else {
// Convert Verilog . separators to trace space separators
std::string hierSpaced = hier;
for (auto& i : hierSpaced) {
if (i == '.') i = ' ';
}
m_dumpvars.push_back(std::make_pair(level, hierSpaced));
}
}
#ifdef VL_TRACE_PARALLEL
template <> //
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::parallelWorkerTask(void* datap, bool) {
ParallelWorkerData* const wdp = reinterpret_cast<ParallelWorkerData*>(datap);
// Run the task
wdp->m_cb(wdp->m_userp, wdp->m_bufp);
// Mark buffer as ready
const VerilatedLockGuard lock{wdp->m_mutex};
wdp->m_ready.store(true);
if (wdp->m_waiting) wdp->m_cv.notify_one();
}
template <> VL_ATTR_NOINLINE void VerilatedTrace<VL_SUB_T, VL_BUF_T>::ParallelWorkerData::wait() {
// Spin for a while, waiting for the buffer to become ready
for (int i = 0; i < VL_LOCK_SPINS; ++i) {
if (VL_LIKELY(m_ready.load(std::memory_order_relaxed))) return;
VL_CPU_RELAX();
}
// We have been spinning for a while, so yield the thread
VerilatedLockGuard lock{m_mutex};
m_waiting = true;
m_cv.wait(lock, [this] { return m_ready.load(std::memory_order_relaxed); });
m_waiting = false;
}
#endif
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::runParallelCallbacks(const ParallelCallbackMap& cbMap) {
for (VlThreadPool* threadPoolp : m_threadPoolps) {
#ifdef VL_TRACE_PARALLEL
// If tracing in parallel, dispatch to the thread pool (if one exists)
if (threadPoolp && threadPoolp->numThreads()) {
// List of work items for thread (std::list, as ParallelWorkerData is not movable)
std::list<ParallelWorkerData> workerData;
// We use the whole pool + the main thread
const unsigned threads = threadPoolp->numThreads() + 1;
// Main thread executes all jobs with index % threads == 0
std::vector<ParallelWorkerData*> mainThreadWorkerData;
// The tracing callbacks to execute on this thread-pool
const auto& cbVec = cbMap.at(threadPoolp);
// Enqueue all the jobs
for (unsigned i = 0; i < cbVec.size(); ++i) {
const CallbackRecord& cbr = cbVec[i];
// Always get the trace buffer on the main thread
Buffer* const bufp = getTraceBuffer();
// Create new work item
workerData.emplace_back(cbr.m_dumpCb, cbr.m_userp, bufp);
// Grab the new work item
ParallelWorkerData* const itemp = &workerData.back();
// Enqueue task to thread pool, or main thread
if (unsigned rem = i % threads) {
threadPoolp->workerp(rem - 1)->addTask(parallelWorkerTask, itemp, false);
} else {
mainThreadWorkerData.push_back(itemp);
}
}
// Execute main thread jobs
for (ParallelWorkerData* const itemp : mainThreadWorkerData) {
parallelWorkerTask(itemp, false);
}
// Commit all trace buffers in order
for (ParallelWorkerData& item : workerData) {
// Wait until ready
item.wait();
// Commit the buffer
commitTraceBuffer(item.m_bufp);
}
continue;
}
#endif
// Fall back on sequential execution
for (const CallbackRecord& cbr : cbMap.at(threadPoolp)) {
Buffer* const traceBufferp = getTraceBuffer();
cbr.m_dumpCb(cbr.m_userp, traceBufferp);
commitTraceBuffer(traceBufferp);
}
}
}
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::dump(uint64_t timeui) VL_MT_SAFE_EXCLUDES(m_mutex) {
// Not really VL_MT_SAFE but more VL_MT_UNSAFE_ONE.
// This does get the mutex, but if multiple threads are trying to dump
// chances are the data being dumped will have other problems
const VerilatedLockGuard lock{m_mutex};
if (VL_UNCOVERABLE(m_timeLastDump && timeui <= m_timeLastDump)) { // LCOV_EXCL_START
VL_PRINTF_MT("%%Warning: previous dump at t=%" PRIu64 ", requesting t=%" PRIu64
", dump call ignored\n",
m_timeLastDump, timeui);
return;
} // LCOV_EXCL_STOP
m_timeLastDump = timeui;
Verilated::quiesce();
// Call hook for format specific behaviour
if (VL_UNLIKELY(m_fullDump)) {
if (!preFullDump()) return;
} else {
if (!preChangeDump()) return;
}
#ifdef VL_TRACE_OFFLOAD
// Currently only incremental dumps run on the worker thread
uint32_t* bufferp = nullptr;
if (VL_LIKELY(!m_fullDump)) {
// Get the offload buffer we are about to fill
bufferp = getOffloadBuffer();
m_offloadBufferWritep = bufferp;
m_offloadBufferEndp = bufferp + m_offloadBufferSize;
// Tell worker to update time point
m_offloadBufferWritep[0] = VerilatedTraceOffloadCommand::TIME_CHANGE;
*reinterpret_cast<uint64_t*>(m_offloadBufferWritep + 1) = timeui;
m_offloadBufferWritep += 3;
} else {
// Update time point
flushBase();
emitTimeChange(timeui);
}
#else
// Update time point
emitTimeChange(timeui);
#endif
// Run the callbacks
if (VL_UNLIKELY(m_fullDump)) {
m_fullDump = false; // No more need for next dump to be full
runParallelCallbacks(m_fullCbs);
} else {
runParallelCallbacks(m_chgCbs);
}
for (uint32_t i = 0; i < m_cleanupCbs.size(); ++i) {
const CallbackRecord& cbr = m_cleanupCbs[i];
cbr.m_cleanupCb(cbr.m_userp, self());
}
#ifdef VL_TRACE_OFFLOAD
if (VL_LIKELY(bufferp)) {
// Mark end of the offload buffer we just filled
*m_offloadBufferWritep++ = VerilatedTraceOffloadCommand::END;
// Assert no buffer overflow
assert(m_offloadBufferWritep - bufferp <= m_offloadBufferSize);
// Pass it to the worker thread
m_offloadBuffersToWorker.put(bufferp);
}
#endif
}
//=============================================================================
// Non-hot path internal interface to Verilator generated code
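// Verilator-generated trace setup code registers its callbacks through the
// add*Cb functions below, roughly as follows (a simplified sketch; the
// callback and pointer names are hypothetical, the signatures are real):
//   tracep->addInitCb(&traceInit_0, vlSelfp);
//   tracep->addFullCb(&traceFull_0, vlSelfp, threadPoolp);
//   tracep->addChgCb(&traceChg_0, vlSelfp, threadPoolp);
//   tracep->addCleanupCb(&traceCleanup_0, vlSelfp);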
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addThreadPool(VlThreadPool* threadPoolp)
VL_MT_SAFE_EXCLUDES(m_mutex) {
const VerilatedLockGuard lock{m_mutex};
for (VlThreadPool* const poolp : m_threadPoolps) {
if (poolp == threadPoolp) return;
}
m_threadPoolps.push_back(threadPoolp);
}
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addCallbackRecord(std::vector<CallbackRecord>& cbVec,
CallbackRecord& cbRec)
VL_MT_SAFE_EXCLUDES(m_mutex) {
const VerilatedLockGuard lock{m_mutex};
if (VL_UNCOVERABLE(timeLastDump() != 0)) { // LCOV_EXCL_START
const std::string msg = (std::string{"Internal: "} + __FILE__ + "::" + __FUNCTION__
+ " called with already open file");
VL_FATAL_MT(__FILE__, __LINE__, "", msg.c_str());
} // LCOV_EXCL_STOP
cbVec.push_back(cbRec);
}
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addInitCb(initCb_t cb, void* userp) VL_MT_SAFE {
CallbackRecord cbr{cb, userp};
addCallbackRecord(m_initCbs, cbr);
}
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addFullCb(dumpCb_t cb, void* userp,
VlThreadPool* threadPoolp) VL_MT_SAFE {
CallbackRecord cbr{cb, userp};
addThreadPool(threadPoolp);
addCallbackRecord(m_fullCbs[threadPoolp], cbr);
}
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addChgCb(dumpCb_t cb, void* userp,
VlThreadPool* threadPoolp) VL_MT_SAFE {
CallbackRecord cbr{cb, userp};
addThreadPool(threadPoolp);
addCallbackRecord(m_chgCbs[threadPoolp], cbr);
}
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addCleanupCb(cleanupCb_t cb, void* userp) VL_MT_SAFE {
CallbackRecord cbr{cb, userp};
addCallbackRecord(m_cleanupCbs, cbr);
}
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::pushNamePrefix(const std::string& prefix) {
m_namePrefixStack.push_back(m_namePrefixStack.back() + prefix);
}
template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::popNamePrefix(unsigned count) {
while (count--) m_namePrefixStack.pop_back();
assert(!m_namePrefixStack.empty());
}
//=========================================================================
// Primitives converting binary values to strings...
// All of these take a destination pointer where the string will be emitted,
// and a value to convert. There are a couple of variants for efficiency.
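// For example, cvtCDataToStr(dstp, 0xa5) writes the 8 characters "10100101"
// (MSB first) to dstp; no terminating NUL is written.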
static inline void cvtCDataToStr(char* dstp, CData value) {
#ifdef VL_HAVE_SSE2
// Similar to cvtSDataToStr but only the bottom 8 byte lanes are used
const __m128i a = _mm_cvtsi32_si128(value);
const __m128i b = _mm_unpacklo_epi8(a, a);
const __m128i c = _mm_shufflelo_epi16(b, 0);
const __m128i m = _mm_set1_epi64x(0x0102040810204080);
const __m128i d = _mm_cmpeq_epi8(_mm_and_si128(c, m), m);
const __m128i result = _mm_sub_epi8(_mm_set1_epi8('0'), d);
_mm_storel_epi64(reinterpret_cast<__m128i*>(dstp), result);
#else
dstp[0] = '0' | static_cast<char>((value >> 7) & 1);
dstp[1] = '0' | static_cast<char>((value >> 6) & 1);
dstp[2] = '0' | static_cast<char>((value >> 5) & 1);
dstp[3] = '0' | static_cast<char>((value >> 4) & 1);
dstp[4] = '0' | static_cast<char>((value >> 3) & 1);
dstp[5] = '0' | static_cast<char>((value >> 2) & 1);
dstp[6] = '0' | static_cast<char>((value >> 1) & 1);
dstp[7] = '0' | static_cast<char>(value & 1);
#endif
}
static inline void cvtSDataToStr(char* dstp, SData value) {
#ifdef VL_HAVE_SSE2
// We want each bit in the 16-bit input value to end up in a byte lane
// within the 128-bit XMM register. Note that x86 is little-endian and we
// want the MSB of the input at the low address, so we will bit-reverse
// at the same time.
// Put value in bottom of 128-bit register a[15:0] = value
const __m128i a = _mm_cvtsi32_si128(value);
// Interleave bytes with themselves
// b[15: 0] = {2{a[ 7:0]}} == {2{value[ 7:0]}}
// b[31:16] = {2{a[15:8]}} == {2{value[15:8]}}
const __m128i b = _mm_unpacklo_epi8(a, a);
// Shuffle bottom 64 bits, note swapping high bytes with low bytes
// c[31: 0] = {2{b[31:16]}} == {4{value[15:8]}}
// c[63:32] = {2{b[15: 0]}} == {4{value[ 7:0]}}
const __m128i c = _mm_shufflelo_epi16(b, 0x05);
// Shuffle whole register
// d[ 63: 0] = {2{c[31: 0]}} == {8{value[15:8]}}
// d[127:64] = {2{c[63:32]}} == {8{value[ 7:0]}}
const __m128i d = _mm_shuffle_epi32(c, 0x50);
// Test each bit within the bytes, this sets each byte lane to 0
// if the bit for that lane is 0 and to 0xff if the bit is 1.
const __m128i m = _mm_set1_epi64x(0x0102040810204080);
const __m128i e = _mm_cmpeq_epi8(_mm_and_si128(d, m), m);
// Convert to ASCII by subtracting the masks from ASCII '0':
// '0' - 0 is '0', '0' - -1 is '1'
const __m128i result = _mm_sub_epi8(_mm_set1_epi8('0'), e);
// Store the 16 characters to the un-aligned buffer
_mm_storeu_si128(reinterpret_cast<__m128i*>(dstp), result);
#else
cvtCDataToStr(dstp, value >> 8);
cvtCDataToStr(dstp + 8, value);
#endif
}
static inline void cvtIDataToStr(char* dstp, IData value) {
#ifdef VL_HAVE_AVX2
// Similar to cvtSDataToStr, but the bottom 16 bits are processed in the
// top half of the YMM register
const __m256i a = _mm256_insert_epi32(_mm256_undefined_si256(), value, 0);
const __m256i b = _mm256_permute4x64_epi64(a, 0);
const __m256i s = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3);
const __m256i c = _mm256_shuffle_epi8(b, s);
const __m256i m = _mm256_set1_epi64x(0x0102040810204080);
const __m256i d = _mm256_cmpeq_epi8(_mm256_and_si256(c, m), m);
const __m256i result = _mm256_sub_epi8(_mm256_set1_epi8('0'), d);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dstp), result);
#else
cvtSDataToStr(dstp, value >> 16);
cvtSDataToStr(dstp + 16, value);
#endif
}
static inline void cvtQDataToStr(char* dstp, QData value) {
cvtIDataToStr(dstp, value >> 32);
cvtIDataToStr(dstp + 32, value);
}
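// EData is 32 bits, the same as IData, so the conversion can be shared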
#define cvtEDataToStr cvtIDataToStr
//=========================================================================
// VerilatedTraceBuffer
template <> //
VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::VerilatedTraceBuffer(VL_SUB_T& owner)
: m_owner{owner} {
#ifdef VL_TRACE_OFFLOAD
if (m_offloadBufferWritep) {
using This = VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>*;
// Tack on the buffer address
static_assert(2 * sizeof(uint32_t) >= sizeof(This),
"This should be enough on all platforms");
*m_offloadBufferWritep++ = VerilatedTraceOffloadCommand::TRACE_BUFFER;
*reinterpret_cast<This*>(m_offloadBufferWritep) = this;
m_offloadBufferWritep += 2;
}
#endif
}
// These functions must write the new value back into the old value store,
// and subsequently call the format specific emit* implementations. Note
// that this file must be included in the format specific implementation, so
// the emit* functions can be inlined for performance.
template <> //
void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullBit(uint32_t* oldp, CData newval) {
const uint32_t code = oldp - m_sigs_oldvalp;
*oldp = newval; // Still copy even if not tracing so chg doesn't call full
if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
self()->emitBit(code, newval);
}
template <>
void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullCData(uint32_t* oldp, CData newval, int bits) {
const uint32_t code = oldp - m_sigs_oldvalp;
*oldp = newval; // Still copy even if not tracing so chg doesn't call full
if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
self()->emitCData(code, newval, bits);
}
template <>
void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullSData(uint32_t* oldp, SData newval, int bits) {
const uint32_t code = oldp - m_sigs_oldvalp;
*oldp = newval; // Still copy even if not tracing so chg doesn't call full
if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
self()->emitSData(code, newval, bits);
}
template <>
void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullIData(uint32_t* oldp, IData newval, int bits) {
const uint32_t code = oldp - m_sigs_oldvalp;
*oldp = newval; // Still copy even if not tracing so chg doesn't call full
if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
self()->emitIData(code, newval, bits);
}
template <>
void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullQData(uint32_t* oldp, QData newval, int bits) {
const uint32_t code = oldp - m_sigs_oldvalp;
*reinterpret_cast<QData*>(oldp) = newval;
if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
self()->emitQData(code, newval, bits);
}
template <>
void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullWData(uint32_t* oldp, const WData* newvalp,
int bits) {
const uint32_t code = oldp - m_sigs_oldvalp;
for (int i = 0; i < VL_WORDS_I(bits); ++i) oldp[i] = newvalp[i];
if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
self()->emitWData(code, newvalp, bits);
}
template <>
void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullDouble(uint32_t* oldp, double newval) {
const uint32_t code = oldp - m_sigs_oldvalp;
// cppcheck-suppress invalidPointerCast
*reinterpret_cast<double*>(oldp) = newval;
if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
self()->emitDouble(code, newval);
}
#endif // VL_CPPCHECK