Use SIMD intrinsics to render VCD traces (#2289)

Use SIMD intrinsics to render VCD traces. I have measured 10-40% single threaded performance increase with VCD tracing on SweRV EH1 and lowRISC Ibex using SSE2 intrinsics to render the trace. Also helps a tiny bit with FST, but now almost all of the FST overhead is in the FST library. I have reworked the tracing routines to use more precisely sized arguments. The nice thing about this is that the performance without the intrinsics is pretty much the same as it was before, as we do at most 2x as much work as necessary, but in exchange there are no data dependent branches at all.
2025-01-01 04:07:34 +00:00 · 2020-04-30 00:09:09 +01:00 · 2020-04-30 00:09:09 +01:00 · aa9cde22c8
commit aa9cde22c8
parent faf7255e83
11 changed files with 451 additions and 255 deletions
--- a/include/verilated_fst_c.cpp
+++ b/include/verilated_fst_c.cpp
@ -64,11 +64,13 @@

 VerilatedFst::VerilatedFst(void* fst)
    : m_fst(fst)
-    , m_symbolp(NULL) {}
+    , m_symbolp(NULL)
+    , m_strbuf(NULL) {}

 VerilatedFst::~VerilatedFst() {
    if (m_fst) fstWriterClose(m_fst);
    if (m_symbolp) VL_DO_CLEAR(delete[] m_symbolp, m_symbolp = NULL);
+    if (m_strbuf) VL_DO_CLEAR(delete[] m_strbuf, m_strbuf = NULL);
 }

 void VerilatedFst::open(const char* filename) VL_MT_UNSAFE {
@ -100,6 +102,9 @@ void VerilatedFst::open(const char* filename) VL_MT_UNSAFE {
        }
    }
    m_code2symbol.clear();
+
+    // Allocate string buffer for arrays
+    if (!m_strbuf) { m_strbuf = new char[maxBits() + 32]; }
 }

 void VerilatedFst::close() {
@ -213,25 +218,59 @@ void VerilatedFst::declDouble(vluint32_t code, const char* name, int dtypenum, f
 // so always inline them.

 VL_ATTR_ALWINLINE
-void VerilatedFst::emitBit(vluint32_t code, vluint32_t newval) {
+void VerilatedFst::emitBit(vluint32_t code, CData newval) {
    fstWriterEmitValueChange(m_fst, m_symbolp[code], newval ? "1" : "0");
 }
+
 VL_ATTR_ALWINLINE
-void VerilatedFst::emitBus(vluint32_t code, vluint32_t newval, int bits) {
-    fstWriterEmitValueChange32(m_fst, m_symbolp[code], bits, newval);
+void VerilatedFst::emitCData(vluint32_t code, CData newval, int bits) {
+    char buf[VL_BYTESIZE];
+    cvtCDataToStr(buf, newval << (VL_BYTESIZE - bits));
+    fstWriterEmitValueChange(m_fst, m_symbolp[code], buf);
 }
+
 VL_ATTR_ALWINLINE
-void VerilatedFst::emitQuad(vluint32_t code, vluint64_t newval, int bits) {
-    fstWriterEmitValueChange64(m_fst, m_symbolp[code], bits, newval);
+void VerilatedFst::emitSData(vluint32_t code, SData newval, int bits) {
+    char buf[VL_SHORTSIZE];
+    cvtSDataToStr(buf, newval << (VL_SHORTSIZE - bits));
+    fstWriterEmitValueChange(m_fst, m_symbolp[code], buf);
 }
+
 VL_ATTR_ALWINLINE
-void VerilatedFst::emitArray(vluint32_t code, const vluint32_t* newvalp, int bits) {
-    fstWriterEmitValueChangeVec32(m_fst, m_symbolp[code], bits, newvalp);
+void VerilatedFst::emitIData(vluint32_t code, IData newval, int bits) {
+    char buf[VL_IDATASIZE];
+    cvtIDataToStr(buf, newval << (VL_IDATASIZE - bits));
+    fstWriterEmitValueChange(m_fst, m_symbolp[code], buf);
 }
+
+VL_ATTR_ALWINLINE
+void VerilatedFst::emitQData(vluint32_t code, QData newval, int bits) {
+    char buf[VL_QUADSIZE];
+    cvtQDataToStr(buf, newval << (VL_QUADSIZE - bits));
+    fstWriterEmitValueChange(m_fst, m_symbolp[code], buf);
+}
+
+VL_ATTR_ALWINLINE
+void VerilatedFst::emitWData(vluint32_t code, const WData* newvalp, int bits) {
+    int words = VL_WORDS_I(bits);
+    char* wp = m_strbuf;
+    // Convert the most significant word
+    const int bitsInMSW = VL_BITBIT_E(bits) ? VL_BITBIT_E(bits) : VL_EDATASIZE;
+    cvtEDataToStr(wp, newvalp[--words] << (VL_EDATASIZE - bitsInMSW));
+    wp += bitsInMSW;
+    // Convert the remaining words
+    while (words > 0) {
+        cvtEDataToStr(wp, newvalp[--words]);
+        wp += VL_EDATASIZE;
+    }
+    fstWriterEmitValueChange(m_fst, m_symbolp[code], m_strbuf);
+}
+
 VL_ATTR_ALWINLINE
 void VerilatedFst::emitFloat(vluint32_t code, float newval) {
    fstWriterEmitValueChange(m_fst, m_symbolp[code], &newval);
 }
+
 VL_ATTR_ALWINLINE
 void VerilatedFst::emitDouble(vluint32_t code, double newval) {
    fstWriterEmitValueChange(m_fst, m_symbolp[code], &newval);
--- a/include/verilated_fst_c.h
+++ b/include/verilated_fst_c.h
@ -51,6 +51,8 @@ private:
    Local2FstDtype m_local2fstdtype;
    std::list<std::string> m_curScope;
    fstHandle* m_symbolp;  ///< same as m_code2symbol, but as an array
+    char* m_strbuf;  ///< String buffer long enough to hold maxBits() chars
+
    // CONSTRUCTORS
    VL_UNCOPYABLE(VerilatedFst);
    void declSymbol(vluint32_t code, const char* name, int dtypenum, fstVarDir vardir,
@ -69,10 +71,12 @@ protected:

    // Implementations of duck-typed methods for VerilatedTrace. These are
    // called from only one place (namely full*) so always inline them.
-    inline void emitBit(vluint32_t code, vluint32_t newval);
-    inline void emitBus(vluint32_t code, vluint32_t newval, int bits);
-    inline void emitQuad(vluint32_t code, vluint64_t newval, int bits);
-    inline void emitArray(vluint32_t code, const vluint32_t* newvalp, int bits);
+    inline void emitBit(vluint32_t code, CData newval);
+    inline void emitCData(vluint32_t code, CData newval, int bits);
+    inline void emitSData(vluint32_t code, SData newval, int bits);
+    inline void emitIData(vluint32_t code, IData newval, int bits);
+    inline void emitQData(vluint32_t code, QData newval, int bits);
+    inline void emitWData(vluint32_t code, const WData* newvalp, int bits);
    inline void emitFloat(vluint32_t code, float newval);
    inline void emitDouble(vluint32_t code, double newval);

--- a/include/verilated_intrinsics.h
+++ b/include/verilated_intrinsics.h
@ -0,0 +1,41 @@
+// -*- mode: C++; c-file-style: "cc-mode" -*-
+//*************************************************************************
+//
+// Copyright 2003-2020 by Wilson Snyder. This program is free software; you can
+// redistribute it and/or modify it under the terms of either the GNU
+// Lesser General Public License Version 3 or the Perl Artistic License
+// Version 2.0.
+// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+//
+//*************************************************************************
+///
+/// \file
+/// \brief Verilator: Common include for target specific intrinsics.
+///
+///     Code using machine specific intrinsics for optimization should
+///     include this header rather than directly including he target
+///     specific headers. We provide macros to check for availability
+///     of instruction sets, and a common mechanism to disable them.
+///
+//*************************************************************************
+
+#ifndef _VERILATED_INTRINSICS_H_
+#define _VERILATED_INTRINSICS_H_ 1  ///< Header Guard
+
+// clang-format off
+
+// Use VL_DISABLE_INTRINSICS to disable all intrinsics based optimization
+#if !defined(VL_DISABLE_INTRINSICS) && !defined(VL_PORTABLE_ONLY)
+# if defined(__SSE2__) && !defined(VL_DISABLE_SSE2)
+#  define VL_HAVE_SSE2 1
+#  include <emmintrin.h>
+# endif
+# if defined(__AVX2__) && defined(VL_HAVE_SSE2) && !defined(VL_DISABLE_AVX2)
+#  define VL_HAVE_AVX2 1
+#  include <immintrin.h>
+# endif
+#endif
+
+// clang-format on
+
+#endif  // Guard
--- a/include/verilated_trace.h
+++ b/include/verilated_trace.h
@ -90,11 +90,13 @@ public:
    enum {
        CHG_BIT_0 = 0x0,
        CHG_BIT_1 = 0x1,
-        CHG_BUS = 0x2,
-        CHG_QUAD = 0x3,
-        CHG_ARRAY = 0x4,
-        CHG_FLOAT = 0x5,
-        CHG_DOUBLE = 0x6,
+        CHG_CDATA = 0x2,
+        CHG_SDATA = 0x3,
+        CHG_IDATA = 0x4,
+        CHG_QDATA = 0x5,
+        CHG_WDATA = 0x6,
+        CHG_FLOAT = 0x7,
+        CHG_DOUBLE = 0x8,
        // TODO: full..
        TIME_CHANGE = 0xd,
        END = 0xe,  // End of buffer
@ -122,6 +124,7 @@ private:
    bool m_fullDump;  ///< Whether a full dump is required on the next call to 'dump'
    vluint32_t m_nextCode;  ///< Next code number to assign
    vluint32_t m_numSignals;  ///< Number of distinct signals
+    vluint32_t m_maxBits;  ///< Number of bits in the widest signal
    std::string m_moduleName;  ///< Name of module being trace initialized now
    char m_scopeEscape;
    double m_timeRes;  ///< Time resolution (ns/ms etc)
@ -176,6 +179,7 @@ protected:

    vluint32_t nextCode() const { return m_nextCode; }
    vluint32_t numSignals() const { return m_numSignals; }
+    vluint32_t maxBits() const { return m_maxBits; }
    const std::string& moduleName() const { return m_moduleName; }
    void fullDump(bool value) { m_fullDump = value; }
    vluint64_t timeLastDump() { return m_timeLastDump; }
@ -251,47 +255,65 @@ public:
    // these here, but we cannot afford dynamic dispatch for calling these as
    // this is very hot code during tracing.

-    // duck-typed void emitBit(vluint32_t code, vluint32_t newval) = 0;
-    // duck-typed void emitBus(vluint32_t code, vluint32_t newval, int bits) = 0;
-    // duck-typed void emitQuad(vluint32_t code, vluint64_t newval, int bits) = 0;
-    // duck-typed void emitArray(vluint32_t code, const vluint32_t* newvalp, int bits) = 0;
+    // duck-typed void emitBit(vluint32_t code, CData newval) = 0;
+    // duck-typed void emitCData(vluint32_t code, CData newval, int bits) = 0;
+    // duck-typed void emitSData(vluint32_t code, SData newval, int bits) = 0;
+    // duck-typed void emitIData(vluint32_t code, IData newval, int bits) = 0;
+    // duck-typed void emitQData(vluint32_t code, QData newval, int bits) = 0;
+    // duck-typed void emitWData(vluint32_t code, const WData* newvalp, int bits) = 0;
    // duck-typed void emitFloat(vluint32_t code, float newval) = 0;
    // duck-typed void emitDouble(vluint32_t code, double newval) = 0;

    vluint32_t* oldp(vluint32_t code) { return m_sigs_oldvalp + code; }

    // Write to previous value buffer value and emit trace entry.
-    void fullBit(vluint32_t* oldp, vluint32_t newval);
-    void fullBus(vluint32_t* oldp, vluint32_t newval, int bits);
-    void fullQuad(vluint32_t* oldp, vluint64_t newval, int bits);
-    void fullArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits);
+    void fullBit(vluint32_t* oldp, CData newval);
+    void fullCData(vluint32_t* oldp, CData newval, int bits);
+    void fullSData(vluint32_t* oldp, SData newval, int bits);
+    void fullIData(vluint32_t* oldp, IData newval, int bits);
+    void fullQData(vluint32_t* oldp, QData newval, int bits);
+    void fullWData(vluint32_t* oldp, const WData* newvalp, int bits);
    void fullFloat(vluint32_t* oldp, float newval);
    void fullDouble(vluint32_t* oldp, double newval);

 #ifdef VL_TRACE_THREADED
    // Threaded tracing. Just dump everything in the trace buffer
-    inline void chgBit(vluint32_t code, vluint32_t newval) {
+    inline void chgBit(vluint32_t code, CData newval) {
        m_traceBufferWritep[0] = VerilatedTraceCommand::CHG_BIT_0 | newval;
        m_traceBufferWritep[1] = code;
        m_traceBufferWritep += 2;
        VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
    }
-    inline void chgBus(vluint32_t code, vluint32_t newval, int bits) {
-        m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_BUS;
+    inline void chgCData(vluint32_t code, CData newval, int bits) {
+        m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_CDATA;
        m_traceBufferWritep[1] = code;
        m_traceBufferWritep[2] = newval;
        m_traceBufferWritep += 3;
        VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
    }
-    inline void chgQuad(vluint32_t code, vluint64_t newval, int bits) {
-        m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_QUAD;
+    inline void chgSData(vluint32_t code, SData newval, int bits) {
+        m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_SDATA;
        m_traceBufferWritep[1] = code;
-        *reinterpret_cast<vluint64_t*>(m_traceBufferWritep + 2) = newval;
+        m_traceBufferWritep[2] = newval;
+        m_traceBufferWritep += 3;
+        VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
+    }
+    inline void chgIData(vluint32_t code, IData newval, int bits) {
+        m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_IDATA;
+        m_traceBufferWritep[1] = code;
+        m_traceBufferWritep[2] = newval;
+        m_traceBufferWritep += 3;
+        VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
+    }
+    inline void chgQData(vluint32_t code, QData newval, int bits) {
+        m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_QDATA;
+        m_traceBufferWritep[1] = code;
+        *reinterpret_cast<QData*>(m_traceBufferWritep + 2) = newval;
        m_traceBufferWritep += 4;
        VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
    }
-    inline void chgArray(vluint32_t code, const vluint32_t* newvalp, int bits) {
-        m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_ARRAY;
+    inline void chgWData(vluint32_t code, const WData* newvalp, int bits) {
+        m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_WDATA;
        m_traceBufferWritep[1] = code;
        m_traceBufferWritep += 2;
        for (int i = 0; i < (bits + 31) / 32; ++i) { *m_traceBufferWritep++ = newvalp[i]; }
@ -324,22 +346,30 @@ public:
    // thread and are called chg*Impl

    // Check previous dumped value of signal. If changed, then emit trace entry
-    inline void CHG(Bit)(vluint32_t* oldp, vluint32_t newval) {
+    inline void CHG(Bit)(vluint32_t* oldp, CData newval) {
        const vluint32_t diff = *oldp ^ newval;
        if (VL_UNLIKELY(diff)) fullBit(oldp, newval);
    }
-    inline void CHG(Bus)(vluint32_t* oldp, vluint32_t newval, int bits) {
+    inline void CHG(CData)(vluint32_t* oldp, CData newval, int bits) {
        const vluint32_t diff = *oldp ^ newval;
-        if (VL_UNLIKELY(diff)) fullBus(oldp, newval, bits);
+        if (VL_UNLIKELY(diff)) fullCData(oldp, newval, bits);
    }
-    inline void CHG(Quad)(vluint32_t* oldp, vluint64_t newval, int bits) {
-        const vluint64_t diff = *reinterpret_cast<vluint64_t*>(oldp) ^ newval;
-        if (VL_UNLIKELY(diff)) fullQuad(oldp, newval, bits);
+    inline void CHG(SData)(vluint32_t* oldp, SData newval, int bits) {
+        const vluint32_t diff = *oldp ^ newval;
+        if (VL_UNLIKELY(diff)) fullSData(oldp, newval, bits);
    }
-    inline void CHG(Array)(vluint32_t* oldp, const vluint32_t* newvalp, int bits) {
+    inline void CHG(IData)(vluint32_t* oldp, IData newval, int bits) {
+        const vluint32_t diff = *oldp ^ newval;
+        if (VL_UNLIKELY(diff)) fullIData(oldp, newval, bits);
+    }
+    inline void CHG(QData)(vluint32_t* oldp, QData newval, int bits) {
+        const vluint64_t diff = *reinterpret_cast<QData*>(oldp) ^ newval;
+        if (VL_UNLIKELY(diff)) fullQData(oldp, newval, bits);
+    }
+    inline void CHG(WData)(vluint32_t* oldp, const WData* newvalp, int bits) {
        for (int i = 0; i < (bits + 31) / 32; ++i) {
            if (VL_UNLIKELY(oldp[i] ^ newvalp[i])) {
-                fullArray(oldp, newvalp, bits);
+                fullWData(oldp, newvalp, bits);
                return;
            }
        }
--- a/include/verilated_trace_imp.cpp
+++ b/include/verilated_trace_imp.cpp
@ -23,6 +23,7 @@
 # error "This file should be included in trace format implementations"
 #endif

+#include "verilated_intrinsics.h"
 #include "verilated_trace.h"

 #if 0
@ -166,22 +167,34 @@ template <> void VerilatedTrace<VL_DERIVED_T>::workerThreadMain() {
                VL_TRACE_THREAD_DEBUG("Command CHG_BIT_1 " << top);
                chgBitImpl(oldp, 1);
                continue;
-            case VerilatedTraceCommand::CHG_BUS:
-                VL_TRACE_THREAD_DEBUG("Command CHG_BUS " << top);
+            case VerilatedTraceCommand::CHG_CDATA:
+                VL_TRACE_THREAD_DEBUG("Command CHG_CDATA " << top);
                // Bits stored in bottom byte of command
-                chgBusImpl(oldp, *readp, top);
+                chgCDataImpl(oldp, *readp, top);
                readp += 1;
                continue;
-            case VerilatedTraceCommand::CHG_QUAD:
-                VL_TRACE_THREAD_DEBUG("Command CHG_QUAD " << top);
+            case VerilatedTraceCommand::CHG_SDATA:
+                VL_TRACE_THREAD_DEBUG("Command CHG_SDATA " << top);
                // Bits stored in bottom byte of command
-                chgQuadImpl(oldp, *reinterpret_cast<const vluint64_t*>(readp), top);
+                chgSDataImpl(oldp, *readp, top);
+                readp += 1;
+                continue;
+            case VerilatedTraceCommand::CHG_IDATA:
+                VL_TRACE_THREAD_DEBUG("Command CHG_IDATA " << top);
+                // Bits stored in bottom byte of command
+                chgIDataImpl(oldp, *readp, top);
+                readp += 1;
+                continue;
+            case VerilatedTraceCommand::CHG_QDATA:
+                VL_TRACE_THREAD_DEBUG("Command CHG_QDATA " << top);
+                // Bits stored in bottom byte of command
+                chgQDataImpl(oldp, *reinterpret_cast<const QData*>(readp), top);
                readp += 2;
                continue;
-            case VerilatedTraceCommand::CHG_ARRAY:
-                VL_TRACE_THREAD_DEBUG("Command CHG_ARRAY " << top);
-                chgArrayImpl(oldp, readp, top);
-                readp += (top + 31) / 32;
+            case VerilatedTraceCommand::CHG_WDATA:
+                VL_TRACE_THREAD_DEBUG("Command CHG_WDATA " << top);
+                chgWDataImpl(oldp, readp, top);
+                readp += VL_WORDS_I(top);
                continue;
            case VerilatedTraceCommand::CHG_FLOAT:
                VL_TRACE_THREAD_DEBUG("Command CHG_FLOAT " << top);
@ -284,6 +297,7 @@ VerilatedTrace<VL_DERIVED_T>::VerilatedTrace()
    , m_fullDump(true)
    , m_nextCode(0)
    , m_numSignals(0)
+    , m_maxBits(0)
    , m_scopeEscape('.')
    , m_timeRes(1e-9)
    , m_timeUnit(1e-9)
@ -318,6 +332,7 @@ template <> void VerilatedTrace<VL_DERIVED_T>::traceInit() VL_MT_UNSAFE {
    const vluint32_t expectedCodes = nextCode();
    m_nextCode = 1;
    m_numSignals = 0;
+    m_maxBits = 0;

    // Call all initialize callbacks, which will call decl* for each signal.
    for (vluint32_t ent = 0; ent < m_callbacks.size(); ++ent) {
@ -355,10 +370,11 @@ void VerilatedTrace<VL_DERIVED_T>::declCode(vluint32_t code, vluint32_t bits, bo
    }
    // Note: The tri-state flag is not used by Verilator, but is here for
    // compatibility with some foreign code.
-    int codesNeeded = (bits + 31) / 32;
+    int codesNeeded = VL_WORDS_I(bits);
    if (tri) codesNeeded *= 2;
    m_nextCode = std::max(m_nextCode, code + codesNeeded);
    ++m_numSignals;
+    m_maxBits = std::max(m_maxBits, bits);
 }

 //=========================================================================
@ -486,35 +502,139 @@ void VerilatedTrace<VL_DERIVED_T>::addCallback(callback_t initcb, callback_t ful
 // that this file must be included in the format specific implementation, so
 // the emit* functions can be inlined for performance.

-template <> void VerilatedTrace<VL_DERIVED_T>::fullBit(vluint32_t* oldp, vluint32_t newval) {
+template <> void VerilatedTrace<VL_DERIVED_T>::fullBit(vluint32_t* oldp, CData newval) {
    *oldp = newval;
    self()->emitBit(oldp - m_sigs_oldvalp, newval);
 }

 template <>
-void VerilatedTrace<VL_DERIVED_T>::fullBus(vluint32_t* oldp, vluint32_t newval, int bits) {
+void VerilatedTrace<VL_DERIVED_T>::fullCData(vluint32_t* oldp, CData newval, int bits) {
    *oldp = newval;
-    self()->emitBus(oldp - m_sigs_oldvalp, newval, bits);
+    self()->emitCData(oldp - m_sigs_oldvalp, newval, bits);
 }

 template <>
-void VerilatedTrace<VL_DERIVED_T>::fullQuad(vluint32_t* oldp, vluint64_t newval, int bits) {
-    *reinterpret_cast<vluint64_t*>(oldp) = newval;
-    self()->emitQuad(oldp - m_sigs_oldvalp, newval, bits);
+void VerilatedTrace<VL_DERIVED_T>::fullSData(vluint32_t* oldp, SData newval, int bits) {
+    *oldp = newval;
+    self()->emitSData(oldp - m_sigs_oldvalp, newval, bits);
 }
+
 template <>
-void VerilatedTrace<VL_DERIVED_T>::fullArray(vluint32_t* oldp, const vluint32_t* newvalp,
-                                             int bits) {
-    for (int i = 0; i < (bits + 31) / 32; ++i) oldp[i] = newvalp[i];
-    self()->emitArray(oldp - m_sigs_oldvalp, newvalp, bits);
+void VerilatedTrace<VL_DERIVED_T>::fullIData(vluint32_t* oldp, IData newval, int bits) {
+    *oldp = newval;
+    self()->emitIData(oldp - m_sigs_oldvalp, newval, bits);
 }
+
+template <>
+void VerilatedTrace<VL_DERIVED_T>::fullQData(vluint32_t* oldp, QData newval, int bits) {
+    *reinterpret_cast<QData*>(oldp) = newval;
+    self()->emitQData(oldp - m_sigs_oldvalp, newval, bits);
+}
+
+template <>
+void VerilatedTrace<VL_DERIVED_T>::fullWData(vluint32_t* oldp, const WData* newvalp, int bits) {
+    for (int i = 0; i < VL_WORDS_I(bits); ++i) oldp[i] = newvalp[i];
+    self()->emitWData(oldp - m_sigs_oldvalp, newvalp, bits);
+}
+
 template <> void VerilatedTrace<VL_DERIVED_T>::fullFloat(vluint32_t* oldp, float newval) {
    // cppcheck-suppress invalidPointerCast
    *reinterpret_cast<float*>(oldp) = newval;
    self()->emitFloat(oldp - m_sigs_oldvalp, newval);
 }
+
 template <> void VerilatedTrace<VL_DERIVED_T>::fullDouble(vluint32_t* oldp, double newval) {
    // cppcheck-suppress invalidPointerCast
    *reinterpret_cast<double*>(oldp) = newval;
    self()->emitDouble(oldp - m_sigs_oldvalp, newval);
 }
+
+//=========================================================================
+// Primitives converting binary values to strings...
+
+// All of these take a destination pointer where the string will be emitted,
+// and a value to convert. There are a couple of variants for efficiency.
+
+inline static void cvtCDataToStr(char* dstp, CData value) {
+#ifdef VL_HAVE_SSE2
+    // Similar to cvtSDataToStr but only the bottom 8 byte lanes are used
+    const __m128i a = _mm_cvtsi32_si128(value);
+    const __m128i b = _mm_unpacklo_epi8(a, a);
+    const __m128i c = _mm_shufflelo_epi16(b, 0);
+    const __m128i m = _mm_set1_epi64x(0x0102040810204080);
+    const __m128i d = _mm_cmpeq_epi8(_mm_and_si128(c, m), m);
+    const __m128i result = _mm_sub_epi8(_mm_set1_epi8('0'), d);
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(dstp), result);
+#else
+    dstp[0] = '0' | static_cast<char>((value >> 7) & 1);
+    dstp[1] = '0' | static_cast<char>((value >> 6) & 1);
+    dstp[2] = '0' | static_cast<char>((value >> 5) & 1);
+    dstp[3] = '0' | static_cast<char>((value >> 4) & 1);
+    dstp[4] = '0' | static_cast<char>((value >> 3) & 1);
+    dstp[5] = '0' | static_cast<char>((value >> 2) & 1);
+    dstp[6] = '0' | static_cast<char>((value >> 1) & 1);
+    dstp[7] = '0' | static_cast<char>(value & 1);
+#endif
+}
+
+inline static void cvtSDataToStr(char* dstp, SData value) {
+#ifdef VL_HAVE_SSE2
+    // We want each bit in the 16-bit input value to end up in a byte lane
+    // within the 128-bit XMM register. Note that x86 is little-endian and we
+    // want the MSB of the input at the low address, so we will bit-reverse
+    // at the same time.
+
+    // Put value in bottom of 128-bit register a[15:0] = value
+    const __m128i a = _mm_cvtsi32_si128(value);
+    // Interleave bytes with themselves
+    // b[15: 0] = {2{a[ 7:0]}} == {2{value[ 7:0]}}
+    // b[31:16] = {2{a[15:8]}} == {2{value[15:8]}}
+    const __m128i b = _mm_unpacklo_epi8(a, a);
+    // Shuffle bottom 64 bits, note swapping high bytes with low bytes
+    // c[31: 0] = {2{b[31:16]}} == {4{value[15:8}}
+    // c[63:32] = {2{b[15: 0]}} == {4{value[ 7:0}}
+    const __m128i c = _mm_shufflelo_epi16(b, 0x05);
+    // Shuffle whole register
+    // d[ 63: 0] = {2{c[31: 0]}} == {8{value[15:8}}
+    // d[126:54] = {2{c[63:32]}} == {8{value[ 7:0}}
+    const __m128i d = _mm_shuffle_epi32(c, 0x50);
+    // Test each bit within the bytes, this sets each byte lane to 0
+    // if the bit for that lane is 0 and to 0xff if the bit is 1.
+    const __m128i m = _mm_set1_epi64x(0x0102040810204080);
+    const __m128i e = _mm_cmpeq_epi8(_mm_and_si128(d, m), m);
+    // Convert to ASCII by subtracting the masks from ASCII '0':
+    // '0' - 0 is '0', '0' - -1 is '1'
+    const __m128i result = _mm_sub_epi8(_mm_set1_epi8('0'), e);
+    // Store the 16 characters to the un-aligned buffer
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dstp), result);
+#else
+    cvtCDataToStr(dstp, value >> 8);
+    cvtCDataToStr(dstp + 8, value);
+#endif
+}
+
+inline static void cvtIDataToStr(char* dstp, IData value) {
+#ifdef VL_HAVE_AVX2
+    // Similar to cvtSDataToStr but the bottom 16-bits are processed in the
+    // top half of the YMM registerss
+    const __m256i a = _mm256_insert_epi32(_mm256_undefined_si256(), value, 0);
+    const __m256i b = _mm256_permute4x64_epi64(a, 0);
+    const __m256i s = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
+                                      2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3);
+    const __m256i c = _mm256_shuffle_epi8(b, s);
+    const __m256i m = _mm256_set1_epi64x(0x0102040810204080);
+    const __m256i d = _mm256_cmpeq_epi8(_mm256_and_si256(c, m), m);
+    const __m256i result = _mm256_sub_epi8(_mm256_set1_epi8('0'), d);
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dstp), result);
+#else
+    cvtSDataToStr(dstp, value >> 16);
+    cvtSDataToStr(dstp + 16, value);
+#endif
+}
+
+inline static void cvtQDataToStr(char* dstp, QData value) {
+    cvtIDataToStr(dstp, value >> 32);
+    cvtIDataToStr(dstp + 32, value);
+}
+
+#define cvtEDataToStr cvtIDataToStr
--- a/include/verilated_vcd_c.cpp
+++ b/include/verilated_vcd_c.cpp
@ -35,6 +35,8 @@
 # include <unistd.h>
 #endif

+#include "verilated_intrinsics.h"
+
 // SPDIFF_ON

 #ifndef O_LARGEFILE  // For example on WIN32
@ -606,20 +608,14 @@ void VerilatedVcd::declTriArray(vluint32_t code, const char* name, bool array, i
 #endif  //  VL_TRACE_VCD_OLD_API

 //=============================================================================
-// Trace recording routines
+// Trace rendering prinitives

-//=============================================================================
-// Emit trace entries
-
-#define VL_VCD_SUFFIXP(code) (m_suffixesp + (code)*VL_TRACE_SUFFIX_ENTRY_SIZE)
-
-// Emit suffix, write back write pointer, check buffer
 void VerilatedVcd::finishLine(vluint32_t code, char* writep) {
-    const char* const suffixp = VL_VCD_SUFFIXP(code);
+    const char* const suffixp = m_suffixesp + code * VL_TRACE_SUFFIX_ENTRY_SIZE;
    // Copy the whole suffix (this avoid having hard to predict branches which
    // helps a lot). Note: The maximum length of the suffix is
    // VL_TRACE_MAX_VCD_CODE_SIZE + 2 == 7, but we unroll this here for speed.
-#ifdef __x86_64__
+#ifdef VL_X86_64
    // Copy the whole 8 bytes in one go, this works on little-endian machines
    // supporting unaligned stores.
    *reinterpret_cast<vluint64_t*>(writep) = *reinterpret_cast<const vluint64_t*>(suffixp);
@ -639,12 +635,15 @@ void VerilatedVcd::finishLine(vluint32_t code, char* writep) {
    bufferCheck();
 }

+//=============================================================================
+// emit* trace routines
+
 // Note: emit* are only ever called from one place (full* in
 // verilated_trace_imp.cpp, which is included in this file at the top),
 // so always inline them.

 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitBit(vluint32_t code, vluint32_t newval) {
+void VerilatedVcd::emitBit(vluint32_t code, CData newval) {
    // Don't prefetch suffix as it's a bit too late;
    char* wp = m_writep;
    *wp++ = '0' | static_cast<char>(newval);
@ -652,182 +651,56 @@ void VerilatedVcd::emitBit(vluint32_t code, vluint32_t newval) {
 }

 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitBus(vluint32_t code, vluint32_t newval, int bits) {
-    VL_PREFETCH_RD(VL_VCD_SUFFIXP(code));
+void VerilatedVcd::emitCData(vluint32_t code, CData newval, int bits) {
    char* wp = m_writep;
    *wp++ = 'b';
-    wp += bits;
-    // clang-format off
-    switch (bits) {
-    case 32: wp[-32] = '0' | static_cast<char>((newval >> 31)    ); //FALLTHRU
-    case 31: wp[-31] = '0' | static_cast<char>((newval >> 30) & 1); //FALLTHRU
-    case 30: wp[-30] = '0' | static_cast<char>((newval >> 29) & 1); //FALLTHRU
-    case 29: wp[-29] = '0' | static_cast<char>((newval >> 28) & 1); //FALLTHRU
-    case 28: wp[-28] = '0' | static_cast<char>((newval >> 27) & 1); //FALLTHRU
-    case 27: wp[-27] = '0' | static_cast<char>((newval >> 26) & 1); //FALLTHRU
-    case 26: wp[-26] = '0' | static_cast<char>((newval >> 25) & 1); //FALLTHRU
-    case 25: wp[-25] = '0' | static_cast<char>((newval >> 24) & 1); //FALLTHRU
-    case 24: wp[-24] = '0' | static_cast<char>((newval >> 23) & 1); //FALLTHRU
-    case 23: wp[-23] = '0' | static_cast<char>((newval >> 22) & 1); //FALLTHRU
-    case 22: wp[-22] = '0' | static_cast<char>((newval >> 21) & 1); //FALLTHRU
-    case 21: wp[-21] = '0' | static_cast<char>((newval >> 20) & 1); //FALLTHRU
-    case 20: wp[-20] = '0' | static_cast<char>((newval >> 19) & 1); //FALLTHRU
-    case 19: wp[-19] = '0' | static_cast<char>((newval >> 18) & 1); //FALLTHRU
-    case 18: wp[-18] = '0' | static_cast<char>((newval >> 17) & 1); //FALLTHRU
-    case 17: wp[-17] = '0' | static_cast<char>((newval >> 16) & 1); //FALLTHRU
-    case 16: wp[-16] = '0' | static_cast<char>((newval >> 15) & 1); //FALLTHRU
-    case 15: wp[-15] = '0' | static_cast<char>((newval >> 14) & 1); //FALLTHRU
-    case 14: wp[-14] = '0' | static_cast<char>((newval >> 13) & 1); //FALLTHRU
-    case 13: wp[-13] = '0' | static_cast<char>((newval >> 12) & 1); //FALLTHRU
-    case 12: wp[-12] = '0' | static_cast<char>((newval >> 11) & 1); //FALLTHRU
-    case 11: wp[-11] = '0' | static_cast<char>((newval >> 10) & 1); //FALLTHRU
-    case 10: wp[-10] = '0' | static_cast<char>((newval >>  9) & 1); //FALLTHRU
-    case 9:  wp[ -9] = '0' | static_cast<char>((newval >>  8) & 1); //FALLTHRU
-    case 8:  wp[ -8] = '0' | static_cast<char>((newval >>  7) & 1); //FALLTHRU
-    case 7:  wp[ -7] = '0' | static_cast<char>((newval >>  6) & 1); //FALLTHRU
-    case 6:  wp[ -6] = '0' | static_cast<char>((newval >>  5) & 1); //FALLTHRU
-    case 5:  wp[ -5] = '0' | static_cast<char>((newval >>  4) & 1); //FALLTHRU
-    case 4:  wp[ -4] = '0' | static_cast<char>((newval >>  3) & 1); //FALLTHRU
-    case 3:  wp[ -3] = '0' | static_cast<char>((newval >>  2) & 1); //FALLTHRU
-    case 2:  wp[ -2] = '0' | static_cast<char>((newval >>  1) & 1); //FALLTHRU
-    /*bit*/  wp[ -1] = '0' | static_cast<char>((newval      ) & 1); //FALLTHRU
-    }
-    // clang-format on
-    finishLine(code, wp);
+    cvtCDataToStr(wp, newval << (VL_BYTESIZE - bits));
+    finishLine(code, wp + bits);
 }

 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitQuad(vluint32_t code, vluint64_t newval, int bits) {
-    VL_PREFETCH_RD(VL_VCD_SUFFIXP(code));
+void VerilatedVcd::emitSData(vluint32_t code, SData newval, int bits) {
    char* wp = m_writep;
    *wp++ = 'b';
-    // Handle the top 32 bits within the 64 bit input
-    const int bitsInTopHalf = bits - 32;
-    wp += bitsInTopHalf;
-    // clang-format off
-    switch (bitsInTopHalf) {
-    case 32: wp[-32] = '0' | static_cast<char>((newval >> 63)    ); //FALLTHRU
-    case 31: wp[-31] = '0' | static_cast<char>((newval >> 62) & 1); //FALLTHRU
-    case 30: wp[-30] = '0' | static_cast<char>((newval >> 61) & 1); //FALLTHRU
-    case 29: wp[-29] = '0' | static_cast<char>((newval >> 60) & 1); //FALLTHRU
-    case 28: wp[-28] = '0' | static_cast<char>((newval >> 59) & 1); //FALLTHRU
-    case 27: wp[-27] = '0' | static_cast<char>((newval >> 58) & 1); //FALLTHRU
-    case 26: wp[-26] = '0' | static_cast<char>((newval >> 57) & 1); //FALLTHRU
-    case 25: wp[-25] = '0' | static_cast<char>((newval >> 56) & 1); //FALLTHRU
-    case 24: wp[-24] = '0' | static_cast<char>((newval >> 55) & 1); //FALLTHRU
-    case 23: wp[-23] = '0' | static_cast<char>((newval >> 54) & 1); //FALLTHRU
-    case 22: wp[-22] = '0' | static_cast<char>((newval >> 53) & 1); //FALLTHRU
-    case 21: wp[-21] = '0' | static_cast<char>((newval >> 52) & 1); //FALLTHRU
-    case 20: wp[-20] = '0' | static_cast<char>((newval >> 51) & 1); //FALLTHRU
-    case 19: wp[-19] = '0' | static_cast<char>((newval >> 50) & 1); //FALLTHRU
-    case 18: wp[-18] = '0' | static_cast<char>((newval >> 49) & 1); //FALLTHRU
-    case 17: wp[-17] = '0' | static_cast<char>((newval >> 48) & 1); //FALLTHRU
-    case 16: wp[-16] = '0' | static_cast<char>((newval >> 47) & 1); //FALLTHRU
-    case 15: wp[-15] = '0' | static_cast<char>((newval >> 46) & 1); //FALLTHRU
-    case 14: wp[-14] = '0' | static_cast<char>((newval >> 45) & 1); //FALLTHRU
-    case 13: wp[-13] = '0' | static_cast<char>((newval >> 44) & 1); //FALLTHRU
-    case 12: wp[-12] = '0' | static_cast<char>((newval >> 43) & 1); //FALLTHRU
-    case 11: wp[-11] = '0' | static_cast<char>((newval >> 42) & 1); //FALLTHRU
-    case 10: wp[-10] = '0' | static_cast<char>((newval >> 41) & 1); //FALLTHRU
-    case 9:  wp[ -9] = '0' | static_cast<char>((newval >> 40) & 1); //FALLTHRU
-    case 8:  wp[ -8] = '0' | static_cast<char>((newval >> 39) & 1); //FALLTHRU
-    case 7:  wp[ -7] = '0' | static_cast<char>((newval >> 38) & 1); //FALLTHRU
-    case 6:  wp[ -6] = '0' | static_cast<char>((newval >> 37) & 1); //FALLTHRU
-    case 5:  wp[ -5] = '0' | static_cast<char>((newval >> 36) & 1); //FALLTHRU
-    case 4:  wp[ -4] = '0' | static_cast<char>((newval >> 35) & 1); //FALLTHRU
-    case 3:  wp[ -3] = '0' | static_cast<char>((newval >> 34) & 1); //FALLTHRU
-    case 2:  wp[ -2] = '0' | static_cast<char>((newval >> 33) & 1); //FALLTHRU
-    case 1:  wp[ -1] = '0' | static_cast<char>((newval >> 32) & 1); //FALLTHRU
-    }
-    // clang-format on
-    // Handle the bottom 32 bits within the 64 bit input
-    vluint32_t val = static_cast<vluint32_t>(newval);  // Truncate to bottom 32 bits
-    int loops = 4;
-    do {
-        wp[0] = '0' | static_cast<char>((val >> 31));
-        wp[1] = '0' | static_cast<char>((val >> 30) & 1);
-        wp[2] = '0' | static_cast<char>((val >> 29) & 1);
-        wp[3] = '0' | static_cast<char>((val >> 28) & 1);
-        wp[4] = '0' | static_cast<char>((val >> 27) & 1);
-        wp[5] = '0' | static_cast<char>((val >> 26) & 1);
-        wp[6] = '0' | static_cast<char>((val >> 25) & 1);
-        wp[7] = '0' | static_cast<char>((val >> 24) & 1);
-        wp += 8;
-        val <<= 8;
-    } while (--loops);
-
-    finishLine(code, wp);
+    cvtSDataToStr(wp, newval << (VL_SHORTSIZE - bits));
+    finishLine(code, wp + bits);
 }

 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitArray(vluint32_t code, const vluint32_t* newvalp, int bits) {
-    VL_PREFETCH_RD(VL_VCD_SUFFIXP(code));
-    int words = (bits + 31) / 32;
+void VerilatedVcd::emitIData(vluint32_t code, IData newval, int bits) {
+    char* wp = m_writep;
+    *wp++ = 'b';
+    cvtIDataToStr(wp, newval << (VL_IDATASIZE - bits));
+    finishLine(code, wp + bits);
+}
+
+VL_ATTR_ALWINLINE
+void VerilatedVcd::emitQData(vluint32_t code, QData newval, int bits) {
+    char* wp = m_writep;
+    *wp++ = 'b';
+    cvtQDataToStr(wp, newval << (VL_QUADSIZE - bits));
+    finishLine(code, wp + bits);
+}
+
+VL_ATTR_ALWINLINE
+void VerilatedVcd::emitWData(vluint32_t code, const WData* newvalp, int bits) {
+    int words = VL_WORDS_I(bits);
    char* wp = m_writep;
    *wp++ = 'b';
    // Handle the most significant word
-    vluint32_t val = newvalp[--words];
-    const int bitsInMSW = bits % 32 == 0 ? 32 : bits % 32;
+    const int bitsInMSW = VL_BITBIT_E(bits) ? VL_BITBIT_E(bits) : VL_EDATASIZE;
+    cvtEDataToStr(wp, newvalp[--words] << (VL_EDATASIZE - bitsInMSW));
    wp += bitsInMSW;
-    // clang-format off
-    switch (bitsInMSW) {
-    case 32: wp[-32] = '0' | static_cast<char>((val >> 31)    ); //FALLTHRU
-    case 31: wp[-31] = '0' | static_cast<char>((val >> 30) & 1); //FALLTHRU
-    case 30: wp[-30] = '0' | static_cast<char>((val >> 29) & 1); //FALLTHRU
-    case 29: wp[-29] = '0' | static_cast<char>((val >> 28) & 1); //FALLTHRU
-    case 28: wp[-28] = '0' | static_cast<char>((val >> 27) & 1); //FALLTHRU
-    case 27: wp[-27] = '0' | static_cast<char>((val >> 26) & 1); //FALLTHRU
-    case 26: wp[-26] = '0' | static_cast<char>((val >> 25) & 1); //FALLTHRU
-    case 25: wp[-25] = '0' | static_cast<char>((val >> 24) & 1); //FALLTHRU
-    case 24: wp[-24] = '0' | static_cast<char>((val >> 23) & 1); //FALLTHRU
-    case 23: wp[-23] = '0' | static_cast<char>((val >> 22) & 1); //FALLTHRU
-    case 22: wp[-22] = '0' | static_cast<char>((val >> 21) & 1); //FALLTHRU
-    case 21: wp[-21] = '0' | static_cast<char>((val >> 20) & 1); //FALLTHRU
-    case 20: wp[-20] = '0' | static_cast<char>((val >> 19) & 1); //FALLTHRU
-    case 19: wp[-19] = '0' | static_cast<char>((val >> 18) & 1); //FALLTHRU
-    case 18: wp[-18] = '0' | static_cast<char>((val >> 17) & 1); //FALLTHRU
-    case 17: wp[-17] = '0' | static_cast<char>((val >> 16) & 1); //FALLTHRU
-    case 16: wp[-16] = '0' | static_cast<char>((val >> 15) & 1); //FALLTHRU
-    case 15: wp[-15] = '0' | static_cast<char>((val >> 14) & 1); //FALLTHRU
-    case 14: wp[-14] = '0' | static_cast<char>((val >> 13) & 1); //FALLTHRU
-    case 13: wp[-13] = '0' | static_cast<char>((val >> 12) & 1); //FALLTHRU
-    case 12: wp[-12] = '0' | static_cast<char>((val >> 11) & 1); //FALLTHRU
-    case 11: wp[-11] = '0' | static_cast<char>((val >> 10) & 1); //FALLTHRU
-    case 10: wp[-10] = '0' | static_cast<char>((val >>  9) & 1); //FALLTHRU
-    case 9:  wp[ -9] = '0' | static_cast<char>((val >>  8) & 1); //FALLTHRU
-    case 8:  wp[ -8] = '0' | static_cast<char>((val >>  7) & 1); //FALLTHRU
-    case 7:  wp[ -7] = '0' | static_cast<char>((val >>  6) & 1); //FALLTHRU
-    case 6:  wp[ -6] = '0' | static_cast<char>((val >>  5) & 1); //FALLTHRU
-    case 5:  wp[ -5] = '0' | static_cast<char>((val >>  4) & 1); //FALLTHRU
-    case 4:  wp[ -4] = '0' | static_cast<char>((val >>  3) & 1); //FALLTHRU
-    case 3:  wp[ -3] = '0' | static_cast<char>((val >>  2) & 1); //FALLTHRU
-    case 2:  wp[ -2] = '0' | static_cast<char>((val >>  1) & 1); //FALLTHRU
-    case 1:  wp[ -1] = '0' | static_cast<char>((val      ) & 1); //FALLTHRU
-    }
-    // clang-format on
    // Handle the remaining words
    while (words > 0) {
-        vluint32_t val = newvalp[--words];
-        int loops = 4;
-        do {
-            wp[0] = '0' | static_cast<char>((val >> 31));
-            wp[1] = '0' | static_cast<char>((val >> 30) & 1);
-            wp[2] = '0' | static_cast<char>((val >> 29) & 1);
-            wp[3] = '0' | static_cast<char>((val >> 28) & 1);
-            wp[4] = '0' | static_cast<char>((val >> 27) & 1);
-            wp[5] = '0' | static_cast<char>((val >> 26) & 1);
-            wp[6] = '0' | static_cast<char>((val >> 25) & 1);
-            wp[7] = '0' | static_cast<char>((val >> 24) & 1);
-            wp += 8;
-            val <<= 8;
-        } while (--loops);
+        cvtEDataToStr(wp, newvalp[--words]);
+        wp += VL_EDATASIZE;
    }
    finishLine(code, wp);
 }

 VL_ATTR_ALWINLINE
 void VerilatedVcd::emitFloat(vluint32_t code, float newval) {
-    VL_PREFETCH_RD(VL_VCD_SUFFIXP(code));
    char* wp = m_writep;
    // Buffer can't overflow before sprintf; we sized during declaration
    sprintf(wp, "r%.16g", static_cast<double>(newval));
@ -837,7 +710,6 @@ void VerilatedVcd::emitFloat(vluint32_t code, float newval) {

 VL_ATTR_ALWINLINE
 void VerilatedVcd::emitDouble(vluint32_t code, double newval) {
-    VL_PREFETCH_RD(VL_VCD_SUFFIXP(code));
    char* wp = m_writep;
    // Buffer can't overflow before sprintf; we sized during declaration
    sprintf(wp, "r%.16g", newval);
@ -845,8 +717,6 @@ void VerilatedVcd::emitDouble(vluint32_t code, double newval) {
    finishLine(code, wp);
 }

-#undef VL_VCD_SUFFIXP
-
 #ifdef VL_TRACE_VCD_OLD_API

 void VerilatedVcd::fullBit(vluint32_t code, const vluint32_t newval) {
--- a/include/verilated_vcd_c.h
+++ b/include/verilated_vcd_c.h
@ -124,10 +124,12 @@ protected:

    // Implementations of duck-typed methods for VerilatedTrace. These are
    // called from only one place (namely full*) so always inline them.
-    inline void emitBit(vluint32_t code, vluint32_t newval);
-    inline void emitBus(vluint32_t code, vluint32_t newval, int bits);
-    inline void emitQuad(vluint32_t code, vluint64_t newval, int bits);
-    inline void emitArray(vluint32_t code, const vluint32_t* newvalp, int bits);
+    inline void emitBit(vluint32_t code, CData newval);
+    inline void emitCData(vluint32_t code, CData newval, int bits);
+    inline void emitSData(vluint32_t code, SData newval, int bits);
+    inline void emitIData(vluint32_t code, IData newval, int bits);
+    inline void emitQData(vluint32_t code, QData newval, int bits);
+    inline void emitWData(vluint32_t code, const WData* newvalp, int bits);
    inline void emitFloat(vluint32_t code, float newval);
    inline void emitDouble(vluint32_t code, double newval);

@ -176,37 +178,48 @@ public:
                     int lsb);
    void declTriArray(vluint32_t code, const char* name, bool array, int arraynum, int msb,
                      int lsb);
-    //=========================================================================
-    // Write back to previous value buffer value and emit

-    void fullBit(vluint32_t* oldp, vluint32_t newval) { fullBit(oldp - this->oldp(0), newval); }
-    void fullBus(vluint32_t* oldp, vluint32_t newval, int bits) {
+    void fullBit(vluint32_t* oldp, CData newval) { fullBit(oldp - this->oldp(0), newval); }
+    void fullCData(vluint32_t* oldp, CData newval, int bits) {
        fullBus(oldp - this->oldp(0), newval, bits);
    }
-    void fullQuad(vluint32_t* oldp, vluint64_t newval, int bits) {
+    void fullSData(vluint32_t* oldp, SData newval, int bits) {
+        fullBus(oldp - this->oldp(0), newval, bits);
+    }
+    void fullIData(vluint32_t* oldp, IData newval, int bits) {
+        fullBus(oldp - this->oldp(0), newval, bits);
+    }
+    void fullQData(vluint32_t* oldp, QData newval, int bits) {
        fullQuad(oldp - this->oldp(0), newval, bits);
    }
-    void fullArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits) {
+    void fullWData(vluint32_t* oldp, const WData* newvalp, int bits) {
        fullArray(oldp - this->oldp(0), newvalp, bits);
    }
    void fullFloat(vluint32_t* oldp, float newval) { fullFloat(oldp - this->oldp(0), newval); }
    void fullDouble(vluint32_t* oldp, double newval) { fullDouble(oldp - this->oldp(0), newval); }

-    //=========================================================================
-    // Check previous value and emit if changed
-
-    void chgBit(vluint32_t* oldp, vluint32_t newval) { chgBit(oldp - this->oldp(0), newval); }
-    void chgBus(vluint32_t* oldp, vluint32_t newval, int bits) {
+    inline void chgBit(vluint32_t* oldp, CData newval) { chgBit(oldp - this->oldp(0), newval); }
+    inline void chgCData(vluint32_t* oldp, CData newval, int bits) {
        chgBus(oldp - this->oldp(0), newval, bits);
    }
-    void chgQuad(vluint32_t* oldp, vluint64_t newval, int bits) {
+    inline void chgSData(vluint32_t* oldp, SData newval, int bits) {
+        chgBus(oldp - this->oldp(0), newval, bits);
+    }
+    inline void chgIData(vluint32_t* oldp, IData newval, int bits) {
+        chgBus(oldp - this->oldp(0), newval, bits);
+    }
+    inline void chgQData(vluint32_t* oldp, QData newval, int bits) {
        chgQuad(oldp - this->oldp(0), newval, bits);
    }
-    void chgArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits) {
+    inline void chgWData(vluint32_t* oldp, const WData* newvalp, int bits) {
        chgArray(oldp - this->oldp(0), newvalp, bits);
    }
-    void chgFloat(vluint32_t* oldp, float newval) { chgFloat(oldp - this->oldp(0), newval); }
-    void chgDouble(vluint32_t* oldp, double newval) { chgDouble(oldp - this->oldp(0), newval); }
+    inline void chgFloat(vluint32_t* oldp, float newval) {
+        chgFloat(oldp - this->oldp(0), newval);
+    }
+    inline void chgDouble(vluint32_t* oldp, double newval) {
+        chgDouble(oldp - this->oldp(0), newval);
+    }

    /// Inside dumping routines, dump one signal, faster when not inlined
    /// due to code size reduction.
--- a/include/verilatedos.h
+++ b/include/verilatedos.h
@ -475,6 +475,16 @@ typedef unsigned long long vluint64_t;  ///< 64-bit unsigned type
 #else
 # define VL_STRCASECMP strcasecmp
 #endif
+
+//=========================================================================
+// Macros controlling target specific optimizations
+
+// Define VL_PORTABLE_ONLY to disable all target specific optimizations
+#ifndef VL_PORTABLE_ONLY
+# ifdef __x86_64__
+#  define VL_X86_64 1
+# endif
+#endif // VL_PORTABLE_ONLY
 // clang-format on

 //=========================================================================
--- a/src/V3EmitC.cpp
+++ b/src/V3EmitC.cpp
@ -3550,20 +3550,23 @@ class EmitCTrace : EmitCStmts {
        const bool full = (m_funcp->funcType() == AstCFuncType::TRACE_FULL
                           || m_funcp->funcType() == AstCFuncType::TRACE_FULL_SUB);
        const string func = full ? "full" : "chg";
-        bool emitWidth = false;
+        bool emitWidth = true;
        if (nodep->dtypep()->basicp()->isDouble()) {
            puts("vcdp->" + func + "Double");
+            emitWidth = false;
        } else if (nodep->isWide() || emitTraceIsScBv(nodep) || emitTraceIsScBigUint(nodep)) {
-            puts("vcdp->" + func + "Array");
-            emitWidth = true;
+            puts("vcdp->" + func + "WData");
        } else if (nodep->isQuad()) {
-            puts("vcdp->" + func + "Quad");
-            emitWidth = true;
+            puts("vcdp->" + func + "QData");
+        } else if (nodep->declp()->widthMin() > 16) {
+            puts("vcdp->" + func + "IData");
+        } else if (nodep->declp()->widthMin() > 8) {
+            puts("vcdp->" + func + "SData");
        } else if (nodep->declp()->widthMin() > 1) {
-            puts("vcdp->" + func + "Bus");
-            emitWidth = true;
+            puts("vcdp->" + func + "CData");
        } else {
            puts("vcdp->" + func + "Bit");
+            emitWidth = false;
        }

        const uint32_t offset = (arrayindex < 0) ? 0 : (arrayindex * nodep->declp()->widthWords());
--- a/test_regress/t/t_trace_array_fst_portable.pl
+++ b/test_regress/t/t_trace_array_fst_portable.pl
@ -0,0 +1,28 @@
+#!/usr/bin/perl
+if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2003-2009 by Wilson Snyder. This program is free software; you
+# can redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+
+scenarios(vlt => 1);
+
+top_filename("t/t_trace_array.v");
+$Self->{golden_filename} = "t/t_trace_array_fst.out";
+
+compile(
+    verilator_flags2 => ['--cc --trace-fst --trace-structs',
+                         '-CFLAGS -DVL_PORTABLE_ONLY'],
+    );
+
+execute(
+    check_finished => 1,
+    );
+
+fst_identical($Self->trace_filename, $Self->{golden_filename});
+
+ok(1);
+1;
--- a/test_regress/t/t_trace_complex_portable.pl
+++ b/test_regress/t/t_trace_complex_portable.pl
@ -0,0 +1,38 @@
+#!/usr/bin/perl
+if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2003-2009 by Wilson Snyder. This program is free software; you
+# can redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+
+# Same test as t_trace_complex, but exercising the old VCD tracing API
+
+scenarios(vlt => 1);
+
+top_filename("t/t_trace_complex.v");
+
+compile(
+    verilator_flags2 => ['--cc --trace -CFLAGS -DVL_PORTABLE_ONLY'],
+    );
+
+execute(
+    check_finished => 1,
+    );
+
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_strp /);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_strp_strp /);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_arrp /);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_arrp_arrp /);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_arrp_strp /);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_arru\(/);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_arru_arru\(/);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_arru_arrp\(/);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_arru_strp\(/);
+
+vcd_identical ("$Self->{obj_dir}/simx.vcd", "t/t_trace_complex.out");
+
+ok(1);
+1;