Improve tracing performance. (#2257)

* Improve tracing performance.

Various tactics used to improve performance of both VCD and FST tracing:
- Both: Change tracing functions to templates to take variable widths as
  template parameters. For VCD, subsequently specialize these to the
  values used by Verilator. This avoids redundant instructions and hard
  to predict branches.
- Both: Check for value changes via direct pointer access into the previous
  signal value buffer. This eliminates a lot of simple pointer arithmetic
  instructions from the tracing code.
- Both: Verilator provides clean input, no need to mask out used bits.
- VCD: pre-compute identifier codes and use memory copy instead of
  re-computing them every time a code is emitted. This saves a lot of
  instructions and hard to predict branches. The added D-cache misses are
  cheaper than the removed branches/instructions.
- VCD: re-write the routines emitting the changes to be more efficient.
- FST: Use previous signal value buffer the same way as the VCD tracing
  code, and only call the FST API when a change is detected.

Performance as measured on SweRV EH1, with the pre-canned CoreMark
benchmark running from DCCM/ICCM, clang 6.0.0, Intel i7-3770 @ 3.40GHz,
and IO to ramdisk:

            +--------------+---------------+----------------------+
            | VCD          | FST           | FST separate thread  |
            | (--trace)    | (--trace-fst) | (--trace-fst-thread) |
------------+-----------------------------------------------------+
Before      | 30.2 s       | 121.1 s       | 69.8 s               |
============+==============+===============+======================+
After       | 24.7 s       | 45.7 s        | 32.4 s               |
------------+--------------+---------------+----------------------+
Speedup     | 22 %         | 256 %         | 215 %                |
------------+--------------+---------------+----------------------+
Rel. to VCD | 1 x          | 1.85 x        | 1.31 x               |
------------+--------------+---------------+----------------------+

In addition, FST trace size for the above reduced by 48%.
2020-04-14 00:13:10 +01:00 · 2020-04-14 00:13:10 +01:00 · dc5c259069
commit dc5c259069
parent dc27a179e2
13 changed files with 920 additions and 982 deletions
--- a/4
+++ b/4
@ -15,11 +15,11 @@ The contributors that suggested a given feature are shown in []. Thanks!

 ****  Add error if use SystemC 2.2 and earlier (pre-2011) as is deprecated.

-****  Improve FST dump performance, #2244, #2250. [Geza Lore]
+****  Greatly improve FST dump performance, #2244, #2250, #2257. [Geza Lore]

 ****  Fix build of fast path tracing code to use OPT_FAST, #2245. [Geza Lore]

-****  Improve VCD dump performance, #2246, #2250. [Geza Lore]
+****  Improve VCD dump performance, #2246, #2250, #2257. [Geza Lore]


 * Verilator 4.032 2020-04-04
--- a/bin/verilator
+++ b/bin/verilator
@ -4987,6 +4987,12 @@ using the Verilator --exe flag.
 Note you can also call ->trace on multiple Verilated objects with the same
 trace file if you want all data to land in the same output file.

+When using SystemC 2.3, the SystemC library must have been built with the
+experimental simulation phase callback based tracing disabled. This is
+disabled by default when building SystemC with its configure based build
+system, but when building SystemC with CMake, you must pass
+-DENABLE_PHASE_CALLBACKS_TRACING=OFF to disable this feature.
+
    #include "verilated_vcd_sc.h"
    ...
    int main(int argc, char** argv, char** env) {
--- a/include/verilated_fst_c.cpp
+++ b/include/verilated_fst_c.cpp
@ -17,6 +17,8 @@
 //=============================================================================
 // SPDIFF_OFF

+// clang-format off
+
 #define __STDC_LIMIT_MACROS  // UINT64_MAX
 #include "verilatedos.h"
 #include "verilated.h"
@ -49,6 +51,8 @@
 # include <unistd.h>
 #endif

+// clang-format on
+
 //=============================================================================

 class VerilatedFstCallInfo {
@ -79,13 +83,15 @@ VerilatedFst::VerilatedFst(void* fst)
    , m_minNextDumpTime(0)
    , m_nextCode(1)
    , m_scopeEscape('.')
-    , m_symbolp(NULL) {
+    , m_symbolp(NULL)
+    , m_sigs_oldvalp(NULL) {
    m_valueStrBuffer.reserve(64 + 1);  // Need enough room for quad
 }

 VerilatedFst::~VerilatedFst() {
    if (m_fst) fstWriterClose(m_fst);
    if (m_symbolp) VL_DO_CLEAR(delete[] m_symbolp, m_symbolp = NULL);
+    if (m_sigs_oldvalp) VL_DO_CLEAR(delete[] m_sigs_oldvalp, m_sigs_oldvalp = NULL);
 }

 void VerilatedFst::open(const char* filename) VL_MT_UNSAFE {
@ -121,6 +127,9 @@ void VerilatedFst::open(const char* filename) VL_MT_UNSAFE {
        }
    }
    m_code2symbol.clear();
+
+    // Allocate space now we know the number of codes
+    if (!m_sigs_oldvalp) m_sigs_oldvalp = new vluint32_t[m_nextCode + 10];
 }

 void VerilatedFst::module(const std::string& name) { m_module = name; }
@ -214,9 +223,9 @@ void VerilatedFst::addCallback(VerilatedFstCallback_t initcb, VerilatedFstCallba
 void VerilatedFst::dump(vluint64_t timeui) {
    if (!isOpen()) return;
    if (timeui < m_minNextDumpTime) {
-      VL_PRINTF_MT("%%Warning: previous dump at t=%" VL_PRI64 "u, requesting t=%" VL_PRI64 "u\n",
-        m_minNextDumpTime - 1, timeui);
-      return;
+        VL_PRINTF_MT("%%Warning: previous dump at t=%" VL_PRI64 "u, requesting t=%" VL_PRI64 "u\n",
+                     m_minNextDumpTime - 1, timeui);
+        return;
    }
    m_minNextDumpTime = timeui + 1;
    if (VL_UNLIKELY(m_fullDump)) {
--- a/include/verilated_fst_c.h
+++ b/include/verilated_fst_c.h
@ -57,11 +57,11 @@ private:
    Local2FstDtype m_local2fstdtype;
    std::list<std::string> m_curScope;
    fstHandle* m_symbolp;  ///< same as m_code2symbol, but as an array
+    vluint32_t* m_sigs_oldvalp;
    // CONSTRUCTORS
    VL_UNCOPYABLE(VerilatedFst);
-    void declSymbol(vluint32_t code, const char* name,
-                    int dtypenum, fstVarDir vardir, fstVarType vartype,
-                    bool array, int arraynum, vluint32_t len, vluint32_t bits);
+    void declSymbol(vluint32_t code, const char* name, int dtypenum, fstVarDir vardir,
+                    fstVarType vartype, bool array, int arraynum, vluint32_t len, vluint32_t bits);
    // helpers
    std::vector<char> m_valueStrBuffer;

@ -94,102 +94,108 @@ public:
    void dump(vluint64_t timeui);
    /// Inside dumping routines, declare callbacks for tracings
    void addCallback(VerilatedFstCallback_t initcb, VerilatedFstCallback_t fullcb,
-                     VerilatedFstCallback_t changecb,
-                     void* userthis) VL_MT_UNSAFE_ONE;
+                     VerilatedFstCallback_t changecb, void* userthis) VL_MT_UNSAFE_ONE;

    /// Inside dumping routines, declare a module
    void module(const std::string& name);
    /// Inside dumping routines, declare a data type
    void declDTypeEnum(int dtypenum, const char* name, vluint32_t elements,
-                       unsigned int minValbits,
-                       const char** itemNamesp, const char** itemValuesp);
+                       unsigned int minValbits, const char** itemNamesp, const char** itemValuesp);
    /// Inside dumping routines, declare a signal
-    void declBit(vluint32_t code, const char* name,
-                 int dtypenum, fstVarDir vardir, fstVarType vartype,
-                 bool array, int arraynum) {
+    void declBit(vluint32_t code, const char* name, int dtypenum, fstVarDir vardir,
+                 fstVarType vartype, bool array, int arraynum) {
        declSymbol(code, name, dtypenum, vardir, vartype, array, arraynum, 1, 1);
    }
-    void declBus(vluint32_t code, const char* name,
-                 int dtypenum, fstVarDir vardir, fstVarType vartype,
-                 bool array, int arraynum, int msb, int lsb) {
+    void declBus(vluint32_t code, const char* name, int dtypenum, fstVarDir vardir,
+                 fstVarType vartype, bool array, int arraynum, int msb, int lsb) {
        declSymbol(code, name, dtypenum, vardir, vartype, array, arraynum, msb - lsb + 1,
                   msb - lsb + 1);
    }
-    void declDouble(vluint32_t code, const char* name,
-                    int dtypenum, fstVarDir vardir, fstVarType vartype,
-                    bool array, int arraynum) {
-        declSymbol(code, name, dtypenum, vardir, vartype, array, arraynum, 2, 64);
+    void declQuad(vluint32_t code, const char* name, int dtypenum, fstVarDir vardir,
+                  fstVarType vartype, bool array, int arraynum, int msb, int lsb) {
+        declSymbol(code, name, dtypenum, vardir, vartype, array, arraynum, msb - lsb + 1,
+                   msb - lsb + 1);
    }
-    void declFloat(vluint32_t code, const char* name,
-                   int dtypenum, fstVarDir vardir, fstVarType vartype,
-                   bool array, int arraynum) {
+    void declArray(vluint32_t code, const char* name, int dtypenum, fstVarDir vardir,
+                   fstVarType vartype, bool array, int arraynum, int msb, int lsb) {
+        declSymbol(code, name, dtypenum, vardir, vartype, array, arraynum, msb - lsb + 1,
+                   msb - lsb + 1);
+    }
+    void declFloat(vluint32_t code, const char* name, int dtypenum, fstVarDir vardir,
+                   fstVarType vartype, bool array, int arraynum) {
        declSymbol(code, name, dtypenum, vardir, vartype, array, arraynum, 1, 32);
    }
-    void declQuad(vluint32_t code, const char* name,
-                  int dtypenum, fstVarDir vardir, fstVarType vartype,
-                  bool array, int arraynum, int msb, int lsb) {
-        declSymbol(code, name, dtypenum, vardir, vartype, array, arraynum, msb - lsb + 1,
-                   msb - lsb + 1);
-    }
-    void declArray(vluint32_t code, const char* name,
-                   int dtypenum, fstVarDir vardir, fstVarType vartype,
-                   bool array, int arraynum, int msb, int lsb) {
-        declSymbol(code, name, dtypenum, vardir, vartype, array, arraynum, msb - lsb + 1,
-                   msb - lsb + 1);
+    void declDouble(vluint32_t code, const char* name, int dtypenum, fstVarDir vardir,
+                    fstVarType vartype, bool array, int arraynum) {
+        declSymbol(code, name, dtypenum, vardir, vartype, array, arraynum, 2, 64);
    }

-    /// Inside dumping routines, dump one signal if it has changed
-    void chgBit(vluint32_t code, const vluint32_t newval) {
-        fstWriterEmitValueChange(m_fst, m_symbolp[code], newval ? "1" : "0");
+    //=========================================================================
+    // Inside dumping routines used by Verilator
+
+    vluint32_t* oldp(vluint32_t code) { return m_sigs_oldvalp + code; }
+
+    //=========================================================================
+    // Write back to previous value buffer value and emit
+
+    void fullBit(vluint32_t* oldp, vluint32_t newval) {
+        *oldp = newval;
+        fstWriterEmitValueChange(m_fst, m_symbolp[oldp - m_sigs_oldvalp], newval ? "1" : "0");
    }
-    void chgBus(vluint32_t code, const vluint32_t newval, int bits) {
-        fstWriterEmitValueChange32(m_fst, m_symbolp[code], bits, newval);
+    template <int T_Bits> void fullBus(vluint32_t* oldp, vluint32_t newval) {
+        *oldp = newval;
+        fstWriterEmitValueChange32(m_fst, m_symbolp[oldp - m_sigs_oldvalp], T_Bits, newval);
    }
-    void chgDouble(vluint32_t code, const double newval) {
-        double val = newval;
-        fstWriterEmitValueChange(m_fst, m_symbolp[code], &val);
+    void fullQuad(vluint32_t* oldp, vluint64_t newval, int bits) {
+        *reinterpret_cast<vluint64_t*>(oldp) = newval;
+        fstWriterEmitValueChange64(m_fst, m_symbolp[oldp - m_sigs_oldvalp], bits, newval);
    }
-    void chgFloat(vluint32_t code, const float newval) {
-        double val = (double)newval;
-        fstWriterEmitValueChange(m_fst, m_symbolp[code], &val);
+    void fullArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits) {
+        for (int i = 0; i < (bits + 31) / 32; ++i) oldp[i] = newvalp[i];
+        fstWriterEmitValueChangeVec32(m_fst, m_symbolp[oldp - m_sigs_oldvalp], bits, newvalp);
    }
-    void chgQuad(vluint32_t code, const vluint64_t newval, int bits) {
-        fstWriterEmitValueChange64(m_fst, m_symbolp[code], bits, newval);
+    void fullFloat(vluint32_t* oldp, float newval) {
+        // cppcheck-suppress invalidPointerCast
+        *reinterpret_cast<float*>(oldp) = newval;
+        fstWriterEmitValueChange(m_fst, m_symbolp[oldp - m_sigs_oldvalp], oldp);
    }
-    void chgArray(vluint32_t code, const vluint32_t* newval, int bits) {
-        fstWriterEmitValueChangeVec32(m_fst, m_symbolp[code], bits, newval);
+    void fullDouble(vluint32_t* oldp, double newval) {
+        // cppcheck-suppress invalidPointerCast
+        *reinterpret_cast<double*>(oldp) = newval;
+        fstWriterEmitValueChange(m_fst, m_symbolp[oldp - m_sigs_oldvalp], oldp);
    }

-    void fullBit(vluint32_t code, const vluint32_t newval) { chgBit(code, newval); }
-    void fullBus(vluint32_t code, const vluint32_t newval, int bits) {
-        chgBus(code, newval, bits);
-    }
-    void fullDouble(vluint32_t code, const double newval) { chgDouble(code, newval); }
-    void fullFloat(vluint32_t code, const float newval) { chgFloat(code, newval); }
-    void fullQuad(vluint32_t code, const vluint64_t newval, int bits) {
-        chgQuad(code, newval, bits);
-    }
-    void fullArray(vluint32_t code, const vluint32_t* newval, int bits) {
-        chgArray(code, newval, bits);
-    }
+    //=========================================================================
+    // Check previous value and emit if changed

-    void declTriBit(vluint32_t code, const char* name, int arraynum);
-    void declTriBus(vluint32_t code, const char* name, int arraynum, int msb, int lsb);
-    void declTriQuad(vluint32_t code, const char* name, int arraynum, int msb, int lsb);
-    void declTriArray(vluint32_t code, const char* name, int arraynum, int msb, int lsb);
-    void fullTriBit(vluint32_t code, const vluint32_t newval, const vluint32_t newtri);
-    void fullTriBus(vluint32_t code, const vluint32_t newval, const vluint32_t newtri, int bits);
-    void fullTriQuad(vluint32_t code, const vluint64_t newval, const vluint32_t newtri, int bits);
-    void fullTriArray(vluint32_t code, const vluint32_t* newvalp, const vluint32_t* newtrip,
-                      int bits);
-    void fullBitX(vluint32_t code);
-    void fullBusX(vluint32_t code, int bits);
-    void fullQuadX(vluint32_t code, int bits);
-    void fullArrayX(vluint32_t code, int bits);
-    void chgTriBit(vluint32_t code, const vluint32_t newval, const vluint32_t newtri);
-    void chgTriBus(vluint32_t code, const vluint32_t newval, const vluint32_t newtri, int bits);
-    void chgTriQuad(vluint32_t code, const vluint64_t newval, const vluint32_t newtri, int bits);
-    void chgTriArray(vluint32_t code, const vluint32_t* newvalp, const vluint32_t* newtrip, int bits);
+    inline void chgBit(vluint32_t* oldp, vluint32_t newval) {
+        const vluint32_t diff = *oldp ^ newval;
+        if (VL_UNLIKELY(diff)) fullBit(oldp, newval);
+    }
+    template <int T_Bits> inline void chgBus(vluint32_t* oldp, vluint32_t newval) {
+        const vluint32_t diff = *oldp ^ newval;
+        if (VL_UNLIKELY(diff)) fullBus<T_Bits>(oldp, newval);
+    }
+    inline void chgQuad(vluint32_t* oldp, vluint64_t newval, int bits) {
+        const vluint64_t diff = *reinterpret_cast<vluint64_t*>(oldp) ^ newval;
+        if (VL_UNLIKELY(diff)) fullQuad(oldp, newval, bits);
+    }
+    inline void chgArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits) {
+        for (int i = 0; i < (bits + 31) / 32; ++i) {
+            if (VL_UNLIKELY(oldp[i] ^ newvalp[i])) {
+                fullArray(oldp, newvalp, bits);
+                return;
+            }
+        }
+    }
+    inline void chgFloat(vluint32_t* oldp, float newval) {
+        // cppcheck-suppress invalidPointerCast
+        if (VL_UNLIKELY(*reinterpret_cast<float*>(oldp) != newval)) fullFloat(oldp, newval);
+    }
+    inline void chgDouble(vluint32_t* oldp, double newval) {
+        // cppcheck-suppress invalidPointerCast
+        if (VL_UNLIKELY(*reinterpret_cast<double*>(oldp) != newval)) fullDouble(oldp, newval);
+    }
 };

 //=============================================================================
--- a/include/verilated_vcd_c.cpp
+++ b/include/verilated_vcd_c.cpp
@ -17,6 +17,8 @@
 //=============================================================================
 // SPDIFF_OFF

+// clang-format off
+
 #include "verilatedos.h"
 #include "verilated.h"
 #include "verilated_vcd_c.h"
@ -45,6 +47,19 @@
 # define O_CLOEXEC 0
 #endif

+// clang-format on
+
+// This size comes from VCD allowing use of printable ASCII characters between
+// '!' and '~' inclusive, which are a total of 94 different values. Encoding a
+// 32 bit code hence needs a maximum of ceil(log94(2**32-1)) == 5 bytes.
+#define VL_TRACE_MAX_VCD_CODE_SIZE 5  ///< Maximum length of a VCD string code
+// We use 8 bytes per code in a suffix buffer array.
+// 1 byte optional separator + VL_TRACE_MAX_VCD_CODE_SIZE bytes for code
+// + 1 byte '\n' + 1 byte suffix size. This luckily comes out to a power of 2,
+// meaning the array can be aligned such that entries never straddle multiple
+// cache-lines.
+#define VL_TRACE_SUFFIX_ENTRY_SIZE 8  ///< Size of a suffix entry
+
 //=============================================================================
 // VerilatedVcdImp
 /// Base class to hold some static state
@ -57,7 +72,10 @@ private:
        VerilatedMutex s_vcdMutex;  ///< Protect the singleton
        VcdVec s_vcdVecp VL_GUARDED_BY(s_vcdMutex);  ///< List of all created traces
    };
-    static Singleton& singleton() { static Singleton s; return s; }
+    static Singleton& singleton() {
+        static Singleton s;
+        return s;
+    }

 public:
    static void pushVcd(VerilatedVcd* vcdp) VL_EXCLUDES(singleton().s_vcdMutex) {
@ -152,6 +170,7 @@ VerilatedVcd::VerilatedVcd(VerilatedVcdFile* filep)
    m_wrFlushp = m_wrBufp + m_wrChunkSize * 6;
    m_writep = m_wrBufp;
    m_wroteBytes = 0;
+    m_suffixesp = NULL;
 }

 void VerilatedVcd::open(const char* filename) {
@ -175,6 +194,9 @@ void VerilatedVcd::open(const char* filename) {
    // Allocate space now we know the number of codes
    if (!m_sigs_oldvalp) m_sigs_oldvalp = new vluint32_t[m_nextCode + 10];

+    // Get the direct access pointer to the code strings
+    m_suffixesp = &m_suffixes[0];  // Note: C++11 m_suffixes.data();
+
    if (m_rolloverMB) {
        openNext(true);
        if (!isOpen()) return;
@ -191,10 +213,8 @@ void VerilatedVcd::openNext(bool incFilename) {
        std::string name = m_filename;
        size_t pos = name.rfind('.');
        if (pos > 8 && 0 == strncmp("_cat", name.c_str() + pos - 8, 4)
-            && isdigit(name.c_str()[pos - 4])
-            && isdigit(name.c_str()[pos - 3])
-            && isdigit(name.c_str()[pos - 2])
-            && isdigit(name.c_str()[pos - 1])) {
+            && isdigit(name.c_str()[pos - 4]) && isdigit(name.c_str()[pos - 3])
+            && isdigit(name.c_str()[pos - 2]) && isdigit(name.c_str()[pos - 1])) {
            // Increment code.
            if ((++(name[pos - 1])) > '9') {
                name[pos - 1] = '0';
@ -404,7 +424,8 @@ void VerilatedVcd::set_time_resolution(const char* unitp) {
 double VerilatedVcd::timescaleToDouble(const char* unitp) {
    char* endp;
    double value = strtod(unitp, &endp);
-    if (value == 0.0 && endp == unitp) value = 1;  // On error so we allow just "ns" to return 1e-9.
+    // On error so we allow just "ns" to return 1e-9.
+    if (value == 0.0 && endp == unitp) value = 1;
    unitp = endp;
    for (; *unitp && isspace(*unitp); unitp++) {}
    switch (*unitp) {
@ -421,27 +442,41 @@ double VerilatedVcd::timescaleToDouble(const char* unitp) {

 std::string VerilatedVcd::doubleToTimescale(double value) {
    const char* suffixp = "s";
-    if      (value>=1e0)   { suffixp="s"; value *= 1e0; }
-    else if (value>=1e-3 ) { suffixp="ms"; value *= 1e3; }
-    else if (value>=1e-6 ) { suffixp="us"; value *= 1e6; }
-    else if (value>=1e-9 ) { suffixp="ns"; value *= 1e9; }
-    else if (value>=1e-12) { suffixp="ps"; value *= 1e12; }
-    else if (value>=1e-15) { suffixp="fs"; value *= 1e15; }
-    else if (value>=1e-18) { suffixp="as"; value *= 1e18; }
+    // clang-format off
+    if      (value >= 1e0)   { suffixp = "s"; value *= 1e0; }
+    else if (value >= 1e-3 ) { suffixp = "ms"; value *= 1e3; }
+    else if (value >= 1e-6 ) { suffixp = "us"; value *= 1e6; }
+    else if (value >= 1e-9 ) { suffixp = "ns"; value *= 1e9; }
+    else if (value >= 1e-12) { suffixp = "ps"; value *= 1e12; }
+    else if (value >= 1e-15) { suffixp = "fs"; value *= 1e15; }
+    else if (value >= 1e-18) { suffixp = "as"; value *= 1e18; }
+    // clang-format on
    char valuestr[100];
    sprintf(valuestr, "%3.0f%s", value, suffixp);
    return valuestr;  // Gets converted to string, so no ref to stack
 }

+//=============================================================================
+// VCD string code
+
+char* VerilatedVcd::writeCode(char* writep, vluint32_t code) {  // Write the string code for 'code' at writep; returns pointer past the last character written (no '\0' is appended)
+    *writep++ = static_cast<char>('!' + code % 94);  // First character is always emitted: '!' plus the remainder modulo 94
+    code /= 94;
+    while (code) {  // Further characters only while a quotient remains
+        code--;  // Decrement before encoding each subsequent character
+        *writep++ = static_cast<char>('!' + code % 94);
+        code /= 94;
+    }
+    return writep;  // Caller terminates the string if needed
+}
+
 //=============================================================================
 // Definitions

 void VerilatedVcd::printIndent(int level_change) {
    if (level_change < 0) m_modDepth += level_change;
    assert(m_modDepth >= 0);
-    for (int i = 0; i < m_modDepth; i++) {
-        printStr(" ");
-    }
+    for (int i = 0; i < m_modDepth; i++) printStr(" ");
    if (level_change > 0) m_modDepth += level_change;
 }

@ -539,8 +574,8 @@ void VerilatedVcd::module(const std::string& name) {
    m_modName = name;
 }

-void VerilatedVcd::declare(vluint32_t code, const char* name, const char* wirep,
-                           bool array, int arraynum, bool tri, bool bussed, int msb, int lsb) {
+void VerilatedVcd::declare(vluint32_t code, const char* name, const char* wirep, bool array,
+                           int arraynum, bool tri, bool bussed, int msb, int lsb) {
    if (!code) {
        VL_FATAL_MT(__FILE__, __LINE__, "", "Internal: internal trace problem, code 0 is illegal");
    }
@ -554,6 +589,9 @@ void VerilatedVcd::declare(vluint32_t code, const char* name, const char* wirep,
    if (m_sigs.capacity() <= m_nextCode) {
        m_sigs.reserve(m_nextCode * 2);  // Power-of-2 allocation speeds things up
    }
+    if (m_suffixes.size() <= m_nextCode * VL_TRACE_SUFFIX_ENTRY_SIZE) {
+        m_suffixes.resize(m_nextCode * VL_TRACE_SUFFIX_ENTRY_SIZE * 2, 0);
+    }

    // Make sure write buffer is large enough (one character per bit), plus header
    bufferResize(bits + 1024);
@ -601,7 +639,21 @@ void VerilatedVcd::declare(vluint32_t code, const char* name, const char* wirep,
        sprintf(buf, "<%u", code);
        decl += buf;
    } else {
-        decl += stringCode(code);
+        // Add string code to decl
+        char* const endp = writeCode(buf, code);
+        *endp = '\0';
+        decl += buf;
+        // Build suffix array entry
+        char* const entryp = &m_suffixes[code * VL_TRACE_SUFFIX_ENTRY_SIZE];
+        const size_t length = endp - buf;
+        assert(length <= VL_TRACE_MAX_VCD_CODE_SIZE);
+        // 1 bit values don't have a ' ' separator between value and string code
+        const bool isBit = bits == 1;
+        entryp[0] = ' ';  // Separator
+        std::strcpy(entryp + !isBit, buf);  // Code (overwrite separator if isBit)
+        entryp[length + !isBit] = '\n';  // Replace '\0' with line termination '\n'
+        // Set length of suffix (used to increment write pointer)
+        entryp[VL_TRACE_SUFFIX_ENTRY_SIZE - 1] = !isBit + length + 1;
    }
    decl += " ";
    decl += basename;
@ -633,6 +685,13 @@ void VerilatedVcd::declArray(vluint32_t code, const char* name, bool array, int
                             int lsb) {
    declare(code, name, "wire", array, arraynum, false, true, msb, lsb);
 }
+void VerilatedVcd::declFloat(vluint32_t code, const char* name, bool array, int arraynum) {
+    declare(code, name, "real", array, arraynum, false, false, 31, 0);
+}
+void VerilatedVcd::declDouble(vluint32_t code, const char* name, bool array, int arraynum) {
+    declare(code, name, "real", array, arraynum, false, false, 63, 0);
+}
+#ifndef VL_TRACE_VCD_OLD_API
 void VerilatedVcd::declTriBit(vluint32_t code, const char* name, bool array, int arraynum) {
    declare(code, name, "wire", array, arraynum, true, false, 0, 0);
 }
@ -648,20 +707,234 @@ void VerilatedVcd::declTriArray(vluint32_t code, const char* name, bool array, i
                                int msb, int lsb) {
    declare(code, name, "wire", array, arraynum, true, true, msb, lsb);
 }
-void VerilatedVcd::declFloat(vluint32_t code, const char* name, bool array, int arraynum) {
-    declare(code, name, "real", array, arraynum, false, false, 31, 0);
-}
-void VerilatedVcd::declDouble(vluint32_t code, const char* name, bool array, int arraynum) {
-    declare(code, name, "real", array, arraynum, false, false, 63, 0);
-}
+#endif  //  VL_TRACE_VCD_OLD_API

 //=============================================================================
+// Trace recording routines
+
+#ifndef VL_TRACE_VCD_OLD_API
+
+//=============================================================================
+// Pointer based variants used by Verilator
+
+// Emit suffix, write back write pointer, check buffer
+void VerilatedVcd::finishLine(vluint32_t* oldp, char* writep) {
+    const vluint32_t code = oldp - m_sigs_oldvalp;  // The code is the index of oldp within the previous value buffer
+    const char* const suffixp = m_suffixesp + code * VL_TRACE_SUFFIX_ENTRY_SIZE;  // Start of this code's fixed-size suffix entry
+    // Copy the whole suffix (this avoids having hard to predict branches, which
+    // helps a lot). Note suffixp could be aligned, so we could load it in one go,
+    // but then we would be endianness dependent, which we don't have a way to
+    // test right now, and it would probably make little difference...
+    // Note: The maximum length of the suffix is
+    // VL_TRACE_MAX_VCD_CODE_SIZE + 2 == 7, but we unroll this here for speed.
+    writep[0] = suffixp[0];
+    writep[1] = suffixp[1];
+    writep[2] = suffixp[2];
+    writep[3] = suffixp[3];
+    writep[4] = suffixp[4];
+    writep[5] = suffixp[5];
+    writep[6] = '\n';  // The 6th index is always '\n' if it's relevant, no need to fetch it.
+    // Now write back the write pointer incremented by the actual size of the
+    // suffix, which was stored in the last byte of the suffix buffer entry.
+    m_writep = writep + suffixp[VL_TRACE_SUFFIX_ENTRY_SIZE - 1];  // Bytes copied beyond the stored length are not counted and get overwritten by later output
+    bufferCheck();
+}
+
+void VerilatedVcd::fullBit(vluint32_t* oldp, vluint32_t newval) {  // Unconditionally record and emit a single-bit value
+    *oldp = newval;  // Write the new value back into the previous value buffer
+    char* wp = m_writep;  // Work on a local copy of the write pointer; finishLine() stores it back
+    *wp++ = '0' | static_cast<char>(newval);  // Yields '0' or '1'; assumes newval is exactly 0 or 1 -- TODO confirm
+    finishLine(oldp, wp);  // Append the code suffix for this signal and update m_writep
+}
+
+// We do want these functions specialized for sizes to avoid hard to predict
+// branches, but we don't want them inlined, so we explicitly create one
+// specialization for each size used here.
+
+// Record and emit a value of up to 32 bits as 'b' followed by T_Bits binary digits, most significant first. T_Bits is the number of used bits in the value
+template <int T_Bits> void VerilatedVcd::fullBus(vluint32_t* oldp, vluint32_t newval) {
+    *oldp = newval;  // Write the new value back into the previous value buffer
+    char* wp = m_writep;  // Local write pointer; finishLine() stores it back
+    *wp++ = 'b';
+    newval <<= 32 - T_Bits;  // Left-align so the most significant used bit sits at bit 31
+    int bits = T_Bits;  // Number of digits still to emit
+    do {
+        *wp++ = '0' | static_cast<char>(newval >> 31);  // Emit the current top bit as '0' or '1'
+        newval <<= 1;  // Bring the next bit to the top
+    } while (--bits);  // do/while always emits at least one digit, so T_Bits >= 1 is required
+    finishLine(oldp, wp);  // Append the code suffix for this signal and update m_writep
+}
+// Note: No specialization for width 1, covered by 'fullBit'
+template void VerilatedVcd::fullBus<2>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<3>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<4>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<5>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<6>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<7>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<8>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<9>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<10>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<11>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<12>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<13>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<14>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<15>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<16>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<17>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<18>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<19>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<20>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<21>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<22>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<23>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<24>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<25>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<26>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<27>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<28>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<29>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<30>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<31>(vluint32_t* oldp, vluint32_t newval);
+template void VerilatedVcd::fullBus<32>(vluint32_t* oldp, vluint32_t newval);
+
+// bits is the number of used bits in the value; the code below handles bits - 32 in 1..32, so 33 <= bits <= 64 is assumed (TODO confirm against callers)
+void VerilatedVcd::fullQuad(vluint32_t* oldp, vluint64_t newval, int bits) {
+    *reinterpret_cast<vluint64_t*>(oldp) = newval;  // Write the 64 bit value back into the previous value buffer (occupies two 32 bit entries)
+    char* wp = m_writep;  // Local write pointer; finishLine() stores it back
+    *wp++ = 'b';
+    newval <<= 64 - bits;  // Left-align so the most significant used bit sits at bit 63
+    // Handle the used bits above the low 32: jump into the unrolled, fall-through switch so that exactly bitsInTopHalf digits are emitted
+    const int bitsInTopHalf = bits - 32;
+    wp += bitsInTopHalf;  // Advance first; the cases below fill the digits via negative offsets
+    // clang-format off
+    switch (bitsInTopHalf) {
+    case 32: wp[-32] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 31: wp[-31] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 30: wp[-30] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 29: wp[-29] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 28: wp[-28] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 27: wp[-27] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 26: wp[-26] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 25: wp[-25] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 24: wp[-24] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 23: wp[-23] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 22: wp[-22] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 21: wp[-21] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 20: wp[-20] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 19: wp[-19] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 18: wp[-18] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 17: wp[-17] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 16: wp[-16] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 15: wp[-15] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 14: wp[-14] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 13: wp[-13] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 12: wp[-12] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 11: wp[-11] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 10: wp[-10] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 9:  wp[ -9] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 8:  wp[ -8] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 7:  wp[ -7] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 6:  wp[ -6] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 5:  wp[ -5] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 4:  wp[ -4] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 3:  wp[ -3] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 2:  wp[ -2] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 1:  wp[ -1] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    }
+    // clang-format on
+    // Handle the bottom 32 bits within the 64 bit input: always exactly 32 digits, taken from the top of the shifted value
+    int remaining = 32;
+    do {
+        *wp++ = '0' | static_cast<char>(newval >> 63);
+        newval <<= 1;
+    } while (--remaining);
+    finishLine(oldp, wp);  // Append the code suffix for this signal and update m_writep
+}
+
+// Emit a wide value of 'bits' used bits, held in 32 bit words at newvalp with word 0 being the
+// least significant, after copying all of its words to the previous value buffer at oldp.
+void VerilatedVcd::fullArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits) {
+    int words = (bits + 31) / 32;
+    for (int i = 0; i < words; ++i) oldp[i] = newvalp[i];
+    char* wp = m_writep;
+    *wp++ = 'b';
+    // Handle the most significant word, which may be only partially used
+    const int bitsInMSW = bits % 32 == 0 ? 32 : bits % 32;
+    // Left-align, so the next bit to emit is always bit 31
+    vluint32_t val = newvalp[--words] << (32 - bitsInMSW);
+    // Cases below store at negative offsets from the end of the most significant word
+    wp += bitsInMSW;
+    // clang-format off
+    switch (bitsInMSW) {
+    case 32: wp[-32] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 31: wp[-31] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 30: wp[-30] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 29: wp[-29] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 28: wp[-28] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 27: wp[-27] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 26: wp[-26] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 25: wp[-25] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 24: wp[-24] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 23: wp[-23] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 22: wp[-22] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 21: wp[-21] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 20: wp[-20] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 19: wp[-19] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 18: wp[-18] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 17: wp[-17] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 16: wp[-16] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 15: wp[-15] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 14: wp[-14] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 13: wp[-13] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 12: wp[-12] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 11: wp[-11] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 10: wp[-10] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 9:  wp[ -9] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 8:  wp[ -8] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 7:  wp[ -7] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 6:  wp[ -6] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 5:  wp[ -5] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 4:  wp[ -4] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 3:  wp[ -3] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 2:  wp[ -2] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 1:  wp[ -1] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    }
+    // clang-format on
+    // Handle the remaining words, most significant first, each emitted in full.
+    // Note 'val' is reused and the counter is not named 'bits', so nothing here
+    // shadows the outer local or the function parameter.
+    while (words > 0) {
+        val = newvalp[--words];
+        int remaining = 32;
+        do {
+            *wp++ = '0' | static_cast<char>(val >> 31);
+            val <<= 1;
+        } while (--remaining);
+    }
+    finishLine(oldp, wp);
+}
+
+// Record newval in the previous value buffer at oldp, then emit it formatted as 'r'
+// followed by the value printed with %.16g.
+void VerilatedVcd::fullFloat(vluint32_t* oldp, float newval) {
+    // cppcheck-suppress invalidPointerCast
+    *reinterpret_cast<float*>(oldp) = newval;
+    char* wp = m_writep;
+    // Buffer can't overflow before sprintf; we sized during declaration
+    // sprintf returns the number of characters written (excluding the terminating
+    // null), so there is no need to re-scan the output with strlen.
+    const int len = sprintf(wp, "r%.16g", static_cast<double>(newval));
+    if (len > 0) wp += len;  // A negative return signals an error; emit nothing then
+    finishLine(oldp, wp);
+}
+
+// Record newval in the previous value buffer at oldp, then emit it formatted as 'r'
+// followed by the value printed with %.16g.
+void VerilatedVcd::fullDouble(vluint32_t* oldp, double newval) {
+    // cppcheck-suppress invalidPointerCast
+    *reinterpret_cast<double*>(oldp) = newval;
+    char* wp = m_writep;
+    // Buffer can't overflow before sprintf; we sized during declaration
+    // sprintf returns the number of characters written (excluding the terminating
+    // null), so there is no need to re-scan the output with strlen.
+    const int len = sprintf(wp, "r%.16g", newval);
+    if (len > 0) wp += len;  // A negative return signals an error; emit nothing then
+    finishLine(oldp, wp);
+}
+
+#else  // VL_TRACE_VCD_OLD_API

 void VerilatedVcd::fullBit(vluint32_t code, const vluint32_t newval) {
    // Note the &1, so we don't require clean input -- makes more common no change case faster
    m_sigs_oldvalp[code] = newval;
    *m_writep++ = ('0' + static_cast<char>(newval & 1));
-    printCode(code);
+    m_writep = writeCode(m_writep, code);
    *m_writep++ = '\n';
    bufferCheck();
 }
@ -672,7 +945,7 @@ void VerilatedVcd::fullBus(vluint32_t code, const vluint32_t newval, int bits) {
        *m_writep++ = ((newval & (1L << bit)) ? '1' : '0');
    }
    *m_writep++ = ' ';
-    printCode(code);
+    m_writep = writeCode(m_writep, code);
    *m_writep++ = '\n';
    bufferCheck();
 }
@ -683,7 +956,7 @@ void VerilatedVcd::fullQuad(vluint32_t code, const vluint64_t newval, int bits)
        *m_writep++ = ((newval & (VL_ULL(1) << bit)) ? '1' : '0');
    }
    *m_writep++ = ' ';
-    printCode(code);
+    m_writep = writeCode(m_writep, code);
    *m_writep++ = '\n';
    bufferCheck();
 }
@ -696,7 +969,7 @@ void VerilatedVcd::fullArray(vluint32_t code, const vluint32_t* newval, int bits
        *m_writep++ = ((newval[(bit / 32)] & (1L << (bit & 0x1f))) ? '1' : '0');
    }
    *m_writep++ = ' ';
-    printCode(code);
+    m_writep = writeCode(m_writep, code);
    *m_writep++ = '\n';
    bufferCheck();
 }
@ -709,7 +982,7 @@ void VerilatedVcd::fullArray(vluint32_t code, const vluint64_t* newval, int bits
        *m_writep++ = ((newval[(bit / 64)] & (VL_ULL(1) << (bit & 0x3f))) ? '1' : '0');
    }
    *m_writep++ = ' ';
-    printCode(code);
+    m_writep = writeCode(m_writep, code);
    *m_writep++ = '\n';
    bufferCheck();
 }
@ -717,11 +990,12 @@ void VerilatedVcd::fullTriBit(vluint32_t code, const vluint32_t newval, const vl
    m_sigs_oldvalp[code] = newval;
    m_sigs_oldvalp[code + 1] = newtri;
    *m_writep++ = "01zz"[m_sigs_oldvalp[code] | (m_sigs_oldvalp[code + 1] << 1)];
-    printCode(code);
+    m_writep = writeCode(m_writep, code);
    *m_writep++ = '\n';
    bufferCheck();
 }
-void VerilatedVcd::fullTriBus(vluint32_t code, const vluint32_t newval, const vluint32_t newtri, int bits) {
+void VerilatedVcd::fullTriBus(vluint32_t code, const vluint32_t newval, const vluint32_t newtri,
+                              int bits) {
    m_sigs_oldvalp[code] = newval;
    m_sigs_oldvalp[code + 1] = newtri;
    *m_writep++ = 'b';
@ -729,25 +1003,26 @@ void VerilatedVcd::fullTriBus(vluint32_t code, const vluint32_t newval, const vl
        *m_writep++ = "01zz"[((newval >> bit) & 1) | (((newtri >> bit) & 1) << 1)];
    }
    *m_writep++ = ' ';
-    printCode(code);
+    m_writep = writeCode(m_writep, code);
    *m_writep++ = '\n';
    bufferCheck();
 }
-void VerilatedVcd::fullTriQuad(vluint32_t code, const vluint64_t newval, const vluint32_t newtri, int bits) {
+void VerilatedVcd::fullTriQuad(vluint32_t code, const vluint64_t newval, const vluint32_t newtri,
+                               int bits) {
    (*(reinterpret_cast<vluint64_t*>(&m_sigs_oldvalp[code]))) = newval;
    (*(reinterpret_cast<vluint64_t*>(&m_sigs_oldvalp[code + 1]))) = newtri;
    *m_writep++ = 'b';
    for (int bit = bits - 1; bit >= 0; --bit) {
-        *m_writep++ = "01zz"[((newval >> bit) & VL_ULL(1))
-                             | (((newtri >> bit) & VL_ULL(1)) << VL_ULL(1))];
+        *m_writep++
+            = "01zz"[((newval >> bit) & VL_ULL(1)) | (((newtri >> bit) & VL_ULL(1)) << VL_ULL(1))];
    }
    *m_writep++ = ' ';
-    printCode(code);
+    m_writep = writeCode(m_writep, code);
    *m_writep++ = '\n';
    bufferCheck();
 }
-void VerilatedVcd::fullTriArray(vluint32_t code, const vluint32_t* newvalp, const vluint32_t* newtrip,
-                  int bits) {
+void VerilatedVcd::fullTriArray(vluint32_t code, const vluint32_t* newvalp,
+                                const vluint32_t* newtrip, int bits) {
    for (int word = 0; word < (((bits - 1) / 32) + 1); ++word) {
        m_sigs_oldvalp[code + word * 2] = newvalp[word];
        m_sigs_oldvalp[code + word * 2 + 1] = newtrip[word];
@ -759,7 +1034,7 @@ void VerilatedVcd::fullTriArray(vluint32_t code, const vluint32_t* newvalp, cons
        *m_writep++ = "01zz"[valbit | (tribit << 1)];
    }
    *m_writep++ = ' ';
-    printCode(code);
+    m_writep = writeCode(m_writep, code);
    *m_writep++ = '\n';
    bufferCheck();
 }
@ -770,7 +1045,7 @@ void VerilatedVcd::fullDouble(vluint32_t code, const double newval) {
    sprintf(m_writep, "r%.16g", newval);
    m_writep += strlen(m_writep);
    *m_writep++ = ' ';
-    printCode(code);
+    m_writep = writeCode(m_writep, code);
    *m_writep++ = '\n';
    bufferCheck();
 }
@ -781,29 +1056,29 @@ void VerilatedVcd::fullFloat(vluint32_t code, const float newval) {
    sprintf(m_writep, "r%.16g", static_cast<double>(newval));
    m_writep += strlen(m_writep);
    *m_writep++ = ' ';
-    printCode(code);
+    m_writep = writeCode(m_writep, code);
    *m_writep++ = '\n';
    bufferCheck();
 }
 void VerilatedVcd::fullBitX(vluint32_t code) {
    *m_writep++ = 'x';
-    printCode(code);
+    m_writep = writeCode(m_writep, code);
    *m_writep++ = '\n';
    bufferCheck();
 }
 void VerilatedVcd::fullBusX(vluint32_t code, int bits) {
    *m_writep++ = 'b';
-    for (int bit = bits - 1; bit >= 0; --bit) {
-        *m_writep++ = 'x';
-    }
+    for (int bit = bits - 1; bit >= 0; --bit) *m_writep++ = 'x';
    *m_writep++ = ' ';
-    printCode(code);
+    m_writep = writeCode(m_writep, code);
    *m_writep++ = '\n';
    bufferCheck();
 }
 void VerilatedVcd::fullQuadX(vluint32_t code, int bits) { fullBusX(code, bits); }
 void VerilatedVcd::fullArrayX(vluint32_t code, int bits) { fullBusX(code, bits); }

+#endif  // VL_TRACE_VCD_OLD_API
+
 //=============================================================================
 // Callbacks

@ -867,6 +1142,8 @@ void VerilatedVcd::flush_all() VL_MT_UNSAFE_ONE { VerilatedVcdSingleton::flush_a
 //======================================================================
 //======================================================================

+// clang-format off
+
 #ifdef VERILATED_VCD_TEST
 #include <iostream>

--- a/include/verilated_vcd_c.h
+++ b/include/verilated_vcd_c.h
@ -97,6 +97,9 @@ private:
    vluint64_t m_wrChunkSize;  ///< Output buffer size
    vluint64_t m_wroteBytes;  ///< Number of bytes written to this file

+    std::vector<char> m_suffixes;  ///< VCD line end string codes + metadata
+    const char* m_suffixesp;  ///< Pointer to first element of above
+
    vluint32_t* m_sigs_oldvalp;  ///< Pointer to old signal values
    typedef std::vector<VerilatedVcdSig> SigVec;
    SigVec m_sigs;  ///< Pointer to signal information
@ -131,26 +134,9 @@ private:
    void dumpFull(vluint64_t timeui);
    // cppcheck-suppress functionConst
    void dumpDone();
-    inline void printCode(vluint32_t code) {
-        *m_writep++ = static_cast<char>('!' + code % 94);
-        code /= 94;
-        while (code) {
-            code--;
-            *m_writep++ = static_cast<char>('!' + code % 94);
-            code /= 94;
-        }
-    }
-    static std::string stringCode(vluint32_t code) VL_PURE {
-        std::string out;
-        out += static_cast<char>('!' + code % 94);
-        code /= 94;
-        while (code) {
-            code--;
-            out += static_cast<char>('!' + code % 94);
-            code /= 94;
-        }
-        return out;
-    }
+    char* writeCode(char* writep, vluint32_t code);
+
+    void finishLine(vluint32_t* oldp, char* writep);

    // CONSTRUCTORS
    VL_UNCOPYABLE(VerilatedVcd);
@ -206,27 +192,116 @@ public:
    void declBus(vluint32_t code, const char* name, bool array, int arraynum, int msb, int lsb);
    void declQuad(vluint32_t code, const char* name, bool array, int arraynum, int msb, int lsb);
    void declArray(vluint32_t code, const char* name, bool array, int arraynum, int msb, int lsb);
+    void declFloat(vluint32_t code, const char* name, bool array, int arraynum);
+    void declDouble(vluint32_t code, const char* name, bool array, int arraynum);
+    // Tri-state declarations belong to the old API only: the matching fullTri*
+    // dumping routines are declared solely under VL_TRACE_VCD_OLD_API.
+#ifdef VL_TRACE_VCD_OLD_API
    void declTriBit(vluint32_t code, const char* name, bool array, int arraynum);
    void declTriBus(vluint32_t code, const char* name, bool array, int arraynum, int msb, int lsb);
    void declTriQuad(vluint32_t code, const char* name, bool array, int arraynum, int msb,
                     int lsb);
    void declTriArray(vluint32_t code, const char* name, bool array, int arraynum, int msb,
                      int lsb);
-    void declDouble(vluint32_t code, const char* name, bool array, int arraynum);
-    void declFloat(vluint32_t code, const char* name, bool array, int arraynum);
+#endif  // VL_TRACE_VCD_OLD_API
    //  ... other module_start for submodules (based on cell name)

+    //=========================================================================
+    // Inside dumping routines used by Verilator
+
+    vluint32_t* oldp(vluint32_t code) { return m_sigs_oldvalp + code; }  // Old value slot of code
+
+#ifndef VL_TRACE_VCD_OLD_API
+
+    //=========================================================================
+    // Write back to previous value buffer value and emit
+
+    void fullBit(vluint32_t* oldp, vluint32_t newval);
+    template <int T_Bits> void fullBus(vluint32_t* oldp, vluint32_t newval);
+    void fullQuad(vluint32_t* oldp, vluint64_t newval, int bits);
+    void fullArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits);
+    void fullFloat(vluint32_t* oldp, float newval);
+    void fullDouble(vluint32_t* oldp, double newval);
+
+    //=========================================================================
+    // Check previous value and emit if changed
+
+    /// Emit a single bit signal, but only when newval differs from the value stored at oldp
+    inline void chgBit(vluint32_t* oldp, vluint32_t newval) {
+        if (VL_UNLIKELY(*oldp != newval)) fullBit(oldp, newval);
+    }
+    /// Emit a T_Bits wide bus, but only when newval differs from the value stored at oldp
+    template <int T_Bits> inline void chgBus(vluint32_t* oldp, vluint32_t newval) {
+        if (VL_UNLIKELY(*oldp != newval)) fullBus<T_Bits>(oldp, newval);
+    }
+    /// Emit a quad, but only when newval differs from the 64 bit value stored at oldp
+    inline void chgQuad(vluint32_t* oldp, vluint64_t newval, int bits) {
+        const vluint64_t oldval = *reinterpret_cast<vluint64_t*>(oldp);
+        if (VL_UNLIKELY(oldval != newval)) fullQuad(oldp, newval, bits);
+    }
+    /// Emit a wide value, but only when any of its 32 bit words differs from those at oldp
+    inline void chgArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits) {
+        const int words = (bits + 31) / 32;
+        for (int word = 0; word < words; ++word) {
+            if (VL_UNLIKELY(oldp[word] != newvalp[word])) {
+                // First differing word found: dump the whole value once and stop looking
+                fullArray(oldp, newvalp, bits);
+                break;
+            }
+        }
+    }
+    /// Emit a float, but only when newval differs from the value stored at oldp
+    inline void chgFloat(vluint32_t* oldp, float newval) {
+        // cppcheck-suppress invalidPointerCast
+        const float oldval = *reinterpret_cast<float*>(oldp);
+        // NaN compares unequal to everything, itself included, so a plain '!=' would
+        // re-emit a signal that stays NaN on every dump. Treat NaN -> NaN as unchanged.
+        // The extra test is only evaluated on the unlikely 'differs' path.
+        if (VL_UNLIKELY(oldval != newval) && !(oldval != oldval && newval != newval)) {
+            fullFloat(oldp, newval);
+        }
+    }
+    /// Emit a double, but only when newval differs from the value stored at oldp
+    inline void chgDouble(vluint32_t* oldp, double newval) {
+        // cppcheck-suppress invalidPointerCast
+        const double oldval = *reinterpret_cast<double*>(oldp);
+        // NaN compares unequal to everything, itself included, so a plain '!=' would
+        // re-emit a signal that stays NaN on every dump. Treat NaN -> NaN as unchanged.
+        // The extra test is only evaluated on the unlikely 'differs' path.
+        if (VL_UNLIKELY(oldval != newval) && !(oldval != oldval && newval != newval)) {
+            fullDouble(oldp, newval);
+        }
+    }
+
+#else  // VL_TRACE_VCD_OLD_API
+
+    // Note: These are only for testing for backward compatibility. Verilator
+    // should use the more efficient versions above.
+
+    //=========================================================================
+    // Write back and emit via the code based routines; (oldp - m_sigs_oldvalp) recovers the code
+
+    void fullBit(vluint32_t* oldp, vluint32_t newval) { fullBit(oldp - m_sigs_oldvalp, newval); }
+    template <int T_Bits> void fullBus(vluint32_t* oldp, vluint32_t newval) {
+        fullBus(oldp - m_sigs_oldvalp, newval, T_Bits);
+    }
+    void fullQuad(vluint32_t* oldp, vluint64_t newval, int bits) {
+        fullQuad(oldp - m_sigs_oldvalp, newval, bits);
+    }
+    void fullArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits) {
+        fullArray(oldp - m_sigs_oldvalp, newvalp, bits);
+    }
+    void fullFloat(vluint32_t* oldp, float newval) { fullFloat(oldp - m_sigs_oldvalp, newval); }
+    void fullDouble(vluint32_t* oldp, double newval) { fullDouble(oldp - m_sigs_oldvalp, newval); }
+
+    //=========================================================================
+    // Check and emit via the code based routines; (oldp - m_sigs_oldvalp) recovers the code
+
+    void chgBit(vluint32_t* oldp, vluint32_t newval) { chgBit(oldp - m_sigs_oldvalp, newval); }
+    template <int T_Bits> void chgBus(vluint32_t* oldp, vluint32_t newval) {
+        chgBus(oldp - m_sigs_oldvalp, newval, T_Bits);
+    }
+    void chgQuad(vluint32_t* oldp, vluint64_t newval, int bits) {
+        chgQuad(oldp - m_sigs_oldvalp, newval, bits);
+    }
+    void chgArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits) {
+        chgArray(oldp - m_sigs_oldvalp, newvalp, bits);
+    }
+    void chgFloat(vluint32_t* oldp, float newval) { chgFloat(oldp - m_sigs_oldvalp, newval); }
+    void chgDouble(vluint32_t* oldp, double newval) { chgDouble(oldp - m_sigs_oldvalp, newval); }
+
    /// Inside dumping routines, dump one signal, faster when not inlined
    /// due to code size reduction.
    void fullBit(vluint32_t code, const vluint32_t newval);
    void fullBus(vluint32_t code, const vluint32_t newval, int bits);
    void fullQuad(vluint32_t code, const vluint64_t newval, int bits);
-    void fullArray(vluint32_t code, const vluint32_t* newval, int bits);
-    void fullArray(vluint32_t code, const vluint64_t* newval, int bits);
+    void fullArray(vluint32_t code, const vluint32_t* newvalp, int bits);
+    void fullArray(vluint32_t code, const vluint64_t* newvalp, int bits);
    void fullTriBit(vluint32_t code, const vluint32_t newval, const vluint32_t newtri);
    void fullTriBus(vluint32_t code, const vluint32_t newval, const vluint32_t newtri, int bits);
    void fullTriQuad(vluint32_t code, const vluint64_t newval, const vluint32_t newtri, int bits);
-    void fullTriArray(vluint32_t code, const vluint32_t* newvalp, const vluint32_t* newtrip, int bits);
+    void fullTriArray(vluint32_t code, const vluint32_t* newvalp, const vluint32_t* newtrip,
+                      int bits);
    void fullDouble(vluint32_t code, const double newval);
    void fullFloat(vluint32_t code, const float newval);

@ -243,9 +318,7 @@ public:
    /// We do want to inline these to avoid calls when the value did not change.
    inline void chgBit(vluint32_t code, const vluint32_t newval) {
        vluint32_t diff = m_sigs_oldvalp[code] ^ newval;
-        if (VL_UNLIKELY(diff)) {
-            fullBit(code, newval);
-        }
+        if (VL_UNLIKELY(diff)) fullBit(code, newval);
    }
    inline void chgBus(vluint32_t code, const vluint32_t newval, int bits) {
        vluint32_t diff = m_sigs_oldvalp[code] ^ newval;
@ -263,18 +336,18 @@ public:
            }
        }
    }
-    inline void chgArray(vluint32_t code, const vluint32_t* newval, int bits) {
+    inline void chgArray(vluint32_t code, const vluint32_t* newvalp, int bits) {
        for (int word = 0; word < (((bits - 1) / 32) + 1); ++word) {
-            if (VL_UNLIKELY(m_sigs_oldvalp[code + word] ^ newval[word])) {
-                fullArray(code, newval, bits);
+            if (VL_UNLIKELY(m_sigs_oldvalp[code + word] ^ newvalp[word])) {
+                fullArray(code, newvalp, bits);
                return;
            }
        }
    }
-    inline void chgArray(vluint32_t code, const vluint64_t* newval, int bits) {
+    inline void chgArray(vluint32_t code, const vluint64_t* newvalp, int bits) {
        for (int word = 0; word < (((bits - 1) / 64) + 1); ++word) {
-            if (VL_UNLIKELY(m_sigs_oldvalp[code + word] ^ newval[word])) {
-                fullArray(code, newval, bits);
+            if (VL_UNLIKELY(m_sigs_oldvalp[code + word] ^ newvalp[word])) {
+                fullArray(code, newvalp, bits);
                return;
            }
        }
@ -332,6 +405,8 @@ public:
        }
    }

+#endif  // VL_TRACE_VCD_OLD_API
+
 protected:
    // METHODS
    void evcd(bool flag) { m_evcd = flag; }
--- a/src/V3EmitC.cpp
+++ b/src/V3EmitC.cpp
@ -3100,6 +3100,7 @@ class EmitCTrace : EmitCStmts {
    AstCFunc*   m_funcp;        // Function we're in now
    bool        m_slow;         // Making slow file
    int         m_enumNum;      // Enumeration number (whole netlist)
+    int         m_baseCode;     // Code of first AstTraceInc in this function

    // METHODS
    void newOutCFile(int filenum) {
@ -3380,28 +3381,24 @@ class EmitCTrace : EmitCStmts {
                       ? "full":"chg");
        bool emitWidth = false;
        if (nodep->dtypep()->basicp()->isDouble()) {
-            puts("vcdp->"+full+"Double");
+            puts("vcdp->" + full + "Double");
        } else if (nodep->isWide() || emitTraceIsScBv(nodep) || emitTraceIsScBigUint(nodep)) {
-            puts("vcdp->"+full+"Array");
+            puts("vcdp->" + full + "Array");
            emitWidth = true;
        } else if (nodep->isQuad()) {
-            puts("vcdp->"+full+"Quad");
-            emitWidth = true;
-        } else if (nodep->declp()->bitRange().ranged()
-                   // 1 element smaller to use Bit dump
-                   && nodep->declp()->bitRange().elements() != 1) {
-            puts("vcdp->"+full+"Bus");
+            puts("vcdp->" + full + "Quad");
            emitWidth = true;
+        } else if (nodep->declp()->widthMin() > 1) {
+            puts("vcdp->" + full + "Bus<" + cvtToStr(nodep->declp()->widthMin()) + ">");
        } else {
-            puts("vcdp->"+full+"Bit");
+            puts("vcdp->" + full + "Bit");
        }
-        puts("(c+"+cvtToStr(nodep->declp()->code()
-                            + ((arrayindex<0) ? 0 : (arrayindex*nodep->declp()->widthWords()))));
-        puts(",");
+
+        const uint32_t offset = (arrayindex < 0) ? 0 : (arrayindex * nodep->declp()->widthWords());
+        const uint32_t code = nodep->declp()->code() + offset;
+        puts("(oldp+" + cvtToStr(code - m_baseCode) + ",");
        emitTraceValue(nodep, arrayindex);
-        if (emitWidth) {
-            puts(","+cvtToStr(nodep->declp()->widthMin()));
-        }
+        if (emitWidth) puts("," + cvtToStr(nodep->declp()->widthMin()));
        puts(");\n");
    }
    void emitTraceValue(AstTraceInc* nodep, int arrayindex) {
@ -3460,8 +3457,24 @@ class EmitCTrace : EmitCStmts {

            if (nodep->symProlog()) puts(EmitCBaseVisitor::symTopAssign()+"\n");

-            puts("int c = code;\n");
-            puts("if (false && vcdp && c) {}  // Prevent unused\n");
+            m_baseCode = -1;
+
+            if (nodep->funcType() == AstCFuncType::TRACE_FULL_SUB
+                || nodep->funcType() == AstCFuncType::TRACE_CHANGE_SUB) {
+                const AstTraceInc* const stmtp = VN_CAST_CONST(nodep->stmtsp(), TraceInc);
+                if (!stmtp) {
+                    nodep->stmtsp()->v3fatalSrc("Trace sub function should contain AstTraceInc");
+                }
+                m_baseCode = stmtp->declp()->code();
+                puts("vluint32_t* oldp = vcdp->oldp(code+" + cvtToStr(m_baseCode) + ");\n");
+                puts("if (false && vcdp && oldp) {}  // Prevent unused\n");
+            } else if (nodep->funcType() == AstCFuncType::TRACE_INIT_SUB) {
+                puts("int c = code;\n");
+                puts("if (false && vcdp && c) {}  // Prevent unused\n");
+            } else {
+                puts("if (false && vcdp) {}  // Prevent unused\n");
+            }
+
            if (nodep->funcType() == AstCFuncType::TRACE_INIT) {
                puts("vcdp->module(vlSymsp->name());  // Setup signal names\n");
            } else if (nodep->funcType() == AstCFuncType::TRACE_INIT_SUB) {
--- a/test_regress/t/t_trace_complex_fst.out
+++ b/test_regress/t/t_trace_complex_fst.out
@ -1,5 +1,5 @@
 $date
-	Tue Jan 21 18:08:49 2020
+	Sun Apr 12 20:15:55 2020

 $end
 $version
@ -101,7 +101,6 @@ b00000000000000000000000000000011 A
 #10
 b00000000000000000000000000000101 ?
 b00000000000000000000000000000101 >
-b000000 :
 b111 9
 b00000000000000000000000000000010 8
 b00000000000000000000000000000001 7
@ -144,14 +143,10 @@ b00000000000000000000000000000010 7
 b00000000000000000000000000000100 8
 b110 9
 b111111 :
-b00000000000000000000000000000101 >
-b00000000000000000000000000000101 ?
 #25
 0!
 #30
 1!
-b00000000000000000000000000000101 ?
-b00000000000000000000000000000101 >
 b110110 :
 b101 9
 b00000000000000000000000000000110 8
@ -194,14 +189,10 @@ b00000000000000000000000000000100 7
 b00000000000000000000000000001000 8
 b100 9
 b101101 :
-b00000000000000000000000000000101 >
-b00000000000000000000000000000101 ?
 #45
 0!
 #50
 1!
-b00000000000000000000000000000101 ?
-b00000000000000000000000000000101 >
 b100100 :
 b011 9
 b00000000000000000000000000001010 8
@ -244,5 +235,3 @@ b00000000000000000000000000000110 7
 b00000000000000000000000000001100 8
 b010 9
 b011011 :
-b00000000000000000000000000000101 >
-b00000000000000000000000000000101 ?
--- a/test_regress/t/t_trace_complex_old_api.pl
+++ b/test_regress/t/t_trace_complex_old_api.pl
@ -0,0 +1,38 @@
+#!/usr/bin/perl
+if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2003-2009 by Wilson Snyder. This program is free software; you
+# can redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+
+# Same design as t_trace_complex, but built with the old VCD tracing API enabled
+
+scenarios(simulator => 1);
+
+top_filename("t/t_trace_complex.v");
+
+compile(
+    verilator_flags2 => ['--cc --trace -CFLAGS -DVL_TRACE_VCD_OLD_API'],  # Define selects old API
+    );
+
+execute(
+    check_finished => 1,
+    );
+
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_strp /);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_strp_strp /);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_arrp /);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_arrp_arrp /);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_arrp_strp /);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_arru\(/);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_arru_arru\(/);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_arru_arrp\(/);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_arru_strp\(/);
+
+vcd_identical ("$Self->{obj_dir}/simx.vcd", "t/t_trace_complex.out");  # t_trace_complex's golden
+
+ok(1);
+1;
--- a/test_regress/t/t_trace_complex_params_fst.out
+++ b/test_regress/t/t_trace_complex_params_fst.out
@ -1,5 +1,5 @@
 $date
-	Tue Jan 21 18:15:28 2020
+	Sun Apr 12 20:17:15 2020

 $end
 $version
@ -101,7 +101,6 @@ b00000000000000000000000000000011 A
 #10
 b00000000000000000000000000000101 ?
 b00000000000000000000000000000101 >
-b000000 :
 b111 9
 b00000000000000000000000000000010 8
 b00000000000000000000000000000001 7
@ -144,14 +143,10 @@ b00000000000000000000000000000010 7
 b00000000000000000000000000000100 8
 b110 9
 b111111 :
-b00000000000000000000000000000101 >
-b00000000000000000000000000000101 ?
 #25
 0!
 #30
 1!
-b00000000000000000000000000000101 ?
-b00000000000000000000000000000101 >
 b110110 :
 b101 9
 b00000000000000000000000000000110 8
@ -194,14 +189,10 @@ b00000000000000000000000000000100 7
 b00000000000000000000000000001000 8
 b100 9
 b101101 :
-b00000000000000000000000000000101 >
-b00000000000000000000000000000101 ?
 #45
 0!
 #50
 1!
-b00000000000000000000000000000101 ?
-b00000000000000000000000000000101 >
 b100100 :
 b011 9
 b00000000000000000000000000001010 8
@ -244,5 +235,3 @@ b00000000000000000000000000000110 7
 b00000000000000000000000000001100 8
 b010 9
 b011011 :
-b00000000000000000000000000000101 >
-b00000000000000000000000000000101 ?
--- a/test_regress/t/t_trace_complex_structs_fst.out
+++ b/test_regress/t/t_trace_complex_structs_fst.out
@ -1,5 +1,5 @@
 $date
-	Tue Jan 21 18:55:14 2020
+	Sun Apr 12 20:14:19 2020

 $end
 $version
@ -151,8 +151,6 @@ b00000000000000000000000000000000 L
 #10
 b00000000000000000000000000000101 L
 b00000000000000000000000000000101 K
-b000 G
-b000 F
 b111 E
 b00000000000000000000000000000010 D
 b00000000000000000000000000000001 C
@ -218,18 +216,14 @@ b00000000000000000000000000000010 B
 b00000000000000000000000000000010 C
 b00000000000000000000000000000100 D
 b110 E
-b111 F
 b111 G
-b00000000000000000000000000000101 K
-b00000000000000000000000000000101 L
+b111 F
 #25
 0!
 #30
 1!
-b00000000000000000000000000000101 L
-b00000000000000000000000000000101 K
-b110 G
 b110 F
+b110 G
 b101 E
 b00000000000000000000000000000110 D
 b00000000000000000000000000000011 C
@ -294,18 +288,14 @@ b00000000000000000000000000000100 B
 b00000000000000000000000000000100 C
 b00000000000000000000000000001000 D
 b100 E
-b101 F
 b101 G
-b00000000000000000000000000000101 K
-b00000000000000000000000000000101 L
+b101 F
 #45
 0!
 #50
 1!
-b00000000000000000000000000000101 L
-b00000000000000000000000000000101 K
-b100 G
 b100 F
+b100 G
 b011 E
 b00000000000000000000000000001010 D
 b00000000000000000000000000000101 C
@ -370,7 +360,5 @@ b00000000000000000000000000000110 B
 b00000000000000000000000000000110 C
 b00000000000000000000000000001100 D
 b010 E
-b011 F
 b011 G
-b00000000000000000000000000000101 K
-b00000000000000000000000000000101 L
+b011 F
--- a/test_regress/t/t_trace_fst.out
+++ b/test_regress/t/t_trace_fst.out
--- a/test_regress/t/t_trace_packed_struct_fst.out
+++ b/test_regress/t/t_trace_packed_struct_fst.out
@ -1,5 +1,5 @@
 $date
-	Sun Oct 21 21:56:54 2018
+	Sun Apr 12 20:19:35 2020

 $end
 $version
@ -42,4 +42,3 @@ b00000000000000000000000000000011 "
 0!
 #40
 1!
-b00000000000000000000000000000011 "