Various minor optimizations of VCD trace routines

- Change templated trace routines to branch table. Removed templating from trace chgBus and fullBus and replaced them with a branch table like the other routines. There is a very small (< 1%) penalty for this on SweRV EH1 CoreMark, but this is less than the variability of disk IO, so it's worth it to keep the code simpler and smaller.
- Prefetch VCD suffix buffer at the top of emit*
- Increase instruction-level parallelism (ILP) in VCD emit* routines
- Use a 64-bit unaligned store to emit the VCD suffix (on x86-64 only)

The performance difference with these is very small, but the changes hopefully make this code more performance-portable across various micro-architectures.
2025-01-01 04:07:34 +00:00 · 2020-04-25 19:37:59 +01:00 · 2020-04-25 19:37:59 +01:00 · b79ef672e1
commit b79ef672e1
parent 70549e1a64
7 changed files with 205 additions and 192 deletions
--- a/include/verilated_fst_c.cpp
+++ b/include/verilated_fst_c.cpp
@ -208,21 +208,31 @@ void VerilatedFst::declDouble(vluint32_t code, const char* name, int dtypenum, f
    declSymbol(code, name, dtypenum, vardir, vartype, array, arraynum, 2, 64);
 }

+// Note: emit* are only ever called from one place (full* in
+// verilated_trace_imp.cpp, which is included in this file at the top),
+// so always inline them.
+
+VL_ATTR_ALWINLINE
 void VerilatedFst::emitBit(vluint32_t code, vluint32_t newval) {
    fstWriterEmitValueChange(m_fst, m_symbolp[code], newval ? "1" : "0");
 }
-template <int T_Bits> void VerilatedFst::emitBus(vluint32_t code, vluint32_t newval) {
-    fstWriterEmitValueChange32(m_fst, m_symbolp[code], T_Bits, newval);
+VL_ATTR_ALWINLINE
+void VerilatedFst::emitBus(vluint32_t code, vluint32_t newval, int bits) {
+    fstWriterEmitValueChange32(m_fst, m_symbolp[code], bits, newval);
 }
+VL_ATTR_ALWINLINE
 void VerilatedFst::emitQuad(vluint32_t code, vluint64_t newval, int bits) {
    fstWriterEmitValueChange64(m_fst, m_symbolp[code], bits, newval);
 }
+VL_ATTR_ALWINLINE
 void VerilatedFst::emitArray(vluint32_t code, const vluint32_t* newvalp, int bits) {
    fstWriterEmitValueChangeVec32(m_fst, m_symbolp[code], bits, newvalp);
 }
+VL_ATTR_ALWINLINE
 void VerilatedFst::emitFloat(vluint32_t code, float newval) {
    fstWriterEmitValueChange(m_fst, m_symbolp[code], &newval);
 }
+VL_ATTR_ALWINLINE
 void VerilatedFst::emitDouble(vluint32_t code, double newval) {
    fstWriterEmitValueChange(m_fst, m_symbolp[code], &newval);
 }
--- a/include/verilated_fst_c.h
+++ b/include/verilated_fst_c.h
@ -67,13 +67,14 @@ protected:
    bool preFullDump() VL_OVERRIDE { return isOpen(); }
    bool preChangeDump() VL_OVERRIDE { return isOpen(); }

-    // Implementations of duck-typed methods for VerilatedTrace
-    void emitBit(vluint32_t code, vluint32_t newval);
-    template <int T_Bits> void emitBus(vluint32_t code, vluint32_t newval);
-    void emitQuad(vluint32_t code, vluint64_t newval, int bits);
-    void emitArray(vluint32_t code, const vluint32_t* newvalp, int bits);
-    void emitFloat(vluint32_t code, float newval);
-    void emitDouble(vluint32_t code, double newval);
+    // Implementations of duck-typed methods for VerilatedTrace. These are
+    // called from only one place (namely full*) so always inline them.
+    inline void emitBit(vluint32_t code, vluint32_t newval);
+    inline void emitBus(vluint32_t code, vluint32_t newval, int bits);
+    inline void emitQuad(vluint32_t code, vluint64_t newval, int bits);
+    inline void emitArray(vluint32_t code, const vluint32_t* newvalp, int bits);
+    inline void emitFloat(vluint32_t code, float newval);
+    inline void emitDouble(vluint32_t code, double newval);

 public:
    //=========================================================================
--- a/include/verilated_trace.h
+++ b/include/verilated_trace.h
@ -262,7 +262,7 @@ public:
    // this is very hot code during tracing.

    // duck-typed void emitBit(vluint32_t code, vluint32_t newval) = 0;
-    // duck-typed template <int T_Bits> void emitBus(vluint32_t code, vluint32_t newval) = 0;
+    // duck-typed void emitBus(vluint32_t code, vluint32_t newval, int bits) = 0;
    // duck-typed void emitQuad(vluint32_t code, vluint64_t newval, int bits) = 0;
    // duck-typed void emitArray(vluint32_t code, const vluint32_t* newvalp, int bits) = 0;
    // duck-typed void emitFloat(vluint32_t code, float newval) = 0;
@ -272,7 +272,7 @@ public:

    // Write to previous value buffer value and emit trace entry.
    void fullBit(vluint32_t* oldp, vluint32_t newval);
-    template <int T_Bits> void fullBus(vluint32_t* oldp, vluint32_t newval);
+    void fullBus(vluint32_t* oldp, vluint32_t newval, int bits);
    void fullQuad(vluint32_t* oldp, vluint64_t newval, int bits);
    void fullArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits);
    void fullFloat(vluint32_t* oldp, float newval);
@ -286,8 +286,8 @@ public:
        m_traceBufferWritep += 2;
        VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
    }
-    template <int T_Bits> inline void chgBus(vluint32_t* oldp, vluint32_t newval) {
-        m_traceBufferWritep[0].cmd = VerilatedTraceCommand::CHG_BUS | T_Bits;
+    inline void chgBus(vluint32_t* oldp, vluint32_t newval, int bits) {
+        m_traceBufferWritep[0].cmd = VerilatedTraceCommand::CHG_BUS | bits;
        m_traceBufferWritep[1].oldp = oldp;
        m_traceBufferWritep[2].newBits = newval;
        m_traceBufferWritep += 3;
@ -339,9 +339,9 @@ public:
        const vluint32_t diff = *oldp ^ newval;
        if (VL_UNLIKELY(diff)) fullBit(oldp, newval);
    }
-    template <int T_Bits> inline void CHG(Bus)(vluint32_t* oldp, vluint32_t newval) {
+    inline void CHG(Bus)(vluint32_t* oldp, vluint32_t newval, int bits) {
        const vluint32_t diff = *oldp ^ newval;
-        if (VL_UNLIKELY(diff)) fullBus<T_Bits>(oldp, newval);
+        if (VL_UNLIKELY(diff)) fullBus(oldp, newval, bits);
    }
    inline void CHG(Quad)(vluint32_t* oldp, vluint64_t newval, int bits) {
        const vluint64_t diff = *reinterpret_cast<vluint64_t*>(oldp) ^ newval;
--- a/include/verilated_trace_imp.cpp
+++ b/include/verilated_trace_imp.cpp
@ -161,50 +161,15 @@ template <> void VerilatedTrace<VL_DERIVED_T>::workerThreadMain() {
                continue;
            case VerilatedTraceCommand::CHG_BUS:
                VL_TRACE_THREAD_DEBUG("Command CHG_BUS");
-
-                oldp = (readp++)->oldp;
-                newBits = (readp++)->newBits;
-
                // Bits stored in bottom byte of command
-                switch (cmd & 0xFFU) {
-                case 2: chgBusImpl<2>(oldp, newBits); continue;
-                case 3: chgBusImpl<3>(oldp, newBits); continue;
-                case 4: chgBusImpl<4>(oldp, newBits); continue;
-                case 5: chgBusImpl<5>(oldp, newBits); continue;
-                case 6: chgBusImpl<6>(oldp, newBits); continue;
-                case 7: chgBusImpl<7>(oldp, newBits); continue;
-                case 8: chgBusImpl<8>(oldp, newBits); continue;
-                case 9: chgBusImpl<9>(oldp, newBits); continue;
-                case 10: chgBusImpl<10>(oldp, newBits); continue;
-                case 11: chgBusImpl<11>(oldp, newBits); continue;
-                case 12: chgBusImpl<12>(oldp, newBits); continue;
-                case 13: chgBusImpl<13>(oldp, newBits); continue;
-                case 14: chgBusImpl<14>(oldp, newBits); continue;
-                case 15: chgBusImpl<15>(oldp, newBits); continue;
-                case 16: chgBusImpl<16>(oldp, newBits); continue;
-                case 17: chgBusImpl<17>(oldp, newBits); continue;
-                case 18: chgBusImpl<18>(oldp, newBits); continue;
-                case 19: chgBusImpl<19>(oldp, newBits); continue;
-                case 20: chgBusImpl<20>(oldp, newBits); continue;
-                case 21: chgBusImpl<21>(oldp, newBits); continue;
-                case 22: chgBusImpl<22>(oldp, newBits); continue;
-                case 23: chgBusImpl<23>(oldp, newBits); continue;
-                case 24: chgBusImpl<24>(oldp, newBits); continue;
-                case 25: chgBusImpl<25>(oldp, newBits); continue;
-                case 26: chgBusImpl<26>(oldp, newBits); continue;
-                case 27: chgBusImpl<27>(oldp, newBits); continue;
-                case 28: chgBusImpl<28>(oldp, newBits); continue;
-                case 29: chgBusImpl<29>(oldp, newBits); continue;
-                case 30: chgBusImpl<30>(oldp, newBits); continue;
-                case 31: chgBusImpl<31>(oldp, newBits); continue;
-                case 32: chgBusImpl<32>(oldp, newBits); continue;
-                }
-                VL_FATAL_MT(__FILE__, __LINE__, "", "Bad number of bits in CHG_BUS command");
-                break;
+                chgBusImpl(readp[0].oldp, readp[1].newBits, cmd & 0xFFULL);
+                readp += 2;
+                VL_TRACE_THREAD_DEBUG("Command CHG_BUS DONE");
+                continue;
            case VerilatedTraceCommand::CHG_QUAD:
                VL_TRACE_THREAD_DEBUG("Command CHG_QUAD");
                // Bits stored in bottom byte of command
-                chgQuadImpl(readp[0].oldp, readp[1].newBits, cmd & 0xFF);
+                chgQuadImpl(readp[0].oldp, readp[1].newBits, cmd & 0xFFULL);
                readp += 2;
                continue;
            case VerilatedTraceCommand::CHG_ARRAY:
@ -516,49 +481,12 @@ template <> void VerilatedTrace<VL_DERIVED_T>::fullBit(vluint32_t* oldp, vluint3
    self()->emitBit(oldp - m_sigs_oldvalp, newval);
 }

-// We want these functions specialized for sizes to avoid hard to predict
-// branches, but we don't want them inlined, so we explicitly instantiate the
-// template for each size used by Verilator.
 template <>
-template <int T_Bits>
-void VerilatedTrace<VL_DERIVED_T>::fullBus(vluint32_t* oldp, vluint32_t newval) {
+void VerilatedTrace<VL_DERIVED_T>::fullBus(vluint32_t* oldp, vluint32_t newval, int bits) {
    *oldp = newval;
-    self()->emitBus<T_Bits>(oldp - m_sigs_oldvalp, newval);
+    self()->emitBus(oldp - m_sigs_oldvalp, newval, bits);
 }

-// Note: No specialization for width 1, covered by 'fullBit'
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<2>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<3>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<4>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<5>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<6>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<7>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<8>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<9>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<10>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<11>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<12>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<13>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<14>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<15>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<16>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<17>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<18>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<19>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<20>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<21>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<22>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<23>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<24>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<25>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<26>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<27>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<28>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<29>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<30>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<31>(vluint32_t* oldp, vluint32_t newval);
-template void VerilatedTrace<VL_DERIVED_T>::fullBus<32>(vluint32_t* oldp, vluint32_t newval);
-
 template <>
 void VerilatedTrace<VL_DERIVED_T>::fullQuad(vluint32_t* oldp, vluint64_t newval, int bits) {
    *reinterpret_cast<vluint64_t*>(oldp) = newval;
--- a/include/verilated_vcd_c.cpp
+++ b/include/verilated_vcd_c.cpp
@ -611,15 +611,20 @@ void VerilatedVcd::declTriArray(vluint32_t code, const char* name, bool array, i
 //=============================================================================
 // Emit trace entries

+#define VL_VCD_SUFFIXP(code) (m_suffixesp + (code)*VL_TRACE_SUFFIX_ENTRY_SIZE)
+
 // Emit suffix, write back write pointer, check buffer
 void VerilatedVcd::finishLine(vluint32_t code, char* writep) {
-    const char* const suffixp = m_suffixesp + code * VL_TRACE_SUFFIX_ENTRY_SIZE;
+    const char* const suffixp = VL_VCD_SUFFIXP(code);
    // Copy the whole suffix (this avoid having hard to predict branches which
-    // helps a lot). Note suffixp could be aligned, so could load it in one go,
-    // but then we would be endiannes dependent which we don't have a way to
-    // test right now and probably would make little difference...
-    // Note: The maximum length of the suffix is
+    // helps a lot). Note: The maximum length of the suffix is
    // VL_TRACE_MAX_VCD_CODE_SIZE + 2 == 7, but we unroll this here for speed.
+#ifdef __x86_64__
+    // Copy the whole 8 bytes in one go, this works on little-endian machines
+    // supporting unaligned stores.
+    *reinterpret_cast<vluint64_t*>(writep) = *reinterpret_cast<const vluint64_t*>(suffixp);
+#else
+    // Portable variant
    writep[0] = suffixp[0];
    writep[1] = suffixp[1];
    writep[2] = suffixp[2];
@ -627,139 +632,202 @@ void VerilatedVcd::finishLine(vluint32_t code, char* writep) {
    writep[4] = suffixp[4];
    writep[5] = suffixp[5];
    writep[6] = '\n';  // The 6th index is always '\n' if it's relevant, no need to fetch it.
+#endif
    // Now write back the write pointer incremented by the actual size of the
    // suffix, which was stored in the last byte of the suffix buffer entry.
    m_writep = writep + suffixp[VL_TRACE_SUFFIX_ENTRY_SIZE - 1];
    bufferCheck();
 }

+// Note: emit* are only ever called from one place (full* in
+// verilated_trace_imp.cpp, which is included in this file at the top),
+// so always inline them.
+
+VL_ATTR_ALWINLINE
 void VerilatedVcd::emitBit(vluint32_t code, vluint32_t newval) {
+    // Don't prefetch suffix as it's a bit too late;
    char* wp = m_writep;
    *wp++ = '0' | static_cast<char>(newval);
    finishLine(code, wp);
 }

-template <int T_Bits> void VerilatedVcd::emitBus(vluint32_t code, vluint32_t newval) {
+VL_ATTR_ALWINLINE
+void VerilatedVcd::emitBus(vluint32_t code, vluint32_t newval, int bits) {
+    VL_PREFETCH_RD(VL_VCD_SUFFIXP(code));
    char* wp = m_writep;
    *wp++ = 'b';
-    newval <<= 32 - T_Bits;
-    int bits = T_Bits;
-    do {
-        *wp++ = '0' | static_cast<char>(newval >> 31);
-        newval <<= 1;
-    } while (--bits);
+    wp += bits;
+    // clang-format off
+    switch (bits) {
+    case 32: wp[-32] = '0' | static_cast<char>((newval >> 31)    ); //FALLTHRU
+    case 31: wp[-31] = '0' | static_cast<char>((newval >> 30) & 1); //FALLTHRU
+    case 30: wp[-30] = '0' | static_cast<char>((newval >> 29) & 1); //FALLTHRU
+    case 29: wp[-29] = '0' | static_cast<char>((newval >> 28) & 1); //FALLTHRU
+    case 28: wp[-28] = '0' | static_cast<char>((newval >> 27) & 1); //FALLTHRU
+    case 27: wp[-27] = '0' | static_cast<char>((newval >> 26) & 1); //FALLTHRU
+    case 26: wp[-26] = '0' | static_cast<char>((newval >> 25) & 1); //FALLTHRU
+    case 25: wp[-25] = '0' | static_cast<char>((newval >> 24) & 1); //FALLTHRU
+    case 24: wp[-24] = '0' | static_cast<char>((newval >> 23) & 1); //FALLTHRU
+    case 23: wp[-23] = '0' | static_cast<char>((newval >> 22) & 1); //FALLTHRU
+    case 22: wp[-22] = '0' | static_cast<char>((newval >> 21) & 1); //FALLTHRU
+    case 21: wp[-21] = '0' | static_cast<char>((newval >> 20) & 1); //FALLTHRU
+    case 20: wp[-20] = '0' | static_cast<char>((newval >> 19) & 1); //FALLTHRU
+    case 19: wp[-19] = '0' | static_cast<char>((newval >> 18) & 1); //FALLTHRU
+    case 18: wp[-18] = '0' | static_cast<char>((newval >> 17) & 1); //FALLTHRU
+    case 17: wp[-17] = '0' | static_cast<char>((newval >> 16) & 1); //FALLTHRU
+    case 16: wp[-16] = '0' | static_cast<char>((newval >> 15) & 1); //FALLTHRU
+    case 15: wp[-15] = '0' | static_cast<char>((newval >> 14) & 1); //FALLTHRU
+    case 14: wp[-14] = '0' | static_cast<char>((newval >> 13) & 1); //FALLTHRU
+    case 13: wp[-13] = '0' | static_cast<char>((newval >> 12) & 1); //FALLTHRU
+    case 12: wp[-12] = '0' | static_cast<char>((newval >> 11) & 1); //FALLTHRU
+    case 11: wp[-11] = '0' | static_cast<char>((newval >> 10) & 1); //FALLTHRU
+    case 10: wp[-10] = '0' | static_cast<char>((newval >>  9) & 1); //FALLTHRU
+    case 9:  wp[ -9] = '0' | static_cast<char>((newval >>  8) & 1); //FALLTHRU
+    case 8:  wp[ -8] = '0' | static_cast<char>((newval >>  7) & 1); //FALLTHRU
+    case 7:  wp[ -7] = '0' | static_cast<char>((newval >>  6) & 1); //FALLTHRU
+    case 6:  wp[ -6] = '0' | static_cast<char>((newval >>  5) & 1); //FALLTHRU
+    case 5:  wp[ -5] = '0' | static_cast<char>((newval >>  4) & 1); //FALLTHRU
+    case 4:  wp[ -4] = '0' | static_cast<char>((newval >>  3) & 1); //FALLTHRU
+    case 3:  wp[ -3] = '0' | static_cast<char>((newval >>  2) & 1); //FALLTHRU
+    case 2:  wp[ -2] = '0' | static_cast<char>((newval >>  1) & 1); //FALLTHRU
+    /*bit*/  wp[ -1] = '0' | static_cast<char>((newval      ) & 1); //FALLTHRU
+    }
+    // clang-format on
    finishLine(code, wp);
 }

+VL_ATTR_ALWINLINE
 void VerilatedVcd::emitQuad(vluint32_t code, vluint64_t newval, int bits) {
+    VL_PREFETCH_RD(VL_VCD_SUFFIXP(code));
    char* wp = m_writep;
    *wp++ = 'b';
-    newval <<= 64 - bits;
    // Handle the top 32 bits within the 64 bit input
    const int bitsInTopHalf = bits - 32;
    wp += bitsInTopHalf;
    // clang-format off
    switch (bitsInTopHalf) {
-    case 32: wp[-32] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 31: wp[-31] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 30: wp[-30] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 29: wp[-29] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 28: wp[-28] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 27: wp[-27] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 26: wp[-26] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 25: wp[-25] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 24: wp[-24] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 23: wp[-23] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 22: wp[-22] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 21: wp[-21] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 20: wp[-20] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 19: wp[-19] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 18: wp[-18] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 17: wp[-17] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 16: wp[-16] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 15: wp[-15] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 14: wp[-14] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 13: wp[-13] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 12: wp[-12] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 11: wp[-11] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 10: wp[-10] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 9:  wp[ -9] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 8:  wp[ -8] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 7:  wp[ -7] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 6:  wp[ -6] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 5:  wp[ -5] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 4:  wp[ -4] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 3:  wp[ -3] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 2:  wp[ -2] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
-    case 1:  wp[ -1] = '0' | static_cast<char>(newval >> 63); newval<<=1; //FALLTHRU
+    case 32: wp[-32] = '0' | static_cast<char>((newval >> 63)    ); //FALLTHRU
+    case 31: wp[-31] = '0' | static_cast<char>((newval >> 62) & 1); //FALLTHRU
+    case 30: wp[-30] = '0' | static_cast<char>((newval >> 61) & 1); //FALLTHRU
+    case 29: wp[-29] = '0' | static_cast<char>((newval >> 60) & 1); //FALLTHRU
+    case 28: wp[-28] = '0' | static_cast<char>((newval >> 59) & 1); //FALLTHRU
+    case 27: wp[-27] = '0' | static_cast<char>((newval >> 58) & 1); //FALLTHRU
+    case 26: wp[-26] = '0' | static_cast<char>((newval >> 57) & 1); //FALLTHRU
+    case 25: wp[-25] = '0' | static_cast<char>((newval >> 56) & 1); //FALLTHRU
+    case 24: wp[-24] = '0' | static_cast<char>((newval >> 55) & 1); //FALLTHRU
+    case 23: wp[-23] = '0' | static_cast<char>((newval >> 54) & 1); //FALLTHRU
+    case 22: wp[-22] = '0' | static_cast<char>((newval >> 53) & 1); //FALLTHRU
+    case 21: wp[-21] = '0' | static_cast<char>((newval >> 52) & 1); //FALLTHRU
+    case 20: wp[-20] = '0' | static_cast<char>((newval >> 51) & 1); //FALLTHRU
+    case 19: wp[-19] = '0' | static_cast<char>((newval >> 50) & 1); //FALLTHRU
+    case 18: wp[-18] = '0' | static_cast<char>((newval >> 49) & 1); //FALLTHRU
+    case 17: wp[-17] = '0' | static_cast<char>((newval >> 48) & 1); //FALLTHRU
+    case 16: wp[-16] = '0' | static_cast<char>((newval >> 47) & 1); //FALLTHRU
+    case 15: wp[-15] = '0' | static_cast<char>((newval >> 46) & 1); //FALLTHRU
+    case 14: wp[-14] = '0' | static_cast<char>((newval >> 45) & 1); //FALLTHRU
+    case 13: wp[-13] = '0' | static_cast<char>((newval >> 44) & 1); //FALLTHRU
+    case 12: wp[-12] = '0' | static_cast<char>((newval >> 43) & 1); //FALLTHRU
+    case 11: wp[-11] = '0' | static_cast<char>((newval >> 42) & 1); //FALLTHRU
+    case 10: wp[-10] = '0' | static_cast<char>((newval >> 41) & 1); //FALLTHRU
+    case 9:  wp[ -9] = '0' | static_cast<char>((newval >> 40) & 1); //FALLTHRU
+    case 8:  wp[ -8] = '0' | static_cast<char>((newval >> 39) & 1); //FALLTHRU
+    case 7:  wp[ -7] = '0' | static_cast<char>((newval >> 38) & 1); //FALLTHRU
+    case 6:  wp[ -6] = '0' | static_cast<char>((newval >> 37) & 1); //FALLTHRU
+    case 5:  wp[ -5] = '0' | static_cast<char>((newval >> 36) & 1); //FALLTHRU
+    case 4:  wp[ -4] = '0' | static_cast<char>((newval >> 35) & 1); //FALLTHRU
+    case 3:  wp[ -3] = '0' | static_cast<char>((newval >> 34) & 1); //FALLTHRU
+    case 2:  wp[ -2] = '0' | static_cast<char>((newval >> 33) & 1); //FALLTHRU
+    case 1:  wp[ -1] = '0' | static_cast<char>((newval >> 32) & 1); //FALLTHRU
    }
    // clang-format on
    // Handle the bottom 32 bits within the 64 bit input
-    int remaining = 32;
+    vluint32_t val = static_cast<vluint32_t>(newval);  // Truncate to bottom 32 bits
+    int loops = 4;
    do {
-        *wp++ = '0' | static_cast<char>(newval >> 63);
-        newval <<= 1;
-    } while (--remaining);
+        wp[0] = '0' | static_cast<char>((val >> 31));
+        wp[1] = '0' | static_cast<char>((val >> 30) & 1);
+        wp[2] = '0' | static_cast<char>((val >> 29) & 1);
+        wp[3] = '0' | static_cast<char>((val >> 28) & 1);
+        wp[4] = '0' | static_cast<char>((val >> 27) & 1);
+        wp[5] = '0' | static_cast<char>((val >> 26) & 1);
+        wp[6] = '0' | static_cast<char>((val >> 25) & 1);
+        wp[7] = '0' | static_cast<char>((val >> 24) & 1);
+        wp += 8;
+        val <<= 8;
+    } while (--loops);
+
    finishLine(code, wp);
 }

+VL_ATTR_ALWINLINE
 void VerilatedVcd::emitArray(vluint32_t code, const vluint32_t* newvalp, int bits) {
+    VL_PREFETCH_RD(VL_VCD_SUFFIXP(code));
    int words = (bits + 31) / 32;
    char* wp = m_writep;
    *wp++ = 'b';
    // Handle the most significant word
+    vluint32_t val = newvalp[--words];
    const int bitsInMSW = bits % 32 == 0 ? 32 : bits % 32;
-    vluint32_t val = newvalp[--words] << (32 - bitsInMSW);
    wp += bitsInMSW;
    // clang-format off
    switch (bitsInMSW) {
-    case 32: wp[-32] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 31: wp[-31] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 30: wp[-30] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 29: wp[-29] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 28: wp[-28] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 27: wp[-27] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 26: wp[-26] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 25: wp[-25] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 24: wp[-24] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 23: wp[-23] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 22: wp[-22] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 21: wp[-21] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 20: wp[-20] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 19: wp[-19] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 18: wp[-18] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 17: wp[-17] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 16: wp[-16] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 15: wp[-15] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 14: wp[-14] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 13: wp[-13] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 12: wp[-12] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 11: wp[-11] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 10: wp[-10] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 9:  wp[ -9] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 8:  wp[ -8] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 7:  wp[ -7] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 6:  wp[ -6] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 5:  wp[ -5] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 4:  wp[ -4] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 3:  wp[ -3] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 2:  wp[ -2] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
-    case 1:  wp[ -1] = '0' | static_cast<char>(val >> 31); val<<=1; //FALLTHRU
+    case 32: wp[-32] = '0' | static_cast<char>((val >> 31)    ); //FALLTHRU
+    case 31: wp[-31] = '0' | static_cast<char>((val >> 30) & 1); //FALLTHRU
+    case 30: wp[-30] = '0' | static_cast<char>((val >> 29) & 1); //FALLTHRU
+    case 29: wp[-29] = '0' | static_cast<char>((val >> 28) & 1); //FALLTHRU
+    case 28: wp[-28] = '0' | static_cast<char>((val >> 27) & 1); //FALLTHRU
+    case 27: wp[-27] = '0' | static_cast<char>((val >> 26) & 1); //FALLTHRU
+    case 26: wp[-26] = '0' | static_cast<char>((val >> 25) & 1); //FALLTHRU
+    case 25: wp[-25] = '0' | static_cast<char>((val >> 24) & 1); //FALLTHRU
+    case 24: wp[-24] = '0' | static_cast<char>((val >> 23) & 1); //FALLTHRU
+    case 23: wp[-23] = '0' | static_cast<char>((val >> 22) & 1); //FALLTHRU
+    case 22: wp[-22] = '0' | static_cast<char>((val >> 21) & 1); //FALLTHRU
+    case 21: wp[-21] = '0' | static_cast<char>((val >> 20) & 1); //FALLTHRU
+    case 20: wp[-20] = '0' | static_cast<char>((val >> 19) & 1); //FALLTHRU
+    case 19: wp[-19] = '0' | static_cast<char>((val >> 18) & 1); //FALLTHRU
+    case 18: wp[-18] = '0' | static_cast<char>((val >> 17) & 1); //FALLTHRU
+    case 17: wp[-17] = '0' | static_cast<char>((val >> 16) & 1); //FALLTHRU
+    case 16: wp[-16] = '0' | static_cast<char>((val >> 15) & 1); //FALLTHRU
+    case 15: wp[-15] = '0' | static_cast<char>((val >> 14) & 1); //FALLTHRU
+    case 14: wp[-14] = '0' | static_cast<char>((val >> 13) & 1); //FALLTHRU
+    case 13: wp[-13] = '0' | static_cast<char>((val >> 12) & 1); //FALLTHRU
+    case 12: wp[-12] = '0' | static_cast<char>((val >> 11) & 1); //FALLTHRU
+    case 11: wp[-11] = '0' | static_cast<char>((val >> 10) & 1); //FALLTHRU
+    case 10: wp[-10] = '0' | static_cast<char>((val >>  9) & 1); //FALLTHRU
+    case 9:  wp[ -9] = '0' | static_cast<char>((val >>  8) & 1); //FALLTHRU
+    case 8:  wp[ -8] = '0' | static_cast<char>((val >>  7) & 1); //FALLTHRU
+    case 7:  wp[ -7] = '0' | static_cast<char>((val >>  6) & 1); //FALLTHRU
+    case 6:  wp[ -6] = '0' | static_cast<char>((val >>  5) & 1); //FALLTHRU
+    case 5:  wp[ -5] = '0' | static_cast<char>((val >>  4) & 1); //FALLTHRU
+    case 4:  wp[ -4] = '0' | static_cast<char>((val >>  3) & 1); //FALLTHRU
+    case 3:  wp[ -3] = '0' | static_cast<char>((val >>  2) & 1); //FALLTHRU
+    case 2:  wp[ -2] = '0' | static_cast<char>((val >>  1) & 1); //FALLTHRU
+    case 1:  wp[ -1] = '0' | static_cast<char>((val      ) & 1); //FALLTHRU
    }
    // clang-format on
    // Handle the remaining words
    while (words > 0) {
        vluint32_t val = newvalp[--words];
-        int bits = 32;
+        int loops = 4;
        do {
-            *wp++ = '0' | static_cast<char>(val >> 31);
-            val <<= 1;
-        } while (--bits);
+            wp[0] = '0' | static_cast<char>((val >> 31));
+            wp[1] = '0' | static_cast<char>((val >> 30) & 1);
+            wp[2] = '0' | static_cast<char>((val >> 29) & 1);
+            wp[3] = '0' | static_cast<char>((val >> 28) & 1);
+            wp[4] = '0' | static_cast<char>((val >> 27) & 1);
+            wp[5] = '0' | static_cast<char>((val >> 26) & 1);
+            wp[6] = '0' | static_cast<char>((val >> 25) & 1);
+            wp[7] = '0' | static_cast<char>((val >> 24) & 1);
+            wp += 8;
+            val <<= 8;
+        } while (--loops);
    }
    finishLine(code, wp);
 }

+VL_ATTR_ALWINLINE
 void VerilatedVcd::emitFloat(vluint32_t code, float newval) {
+    VL_PREFETCH_RD(VL_VCD_SUFFIXP(code));
    char* wp = m_writep;
    // Buffer can't overflow before sprintf; we sized during declaration
    sprintf(wp, "r%.16g", static_cast<double>(newval));
@ -767,7 +835,9 @@ void VerilatedVcd::emitFloat(vluint32_t code, float newval) {
    finishLine(code, wp);
 }

+VL_ATTR_ALWINLINE
 void VerilatedVcd::emitDouble(vluint32_t code, double newval) {
+    VL_PREFETCH_RD(VL_VCD_SUFFIXP(code));
    char* wp = m_writep;
    // Buffer can't overflow before sprintf; we sized during declaration
    sprintf(wp, "r%.16g", newval);
@ -775,6 +845,8 @@ void VerilatedVcd::emitDouble(vluint32_t code, double newval) {
    finishLine(code, wp);
 }

+#undef VL_VCD_SUFFIXP
+
 #ifdef VL_TRACE_VCD_OLD_API

 void VerilatedVcd::fullBit(vluint32_t code, const vluint32_t newval) {
--- a/include/verilated_vcd_c.h
+++ b/include/verilated_vcd_c.h
@ -122,13 +122,14 @@ protected:
    bool preFullDump() VL_OVERRIDE { return isOpen(); }
    bool preChangeDump() VL_OVERRIDE;

-    // Implementations of duck-typed methods for VerilatedTrace
-    void emitBit(vluint32_t code, vluint32_t newval);
-    template <int T_Bits> void emitBus(vluint32_t code, vluint32_t newval);
-    void emitQuad(vluint32_t code, vluint64_t newval, int bits);
-    void emitArray(vluint32_t code, const vluint32_t* newvalp, int bits);
-    void emitFloat(vluint32_t code, float newval);
-    void emitDouble(vluint32_t code, double newval);
+    // Implementations of duck-typed methods for VerilatedTrace. These are
+    // called from only one place (namely full*) so always inline them.
+    inline void emitBit(vluint32_t code, vluint32_t newval);
+    inline void emitBus(vluint32_t code, vluint32_t newval, int bits);
+    inline void emitQuad(vluint32_t code, vluint64_t newval, int bits);
+    inline void emitArray(vluint32_t code, const vluint32_t* newvalp, int bits);
+    inline void emitFloat(vluint32_t code, float newval);
+    inline void emitDouble(vluint32_t code, double newval);

 public:
    //=========================================================================
@ -179,8 +180,8 @@ public:
    // Write back to previous value buffer value and emit

    void fullBit(vluint32_t* oldp, vluint32_t newval) { fullBit(oldp - this->oldp(0), newval); }
-    template <int T_Bits> void fullBus(vluint32_t* oldp, vluint32_t newval) {
-        fullBus(oldp - this->oldp(0), newval, T_Bits);
+    void fullBus(vluint32_t* oldp, vluint32_t newval, int bits) {
+        fullBus(oldp - this->oldp(0), newval, bits);
    }
    void fullQuad(vluint32_t* oldp, vluint64_t newval, int bits) {
        fullQuad(oldp - this->oldp(0), newval, bits);
@ -195,8 +196,8 @@ public:
    // Check previous value and emit if changed

    void chgBit(vluint32_t* oldp, vluint32_t newval) { chgBit(oldp - this->oldp(0), newval); }
-    template <int T_Bits> void chgBus(vluint32_t* oldp, vluint32_t newval) {
-        chgBus(oldp - this->oldp(0), newval, T_Bits);
+    void chgBus(vluint32_t* oldp, vluint32_t newval, int bits) {
+        chgBus(oldp - this->oldp(0), newval, bits);
    }
    void chgQuad(vluint32_t* oldp, vluint64_t newval, int bits) {
        chgQuad(oldp - this->oldp(0), newval, bits);
--- a/src/V3EmitC.cpp
+++ b/src/V3EmitC.cpp
@ -3561,7 +3561,8 @@ class EmitCTrace : EmitCStmts {
            puts("vcdp->" + full + "Quad");
            emitWidth = true;
        } else if (nodep->declp()->widthMin() > 1) {
-            puts("vcdp->" + full + "Bus<" + cvtToStr(nodep->declp()->widthMin()) + ">");
+            puts("vcdp->" + full + "Bus");
+            emitWidth = true;
        } else {
            puts("vcdp->" + full + "Bit");
        }