Use SIMD intrinsics to render VCD traces (#2289)

Use SIMD intrinsics to render VCD traces.

I have measured 10-40% single threaded performance increase with VCD
tracing on SweRV EH1 and lowRISC Ibex using SSE2 intrinsics to render
the trace. Also helps a tiny bit with FST, but now almost all of the FST
overhead is in the FST library.

I have reworked the tracing routines to use more precisely sized
arguments. The nice thing about this is that the performance without the
intrinsics is pretty much the same as it was before, as we do at most 2x
as much work as necessary, but in exchange there are no data dependent
branches at all.
This commit is contained in:
Geza Lore 2020-04-30 00:09:09 +01:00 committed by GitHub
parent faf7255e83
commit aa9cde22c8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 451 additions and 255 deletions

View File

@ -64,11 +64,13 @@
VerilatedFst::VerilatedFst(void* fst)
: m_fst(fst)
, m_symbolp(NULL) {}
, m_symbolp(NULL)
, m_strbuf(NULL) {}
VerilatedFst::~VerilatedFst() {
if (m_fst) fstWriterClose(m_fst);
if (m_symbolp) VL_DO_CLEAR(delete[] m_symbolp, m_symbolp = NULL);
if (m_strbuf) VL_DO_CLEAR(delete[] m_strbuf, m_strbuf = NULL);
}
void VerilatedFst::open(const char* filename) VL_MT_UNSAFE {
@ -100,6 +102,9 @@ void VerilatedFst::open(const char* filename) VL_MT_UNSAFE {
}
}
m_code2symbol.clear();
// Allocate string buffer for arrays
if (!m_strbuf) { m_strbuf = new char[maxBits() + 32]; }
}
void VerilatedFst::close() {
@ -213,25 +218,59 @@ void VerilatedFst::declDouble(vluint32_t code, const char* name, int dtypenum, f
// so always inline them.
VL_ATTR_ALWINLINE
void VerilatedFst::emitBit(vluint32_t code, vluint32_t newval) {
void VerilatedFst::emitBit(vluint32_t code, CData newval) {
fstWriterEmitValueChange(m_fst, m_symbolp[code], newval ? "1" : "0");
}
VL_ATTR_ALWINLINE
void VerilatedFst::emitBus(vluint32_t code, vluint32_t newval, int bits) {
fstWriterEmitValueChange32(m_fst, m_symbolp[code], bits, newval);
void VerilatedFst::emitCData(vluint32_t code, CData newval, int bits) {
char buf[VL_BYTESIZE];
cvtCDataToStr(buf, newval << (VL_BYTESIZE - bits));
fstWriterEmitValueChange(m_fst, m_symbolp[code], buf);
}
VL_ATTR_ALWINLINE
void VerilatedFst::emitQuad(vluint32_t code, vluint64_t newval, int bits) {
fstWriterEmitValueChange64(m_fst, m_symbolp[code], bits, newval);
void VerilatedFst::emitSData(vluint32_t code, SData newval, int bits) {
char buf[VL_SHORTSIZE];
cvtSDataToStr(buf, newval << (VL_SHORTSIZE - bits));
fstWriterEmitValueChange(m_fst, m_symbolp[code], buf);
}
VL_ATTR_ALWINLINE
void VerilatedFst::emitArray(vluint32_t code, const vluint32_t* newvalp, int bits) {
fstWriterEmitValueChangeVec32(m_fst, m_symbolp[code], bits, newvalp);
void VerilatedFst::emitIData(vluint32_t code, IData newval, int bits) {
char buf[VL_IDATASIZE];
cvtIDataToStr(buf, newval << (VL_IDATASIZE - bits));
fstWriterEmitValueChange(m_fst, m_symbolp[code], buf);
}
VL_ATTR_ALWINLINE
void VerilatedFst::emitQData(vluint32_t code, QData newval, int bits) {
char buf[VL_QUADSIZE];
cvtQDataToStr(buf, newval << (VL_QUADSIZE - bits));
fstWriterEmitValueChange(m_fst, m_symbolp[code], buf);
}
VL_ATTR_ALWINLINE
void VerilatedFst::emitWData(vluint32_t code, const WData* newvalp, int bits) {
int words = VL_WORDS_I(bits);
char* wp = m_strbuf;
// Convert the most significant word
const int bitsInMSW = VL_BITBIT_E(bits) ? VL_BITBIT_E(bits) : VL_EDATASIZE;
cvtEDataToStr(wp, newvalp[--words] << (VL_EDATASIZE - bitsInMSW));
wp += bitsInMSW;
// Convert the remaining words
while (words > 0) {
cvtEDataToStr(wp, newvalp[--words]);
wp += VL_EDATASIZE;
}
fstWriterEmitValueChange(m_fst, m_symbolp[code], m_strbuf);
}
VL_ATTR_ALWINLINE
void VerilatedFst::emitFloat(vluint32_t code, float newval) {
fstWriterEmitValueChange(m_fst, m_symbolp[code], &newval);
}
VL_ATTR_ALWINLINE
void VerilatedFst::emitDouble(vluint32_t code, double newval) {
fstWriterEmitValueChange(m_fst, m_symbolp[code], &newval);

View File

@ -51,6 +51,8 @@ private:
Local2FstDtype m_local2fstdtype;
std::list<std::string> m_curScope;
fstHandle* m_symbolp; ///< same as m_code2symbol, but as an array
char* m_strbuf; ///< String buffer long enough to hold maxBits() chars
// CONSTRUCTORS
VL_UNCOPYABLE(VerilatedFst);
void declSymbol(vluint32_t code, const char* name, int dtypenum, fstVarDir vardir,
@ -69,10 +71,12 @@ protected:
// Implementations of duck-typed methods for VerilatedTrace. These are
// called from only one place (namely full*) so always inline them.
inline void emitBit(vluint32_t code, vluint32_t newval);
inline void emitBus(vluint32_t code, vluint32_t newval, int bits);
inline void emitQuad(vluint32_t code, vluint64_t newval, int bits);
inline void emitArray(vluint32_t code, const vluint32_t* newvalp, int bits);
inline void emitBit(vluint32_t code, CData newval);
inline void emitCData(vluint32_t code, CData newval, int bits);
inline void emitSData(vluint32_t code, SData newval, int bits);
inline void emitIData(vluint32_t code, IData newval, int bits);
inline void emitQData(vluint32_t code, QData newval, int bits);
inline void emitWData(vluint32_t code, const WData* newvalp, int bits);
inline void emitFloat(vluint32_t code, float newval);
inline void emitDouble(vluint32_t code, double newval);

View File

@ -0,0 +1,41 @@
// -*- mode: C++; c-file-style: "cc-mode" -*-
//*************************************************************************
//
// Copyright 2003-2020 by Wilson Snyder. This program is free software; you can
// redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License
// Version 2.0.
// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
//
//*************************************************************************
///
/// \file
/// \brief Verilator: Common include for target specific intrinsics.
///
/// Code using machine specific intrinsics for optimization should
/// include this header rather than directly including he target
/// specific headers. We provide macros to check for availability
/// of instruction sets, and a common mechanism to disable them.
///
//*************************************************************************
#ifndef _VERILATED_INTRINSICS_H_
#define _VERILATED_INTRINSICS_H_ 1 ///< Header Guard
// clang-format off
// Use VL_DISABLE_INTRINSICS to disable all intrinsics based optimization
#if !defined(VL_DISABLE_INTRINSICS) && !defined(VL_PORTABLE_ONLY)
# if defined(__SSE2__) && !defined(VL_DISABLE_SSE2)
# define VL_HAVE_SSE2 1
# include <emmintrin.h>
# endif
# if defined(__AVX2__) && defined(VL_HAVE_SSE2) && !defined(VL_DISABLE_AVX2)
# define VL_HAVE_AVX2 1
# include <immintrin.h>
# endif
#endif
// clang-format on
#endif // Guard

View File

@ -90,11 +90,13 @@ public:
enum {
CHG_BIT_0 = 0x0,
CHG_BIT_1 = 0x1,
CHG_BUS = 0x2,
CHG_QUAD = 0x3,
CHG_ARRAY = 0x4,
CHG_FLOAT = 0x5,
CHG_DOUBLE = 0x6,
CHG_CDATA = 0x2,
CHG_SDATA = 0x3,
CHG_IDATA = 0x4,
CHG_QDATA = 0x5,
CHG_WDATA = 0x6,
CHG_FLOAT = 0x7,
CHG_DOUBLE = 0x8,
// TODO: full..
TIME_CHANGE = 0xd,
END = 0xe, // End of buffer
@ -122,6 +124,7 @@ private:
bool m_fullDump; ///< Whether a full dump is required on the next call to 'dump'
vluint32_t m_nextCode; ///< Next code number to assign
vluint32_t m_numSignals; ///< Number of distinct signals
vluint32_t m_maxBits; ///< Number of bits in the widest signal
std::string m_moduleName; ///< Name of module being trace initialized now
char m_scopeEscape;
double m_timeRes; ///< Time resolution (ns/ms etc)
@ -176,6 +179,7 @@ protected:
vluint32_t nextCode() const { return m_nextCode; }
vluint32_t numSignals() const { return m_numSignals; }
vluint32_t maxBits() const { return m_maxBits; }
const std::string& moduleName() const { return m_moduleName; }
void fullDump(bool value) { m_fullDump = value; }
vluint64_t timeLastDump() { return m_timeLastDump; }
@ -251,47 +255,65 @@ public:
// these here, but we cannot afford dynamic dispatch for calling these as
// this is very hot code during tracing.
// duck-typed void emitBit(vluint32_t code, vluint32_t newval) = 0;
// duck-typed void emitBus(vluint32_t code, vluint32_t newval, int bits) = 0;
// duck-typed void emitQuad(vluint32_t code, vluint64_t newval, int bits) = 0;
// duck-typed void emitArray(vluint32_t code, const vluint32_t* newvalp, int bits) = 0;
// duck-typed void emitBit(vluint32_t code, CData newval) = 0;
// duck-typed void emitCData(vluint32_t code, CData newval, int bits) = 0;
// duck-typed void emitSData(vluint32_t code, SData newval, int bits) = 0;
// duck-typed void emitIData(vluint32_t code, IData newval, int bits) = 0;
// duck-typed void emitQData(vluint32_t code, QData newval, int bits) = 0;
// duck-typed void emitWData(vluint32_t code, const WData* newvalp, int bits) = 0;
// duck-typed void emitFloat(vluint32_t code, float newval) = 0;
// duck-typed void emitDouble(vluint32_t code, double newval) = 0;
vluint32_t* oldp(vluint32_t code) { return m_sigs_oldvalp + code; }
// Write to previous value buffer value and emit trace entry.
void fullBit(vluint32_t* oldp, vluint32_t newval);
void fullBus(vluint32_t* oldp, vluint32_t newval, int bits);
void fullQuad(vluint32_t* oldp, vluint64_t newval, int bits);
void fullArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits);
void fullBit(vluint32_t* oldp, CData newval);
void fullCData(vluint32_t* oldp, CData newval, int bits);
void fullSData(vluint32_t* oldp, SData newval, int bits);
void fullIData(vluint32_t* oldp, IData newval, int bits);
void fullQData(vluint32_t* oldp, QData newval, int bits);
void fullWData(vluint32_t* oldp, const WData* newvalp, int bits);
void fullFloat(vluint32_t* oldp, float newval);
void fullDouble(vluint32_t* oldp, double newval);
#ifdef VL_TRACE_THREADED
// Threaded tracing. Just dump everything in the trace buffer
inline void chgBit(vluint32_t code, vluint32_t newval) {
inline void chgBit(vluint32_t code, CData newval) {
m_traceBufferWritep[0] = VerilatedTraceCommand::CHG_BIT_0 | newval;
m_traceBufferWritep[1] = code;
m_traceBufferWritep += 2;
VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
}
inline void chgBus(vluint32_t code, vluint32_t newval, int bits) {
m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_BUS;
inline void chgCData(vluint32_t code, CData newval, int bits) {
m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_CDATA;
m_traceBufferWritep[1] = code;
m_traceBufferWritep[2] = newval;
m_traceBufferWritep += 3;
VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
}
inline void chgQuad(vluint32_t code, vluint64_t newval, int bits) {
m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_QUAD;
inline void chgSData(vluint32_t code, SData newval, int bits) {
m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_SDATA;
m_traceBufferWritep[1] = code;
*reinterpret_cast<vluint64_t*>(m_traceBufferWritep + 2) = newval;
m_traceBufferWritep[2] = newval;
m_traceBufferWritep += 3;
VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
}
inline void chgIData(vluint32_t code, IData newval, int bits) {
m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_IDATA;
m_traceBufferWritep[1] = code;
m_traceBufferWritep[2] = newval;
m_traceBufferWritep += 3;
VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
}
inline void chgQData(vluint32_t code, QData newval, int bits) {
m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_QDATA;
m_traceBufferWritep[1] = code;
*reinterpret_cast<QData*>(m_traceBufferWritep + 2) = newval;
m_traceBufferWritep += 4;
VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
}
inline void chgArray(vluint32_t code, const vluint32_t* newvalp, int bits) {
m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_ARRAY;
inline void chgWData(vluint32_t code, const WData* newvalp, int bits) {
m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_WDATA;
m_traceBufferWritep[1] = code;
m_traceBufferWritep += 2;
for (int i = 0; i < (bits + 31) / 32; ++i) { *m_traceBufferWritep++ = newvalp[i]; }
@ -324,22 +346,30 @@ public:
// thread and are called chg*Impl
// Check previous dumped value of signal. If changed, then emit trace entry
inline void CHG(Bit)(vluint32_t* oldp, vluint32_t newval) {
inline void CHG(Bit)(vluint32_t* oldp, CData newval) {
const vluint32_t diff = *oldp ^ newval;
if (VL_UNLIKELY(diff)) fullBit(oldp, newval);
}
inline void CHG(Bus)(vluint32_t* oldp, vluint32_t newval, int bits) {
inline void CHG(CData)(vluint32_t* oldp, CData newval, int bits) {
const vluint32_t diff = *oldp ^ newval;
if (VL_UNLIKELY(diff)) fullBus(oldp, newval, bits);
if (VL_UNLIKELY(diff)) fullCData(oldp, newval, bits);
}
inline void CHG(Quad)(vluint32_t* oldp, vluint64_t newval, int bits) {
const vluint64_t diff = *reinterpret_cast<vluint64_t*>(oldp) ^ newval;
if (VL_UNLIKELY(diff)) fullQuad(oldp, newval, bits);
inline void CHG(SData)(vluint32_t* oldp, SData newval, int bits) {
const vluint32_t diff = *oldp ^ newval;
if (VL_UNLIKELY(diff)) fullSData(oldp, newval, bits);
}
inline void CHG(Array)(vluint32_t* oldp, const vluint32_t* newvalp, int bits) {
inline void CHG(IData)(vluint32_t* oldp, IData newval, int bits) {
const vluint32_t diff = *oldp ^ newval;
if (VL_UNLIKELY(diff)) fullIData(oldp, newval, bits);
}
inline void CHG(QData)(vluint32_t* oldp, QData newval, int bits) {
const vluint64_t diff = *reinterpret_cast<QData*>(oldp) ^ newval;
if (VL_UNLIKELY(diff)) fullQData(oldp, newval, bits);
}
inline void CHG(WData)(vluint32_t* oldp, const WData* newvalp, int bits) {
for (int i = 0; i < (bits + 31) / 32; ++i) {
if (VL_UNLIKELY(oldp[i] ^ newvalp[i])) {
fullArray(oldp, newvalp, bits);
fullWData(oldp, newvalp, bits);
return;
}
}

View File

@ -23,6 +23,7 @@
# error "This file should be included in trace format implementations"
#endif
#include "verilated_intrinsics.h"
#include "verilated_trace.h"
#if 0
@ -166,22 +167,34 @@ template <> void VerilatedTrace<VL_DERIVED_T>::workerThreadMain() {
VL_TRACE_THREAD_DEBUG("Command CHG_BIT_1 " << top);
chgBitImpl(oldp, 1);
continue;
case VerilatedTraceCommand::CHG_BUS:
VL_TRACE_THREAD_DEBUG("Command CHG_BUS " << top);
case VerilatedTraceCommand::CHG_CDATA:
VL_TRACE_THREAD_DEBUG("Command CHG_CDATA " << top);
// Bits stored in bottom byte of command
chgBusImpl(oldp, *readp, top);
chgCDataImpl(oldp, *readp, top);
readp += 1;
continue;
case VerilatedTraceCommand::CHG_QUAD:
VL_TRACE_THREAD_DEBUG("Command CHG_QUAD " << top);
case VerilatedTraceCommand::CHG_SDATA:
VL_TRACE_THREAD_DEBUG("Command CHG_SDATA " << top);
// Bits stored in bottom byte of command
chgQuadImpl(oldp, *reinterpret_cast<const vluint64_t*>(readp), top);
chgSDataImpl(oldp, *readp, top);
readp += 1;
continue;
case VerilatedTraceCommand::CHG_IDATA:
VL_TRACE_THREAD_DEBUG("Command CHG_IDATA " << top);
// Bits stored in bottom byte of command
chgIDataImpl(oldp, *readp, top);
readp += 1;
continue;
case VerilatedTraceCommand::CHG_QDATA:
VL_TRACE_THREAD_DEBUG("Command CHG_QDATA " << top);
// Bits stored in bottom byte of command
chgQDataImpl(oldp, *reinterpret_cast<const QData*>(readp), top);
readp += 2;
continue;
case VerilatedTraceCommand::CHG_ARRAY:
VL_TRACE_THREAD_DEBUG("Command CHG_ARRAY " << top);
chgArrayImpl(oldp, readp, top);
readp += (top + 31) / 32;
case VerilatedTraceCommand::CHG_WDATA:
VL_TRACE_THREAD_DEBUG("Command CHG_WDATA " << top);
chgWDataImpl(oldp, readp, top);
readp += VL_WORDS_I(top);
continue;
case VerilatedTraceCommand::CHG_FLOAT:
VL_TRACE_THREAD_DEBUG("Command CHG_FLOAT " << top);
@ -284,6 +297,7 @@ VerilatedTrace<VL_DERIVED_T>::VerilatedTrace()
, m_fullDump(true)
, m_nextCode(0)
, m_numSignals(0)
, m_maxBits(0)
, m_scopeEscape('.')
, m_timeRes(1e-9)
, m_timeUnit(1e-9)
@ -318,6 +332,7 @@ template <> void VerilatedTrace<VL_DERIVED_T>::traceInit() VL_MT_UNSAFE {
const vluint32_t expectedCodes = nextCode();
m_nextCode = 1;
m_numSignals = 0;
m_maxBits = 0;
// Call all initialize callbacks, which will call decl* for each signal.
for (vluint32_t ent = 0; ent < m_callbacks.size(); ++ent) {
@ -355,10 +370,11 @@ void VerilatedTrace<VL_DERIVED_T>::declCode(vluint32_t code, vluint32_t bits, bo
}
// Note: The tri-state flag is not used by Verilator, but is here for
// compatibility with some foreign code.
int codesNeeded = (bits + 31) / 32;
int codesNeeded = VL_WORDS_I(bits);
if (tri) codesNeeded *= 2;
m_nextCode = std::max(m_nextCode, code + codesNeeded);
++m_numSignals;
m_maxBits = std::max(m_maxBits, bits);
}
//=========================================================================
@ -486,35 +502,139 @@ void VerilatedTrace<VL_DERIVED_T>::addCallback(callback_t initcb, callback_t ful
// that this file must be included in the format specific implementation, so
// the emit* functions can be inlined for performance.
template <> void VerilatedTrace<VL_DERIVED_T>::fullBit(vluint32_t* oldp, vluint32_t newval) {
template <> void VerilatedTrace<VL_DERIVED_T>::fullBit(vluint32_t* oldp, CData newval) {
*oldp = newval;
self()->emitBit(oldp - m_sigs_oldvalp, newval);
}
template <>
void VerilatedTrace<VL_DERIVED_T>::fullBus(vluint32_t* oldp, vluint32_t newval, int bits) {
void VerilatedTrace<VL_DERIVED_T>::fullCData(vluint32_t* oldp, CData newval, int bits) {
*oldp = newval;
self()->emitBus(oldp - m_sigs_oldvalp, newval, bits);
self()->emitCData(oldp - m_sigs_oldvalp, newval, bits);
}
template <>
void VerilatedTrace<VL_DERIVED_T>::fullQuad(vluint32_t* oldp, vluint64_t newval, int bits) {
*reinterpret_cast<vluint64_t*>(oldp) = newval;
self()->emitQuad(oldp - m_sigs_oldvalp, newval, bits);
void VerilatedTrace<VL_DERIVED_T>::fullSData(vluint32_t* oldp, SData newval, int bits) {
*oldp = newval;
self()->emitSData(oldp - m_sigs_oldvalp, newval, bits);
}
template <>
void VerilatedTrace<VL_DERIVED_T>::fullArray(vluint32_t* oldp, const vluint32_t* newvalp,
int bits) {
for (int i = 0; i < (bits + 31) / 32; ++i) oldp[i] = newvalp[i];
self()->emitArray(oldp - m_sigs_oldvalp, newvalp, bits);
void VerilatedTrace<VL_DERIVED_T>::fullIData(vluint32_t* oldp, IData newval, int bits) {
*oldp = newval;
self()->emitIData(oldp - m_sigs_oldvalp, newval, bits);
}
template <>
void VerilatedTrace<VL_DERIVED_T>::fullQData(vluint32_t* oldp, QData newval, int bits) {
*reinterpret_cast<QData*>(oldp) = newval;
self()->emitQData(oldp - m_sigs_oldvalp, newval, bits);
}
template <>
void VerilatedTrace<VL_DERIVED_T>::fullWData(vluint32_t* oldp, const WData* newvalp, int bits) {
for (int i = 0; i < VL_WORDS_I(bits); ++i) oldp[i] = newvalp[i];
self()->emitWData(oldp - m_sigs_oldvalp, newvalp, bits);
}
template <> void VerilatedTrace<VL_DERIVED_T>::fullFloat(vluint32_t* oldp, float newval) {
// cppcheck-suppress invalidPointerCast
*reinterpret_cast<float*>(oldp) = newval;
self()->emitFloat(oldp - m_sigs_oldvalp, newval);
}
template <> void VerilatedTrace<VL_DERIVED_T>::fullDouble(vluint32_t* oldp, double newval) {
// cppcheck-suppress invalidPointerCast
*reinterpret_cast<double*>(oldp) = newval;
self()->emitDouble(oldp - m_sigs_oldvalp, newval);
}
//=========================================================================
// Primitives converting binary values to strings...
// All of these take a destination pointer where the string will be emitted,
// and a value to convert. There are a couple of variants for efficiency.
inline static void cvtCDataToStr(char* dstp, CData value) {
#ifdef VL_HAVE_SSE2
// Similar to cvtSDataToStr but only the bottom 8 byte lanes are used
const __m128i a = _mm_cvtsi32_si128(value);
const __m128i b = _mm_unpacklo_epi8(a, a);
const __m128i c = _mm_shufflelo_epi16(b, 0);
const __m128i m = _mm_set1_epi64x(0x0102040810204080);
const __m128i d = _mm_cmpeq_epi8(_mm_and_si128(c, m), m);
const __m128i result = _mm_sub_epi8(_mm_set1_epi8('0'), d);
_mm_storel_epi64(reinterpret_cast<__m128i*>(dstp), result);
#else
dstp[0] = '0' | static_cast<char>((value >> 7) & 1);
dstp[1] = '0' | static_cast<char>((value >> 6) & 1);
dstp[2] = '0' | static_cast<char>((value >> 5) & 1);
dstp[3] = '0' | static_cast<char>((value >> 4) & 1);
dstp[4] = '0' | static_cast<char>((value >> 3) & 1);
dstp[5] = '0' | static_cast<char>((value >> 2) & 1);
dstp[6] = '0' | static_cast<char>((value >> 1) & 1);
dstp[7] = '0' | static_cast<char>(value & 1);
#endif
}
inline static void cvtSDataToStr(char* dstp, SData value) {
#ifdef VL_HAVE_SSE2
// We want each bit in the 16-bit input value to end up in a byte lane
// within the 128-bit XMM register. Note that x86 is little-endian and we
// want the MSB of the input at the low address, so we will bit-reverse
// at the same time.
// Put value in bottom of 128-bit register a[15:0] = value
const __m128i a = _mm_cvtsi32_si128(value);
// Interleave bytes with themselves
// b[15: 0] = {2{a[ 7:0]}} == {2{value[ 7:0]}}
// b[31:16] = {2{a[15:8]}} == {2{value[15:8]}}
const __m128i b = _mm_unpacklo_epi8(a, a);
// Shuffle bottom 64 bits, note swapping high bytes with low bytes
// c[31: 0] = {2{b[31:16]}} == {4{value[15:8}}
// c[63:32] = {2{b[15: 0]}} == {4{value[ 7:0}}
const __m128i c = _mm_shufflelo_epi16(b, 0x05);
// Shuffle whole register
// d[ 63: 0] = {2{c[31: 0]}} == {8{value[15:8}}
// d[126:54] = {2{c[63:32]}} == {8{value[ 7:0}}
const __m128i d = _mm_shuffle_epi32(c, 0x50);
// Test each bit within the bytes, this sets each byte lane to 0
// if the bit for that lane is 0 and to 0xff if the bit is 1.
const __m128i m = _mm_set1_epi64x(0x0102040810204080);
const __m128i e = _mm_cmpeq_epi8(_mm_and_si128(d, m), m);
// Convert to ASCII by subtracting the masks from ASCII '0':
// '0' - 0 is '0', '0' - -1 is '1'
const __m128i result = _mm_sub_epi8(_mm_set1_epi8('0'), e);
// Store the 16 characters to the un-aligned buffer
_mm_storeu_si128(reinterpret_cast<__m128i*>(dstp), result);
#else
cvtCDataToStr(dstp, value >> 8);
cvtCDataToStr(dstp + 8, value);
#endif
}
inline static void cvtIDataToStr(char* dstp, IData value) {
#ifdef VL_HAVE_AVX2
// Similar to cvtSDataToStr but the bottom 16-bits are processed in the
// top half of the YMM registerss
const __m256i a = _mm256_insert_epi32(_mm256_undefined_si256(), value, 0);
const __m256i b = _mm256_permute4x64_epi64(a, 0);
const __m256i s = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3);
const __m256i c = _mm256_shuffle_epi8(b, s);
const __m256i m = _mm256_set1_epi64x(0x0102040810204080);
const __m256i d = _mm256_cmpeq_epi8(_mm256_and_si256(c, m), m);
const __m256i result = _mm256_sub_epi8(_mm256_set1_epi8('0'), d);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(dstp), result);
#else
cvtSDataToStr(dstp, value >> 16);
cvtSDataToStr(dstp + 16, value);
#endif
}
inline static void cvtQDataToStr(char* dstp, QData value) {
cvtIDataToStr(dstp, value >> 32);
cvtIDataToStr(dstp + 32, value);
}
#define cvtEDataToStr cvtIDataToStr

View File

@ -35,6 +35,8 @@
# include <unistd.h>
#endif
#include "verilated_intrinsics.h"
// SPDIFF_ON
#ifndef O_LARGEFILE // For example on WIN32
@ -606,20 +608,14 @@ void VerilatedVcd::declTriArray(vluint32_t code, const char* name, bool array, i
#endif // VL_TRACE_VCD_OLD_API
//=============================================================================
// Trace recording routines
// Trace rendering prinitives
//=============================================================================
// Emit trace entries
#define VL_VCD_SUFFIXP(code) (m_suffixesp + (code)*VL_TRACE_SUFFIX_ENTRY_SIZE)
// Emit suffix, write back write pointer, check buffer
void VerilatedVcd::finishLine(vluint32_t code, char* writep) {
const char* const suffixp = VL_VCD_SUFFIXP(code);
const char* const suffixp = m_suffixesp + code * VL_TRACE_SUFFIX_ENTRY_SIZE;
// Copy the whole suffix (this avoid having hard to predict branches which
// helps a lot). Note: The maximum length of the suffix is
// VL_TRACE_MAX_VCD_CODE_SIZE + 2 == 7, but we unroll this here for speed.
#ifdef __x86_64__
#ifdef VL_X86_64
// Copy the whole 8 bytes in one go, this works on little-endian machines
// supporting unaligned stores.
*reinterpret_cast<vluint64_t*>(writep) = *reinterpret_cast<const vluint64_t*>(suffixp);
@ -639,12 +635,15 @@ void VerilatedVcd::finishLine(vluint32_t code, char* writep) {
bufferCheck();
}
//=============================================================================
// emit* trace routines
// Note: emit* are only ever called from one place (full* in
// verilated_trace_imp.cpp, which is included in this file at the top),
// so always inline them.
VL_ATTR_ALWINLINE
void VerilatedVcd::emitBit(vluint32_t code, vluint32_t newval) {
void VerilatedVcd::emitBit(vluint32_t code, CData newval) {
// Don't prefetch suffix as it's a bit too late;
char* wp = m_writep;
*wp++ = '0' | static_cast<char>(newval);
@ -652,182 +651,56 @@ void VerilatedVcd::emitBit(vluint32_t code, vluint32_t newval) {
}
VL_ATTR_ALWINLINE
void VerilatedVcd::emitBus(vluint32_t code, vluint32_t newval, int bits) {
VL_PREFETCH_RD(VL_VCD_SUFFIXP(code));
void VerilatedVcd::emitCData(vluint32_t code, CData newval, int bits) {
char* wp = m_writep;
*wp++ = 'b';
wp += bits;
// clang-format off
switch (bits) {
case 32: wp[-32] = '0' | static_cast<char>((newval >> 31) ); //FALLTHRU
case 31: wp[-31] = '0' | static_cast<char>((newval >> 30) & 1); //FALLTHRU
case 30: wp[-30] = '0' | static_cast<char>((newval >> 29) & 1); //FALLTHRU
case 29: wp[-29] = '0' | static_cast<char>((newval >> 28) & 1); //FALLTHRU
case 28: wp[-28] = '0' | static_cast<char>((newval >> 27) & 1); //FALLTHRU
case 27: wp[-27] = '0' | static_cast<char>((newval >> 26) & 1); //FALLTHRU
case 26: wp[-26] = '0' | static_cast<char>((newval >> 25) & 1); //FALLTHRU
case 25: wp[-25] = '0' | static_cast<char>((newval >> 24) & 1); //FALLTHRU
case 24: wp[-24] = '0' | static_cast<char>((newval >> 23) & 1); //FALLTHRU
case 23: wp[-23] = '0' | static_cast<char>((newval >> 22) & 1); //FALLTHRU
case 22: wp[-22] = '0' | static_cast<char>((newval >> 21) & 1); //FALLTHRU
case 21: wp[-21] = '0' | static_cast<char>((newval >> 20) & 1); //FALLTHRU
case 20: wp[-20] = '0' | static_cast<char>((newval >> 19) & 1); //FALLTHRU
case 19: wp[-19] = '0' | static_cast<char>((newval >> 18) & 1); //FALLTHRU
case 18: wp[-18] = '0' | static_cast<char>((newval >> 17) & 1); //FALLTHRU
case 17: wp[-17] = '0' | static_cast<char>((newval >> 16) & 1); //FALLTHRU
case 16: wp[-16] = '0' | static_cast<char>((newval >> 15) & 1); //FALLTHRU
case 15: wp[-15] = '0' | static_cast<char>((newval >> 14) & 1); //FALLTHRU
case 14: wp[-14] = '0' | static_cast<char>((newval >> 13) & 1); //FALLTHRU
case 13: wp[-13] = '0' | static_cast<char>((newval >> 12) & 1); //FALLTHRU
case 12: wp[-12] = '0' | static_cast<char>((newval >> 11) & 1); //FALLTHRU
case 11: wp[-11] = '0' | static_cast<char>((newval >> 10) & 1); //FALLTHRU
case 10: wp[-10] = '0' | static_cast<char>((newval >> 9) & 1); //FALLTHRU
case 9: wp[ -9] = '0' | static_cast<char>((newval >> 8) & 1); //FALLTHRU
case 8: wp[ -8] = '0' | static_cast<char>((newval >> 7) & 1); //FALLTHRU
case 7: wp[ -7] = '0' | static_cast<char>((newval >> 6) & 1); //FALLTHRU
case 6: wp[ -6] = '0' | static_cast<char>((newval >> 5) & 1); //FALLTHRU
case 5: wp[ -5] = '0' | static_cast<char>((newval >> 4) & 1); //FALLTHRU
case 4: wp[ -4] = '0' | static_cast<char>((newval >> 3) & 1); //FALLTHRU
case 3: wp[ -3] = '0' | static_cast<char>((newval >> 2) & 1); //FALLTHRU
case 2: wp[ -2] = '0' | static_cast<char>((newval >> 1) & 1); //FALLTHRU
/*bit*/ wp[ -1] = '0' | static_cast<char>((newval ) & 1); //FALLTHRU
}
// clang-format on
finishLine(code, wp);
cvtCDataToStr(wp, newval << (VL_BYTESIZE - bits));
finishLine(code, wp + bits);
}
VL_ATTR_ALWINLINE
void VerilatedVcd::emitQuad(vluint32_t code, vluint64_t newval, int bits) {
VL_PREFETCH_RD(VL_VCD_SUFFIXP(code));
void VerilatedVcd::emitSData(vluint32_t code, SData newval, int bits) {
char* wp = m_writep;
*wp++ = 'b';
// Handle the top 32 bits within the 64 bit input
const int bitsInTopHalf = bits - 32;
wp += bitsInTopHalf;
// clang-format off
switch (bitsInTopHalf) {
case 32: wp[-32] = '0' | static_cast<char>((newval >> 63) ); //FALLTHRU
case 31: wp[-31] = '0' | static_cast<char>((newval >> 62) & 1); //FALLTHRU
case 30: wp[-30] = '0' | static_cast<char>((newval >> 61) & 1); //FALLTHRU
case 29: wp[-29] = '0' | static_cast<char>((newval >> 60) & 1); //FALLTHRU
case 28: wp[-28] = '0' | static_cast<char>((newval >> 59) & 1); //FALLTHRU
case 27: wp[-27] = '0' | static_cast<char>((newval >> 58) & 1); //FALLTHRU
case 26: wp[-26] = '0' | static_cast<char>((newval >> 57) & 1); //FALLTHRU
case 25: wp[-25] = '0' | static_cast<char>((newval >> 56) & 1); //FALLTHRU
case 24: wp[-24] = '0' | static_cast<char>((newval >> 55) & 1); //FALLTHRU
case 23: wp[-23] = '0' | static_cast<char>((newval >> 54) & 1); //FALLTHRU
case 22: wp[-22] = '0' | static_cast<char>((newval >> 53) & 1); //FALLTHRU
case 21: wp[-21] = '0' | static_cast<char>((newval >> 52) & 1); //FALLTHRU
case 20: wp[-20] = '0' | static_cast<char>((newval >> 51) & 1); //FALLTHRU
case 19: wp[-19] = '0' | static_cast<char>((newval >> 50) & 1); //FALLTHRU
case 18: wp[-18] = '0' | static_cast<char>((newval >> 49) & 1); //FALLTHRU
case 17: wp[-17] = '0' | static_cast<char>((newval >> 48) & 1); //FALLTHRU
case 16: wp[-16] = '0' | static_cast<char>((newval >> 47) & 1); //FALLTHRU
case 15: wp[-15] = '0' | static_cast<char>((newval >> 46) & 1); //FALLTHRU
case 14: wp[-14] = '0' | static_cast<char>((newval >> 45) & 1); //FALLTHRU
case 13: wp[-13] = '0' | static_cast<char>((newval >> 44) & 1); //FALLTHRU
case 12: wp[-12] = '0' | static_cast<char>((newval >> 43) & 1); //FALLTHRU
case 11: wp[-11] = '0' | static_cast<char>((newval >> 42) & 1); //FALLTHRU
case 10: wp[-10] = '0' | static_cast<char>((newval >> 41) & 1); //FALLTHRU
case 9: wp[ -9] = '0' | static_cast<char>((newval >> 40) & 1); //FALLTHRU
case 8: wp[ -8] = '0' | static_cast<char>((newval >> 39) & 1); //FALLTHRU
case 7: wp[ -7] = '0' | static_cast<char>((newval >> 38) & 1); //FALLTHRU
case 6: wp[ -6] = '0' | static_cast<char>((newval >> 37) & 1); //FALLTHRU
case 5: wp[ -5] = '0' | static_cast<char>((newval >> 36) & 1); //FALLTHRU
case 4: wp[ -4] = '0' | static_cast<char>((newval >> 35) & 1); //FALLTHRU
case 3: wp[ -3] = '0' | static_cast<char>((newval >> 34) & 1); //FALLTHRU
case 2: wp[ -2] = '0' | static_cast<char>((newval >> 33) & 1); //FALLTHRU
case 1: wp[ -1] = '0' | static_cast<char>((newval >> 32) & 1); //FALLTHRU
}
// clang-format on
// Handle the bottom 32 bits within the 64 bit input
vluint32_t val = static_cast<vluint32_t>(newval); // Truncate to bottom 32 bits
int loops = 4;
do {
wp[0] = '0' | static_cast<char>((val >> 31));
wp[1] = '0' | static_cast<char>((val >> 30) & 1);
wp[2] = '0' | static_cast<char>((val >> 29) & 1);
wp[3] = '0' | static_cast<char>((val >> 28) & 1);
wp[4] = '0' | static_cast<char>((val >> 27) & 1);
wp[5] = '0' | static_cast<char>((val >> 26) & 1);
wp[6] = '0' | static_cast<char>((val >> 25) & 1);
wp[7] = '0' | static_cast<char>((val >> 24) & 1);
wp += 8;
val <<= 8;
} while (--loops);
finishLine(code, wp);
cvtSDataToStr(wp, newval << (VL_SHORTSIZE - bits));
finishLine(code, wp + bits);
}
VL_ATTR_ALWINLINE
void VerilatedVcd::emitArray(vluint32_t code, const vluint32_t* newvalp, int bits) {
VL_PREFETCH_RD(VL_VCD_SUFFIXP(code));
int words = (bits + 31) / 32;
void VerilatedVcd::emitIData(vluint32_t code, IData newval, int bits) {
char* wp = m_writep;
*wp++ = 'b';
cvtIDataToStr(wp, newval << (VL_IDATASIZE - bits));
finishLine(code, wp + bits);
}
VL_ATTR_ALWINLINE
void VerilatedVcd::emitQData(vluint32_t code, QData newval, int bits) {
char* wp = m_writep;
*wp++ = 'b';
cvtQDataToStr(wp, newval << (VL_QUADSIZE - bits));
finishLine(code, wp + bits);
}
VL_ATTR_ALWINLINE
void VerilatedVcd::emitWData(vluint32_t code, const WData* newvalp, int bits) {
int words = VL_WORDS_I(bits);
char* wp = m_writep;
*wp++ = 'b';
// Handle the most significant word
vluint32_t val = newvalp[--words];
const int bitsInMSW = bits % 32 == 0 ? 32 : bits % 32;
const int bitsInMSW = VL_BITBIT_E(bits) ? VL_BITBIT_E(bits) : VL_EDATASIZE;
cvtEDataToStr(wp, newvalp[--words] << (VL_EDATASIZE - bitsInMSW));
wp += bitsInMSW;
// clang-format off
switch (bitsInMSW) {
case 32: wp[-32] = '0' | static_cast<char>((val >> 31) ); //FALLTHRU
case 31: wp[-31] = '0' | static_cast<char>((val >> 30) & 1); //FALLTHRU
case 30: wp[-30] = '0' | static_cast<char>((val >> 29) & 1); //FALLTHRU
case 29: wp[-29] = '0' | static_cast<char>((val >> 28) & 1); //FALLTHRU
case 28: wp[-28] = '0' | static_cast<char>((val >> 27) & 1); //FALLTHRU
case 27: wp[-27] = '0' | static_cast<char>((val >> 26) & 1); //FALLTHRU
case 26: wp[-26] = '0' | static_cast<char>((val >> 25) & 1); //FALLTHRU
case 25: wp[-25] = '0' | static_cast<char>((val >> 24) & 1); //FALLTHRU
case 24: wp[-24] = '0' | static_cast<char>((val >> 23) & 1); //FALLTHRU
case 23: wp[-23] = '0' | static_cast<char>((val >> 22) & 1); //FALLTHRU
case 22: wp[-22] = '0' | static_cast<char>((val >> 21) & 1); //FALLTHRU
case 21: wp[-21] = '0' | static_cast<char>((val >> 20) & 1); //FALLTHRU
case 20: wp[-20] = '0' | static_cast<char>((val >> 19) & 1); //FALLTHRU
case 19: wp[-19] = '0' | static_cast<char>((val >> 18) & 1); //FALLTHRU
case 18: wp[-18] = '0' | static_cast<char>((val >> 17) & 1); //FALLTHRU
case 17: wp[-17] = '0' | static_cast<char>((val >> 16) & 1); //FALLTHRU
case 16: wp[-16] = '0' | static_cast<char>((val >> 15) & 1); //FALLTHRU
case 15: wp[-15] = '0' | static_cast<char>((val >> 14) & 1); //FALLTHRU
case 14: wp[-14] = '0' | static_cast<char>((val >> 13) & 1); //FALLTHRU
case 13: wp[-13] = '0' | static_cast<char>((val >> 12) & 1); //FALLTHRU
case 12: wp[-12] = '0' | static_cast<char>((val >> 11) & 1); //FALLTHRU
case 11: wp[-11] = '0' | static_cast<char>((val >> 10) & 1); //FALLTHRU
case 10: wp[-10] = '0' | static_cast<char>((val >> 9) & 1); //FALLTHRU
case 9: wp[ -9] = '0' | static_cast<char>((val >> 8) & 1); //FALLTHRU
case 8: wp[ -8] = '0' | static_cast<char>((val >> 7) & 1); //FALLTHRU
case 7: wp[ -7] = '0' | static_cast<char>((val >> 6) & 1); //FALLTHRU
case 6: wp[ -6] = '0' | static_cast<char>((val >> 5) & 1); //FALLTHRU
case 5: wp[ -5] = '0' | static_cast<char>((val >> 4) & 1); //FALLTHRU
case 4: wp[ -4] = '0' | static_cast<char>((val >> 3) & 1); //FALLTHRU
case 3: wp[ -3] = '0' | static_cast<char>((val >> 2) & 1); //FALLTHRU
case 2: wp[ -2] = '0' | static_cast<char>((val >> 1) & 1); //FALLTHRU
case 1: wp[ -1] = '0' | static_cast<char>((val ) & 1); //FALLTHRU
}
// clang-format on
// Handle the remaining words
while (words > 0) {
vluint32_t val = newvalp[--words];
int loops = 4;
do {
wp[0] = '0' | static_cast<char>((val >> 31));
wp[1] = '0' | static_cast<char>((val >> 30) & 1);
wp[2] = '0' | static_cast<char>((val >> 29) & 1);
wp[3] = '0' | static_cast<char>((val >> 28) & 1);
wp[4] = '0' | static_cast<char>((val >> 27) & 1);
wp[5] = '0' | static_cast<char>((val >> 26) & 1);
wp[6] = '0' | static_cast<char>((val >> 25) & 1);
wp[7] = '0' | static_cast<char>((val >> 24) & 1);
wp += 8;
val <<= 8;
} while (--loops);
cvtEDataToStr(wp, newvalp[--words]);
wp += VL_EDATASIZE;
}
finishLine(code, wp);
}
VL_ATTR_ALWINLINE
void VerilatedVcd::emitFloat(vluint32_t code, float newval) {
VL_PREFETCH_RD(VL_VCD_SUFFIXP(code));
char* wp = m_writep;
// Buffer can't overflow before sprintf; we sized during declaration
sprintf(wp, "r%.16g", static_cast<double>(newval));
@ -837,7 +710,6 @@ void VerilatedVcd::emitFloat(vluint32_t code, float newval) {
VL_ATTR_ALWINLINE
void VerilatedVcd::emitDouble(vluint32_t code, double newval) {
VL_PREFETCH_RD(VL_VCD_SUFFIXP(code));
char* wp = m_writep;
// Buffer can't overflow before sprintf; we sized during declaration
sprintf(wp, "r%.16g", newval);
@ -845,8 +717,6 @@ void VerilatedVcd::emitDouble(vluint32_t code, double newval) {
finishLine(code, wp);
}
#undef VL_VCD_SUFFIXP
#ifdef VL_TRACE_VCD_OLD_API
void VerilatedVcd::fullBit(vluint32_t code, const vluint32_t newval) {

View File

@ -124,10 +124,12 @@ protected:
// Implementations of duck-typed methods for VerilatedTrace. These are
// called from only one place (namely full*) so always inline them.
inline void emitBit(vluint32_t code, vluint32_t newval);
inline void emitBus(vluint32_t code, vluint32_t newval, int bits);
inline void emitQuad(vluint32_t code, vluint64_t newval, int bits);
inline void emitArray(vluint32_t code, const vluint32_t* newvalp, int bits);
inline void emitBit(vluint32_t code, CData newval);
inline void emitCData(vluint32_t code, CData newval, int bits);
inline void emitSData(vluint32_t code, SData newval, int bits);
inline void emitIData(vluint32_t code, IData newval, int bits);
inline void emitQData(vluint32_t code, QData newval, int bits);
inline void emitWData(vluint32_t code, const WData* newvalp, int bits);
inline void emitFloat(vluint32_t code, float newval);
inline void emitDouble(vluint32_t code, double newval);
@ -176,37 +178,48 @@ public:
int lsb);
void declTriArray(vluint32_t code, const char* name, bool array, int arraynum, int msb,
int lsb);
//=========================================================================
// Write back to previous value buffer value and emit
void fullBit(vluint32_t* oldp, vluint32_t newval) { fullBit(oldp - this->oldp(0), newval); }
void fullBus(vluint32_t* oldp, vluint32_t newval, int bits) {
void fullBit(vluint32_t* oldp, CData newval) { fullBit(oldp - this->oldp(0), newval); }
void fullCData(vluint32_t* oldp, CData newval, int bits) {
fullBus(oldp - this->oldp(0), newval, bits);
}
void fullQuad(vluint32_t* oldp, vluint64_t newval, int bits) {
void fullSData(vluint32_t* oldp, SData newval, int bits) {
fullBus(oldp - this->oldp(0), newval, bits);
}
void fullIData(vluint32_t* oldp, IData newval, int bits) {
fullBus(oldp - this->oldp(0), newval, bits);
}
void fullQData(vluint32_t* oldp, QData newval, int bits) {
fullQuad(oldp - this->oldp(0), newval, bits);
}
void fullArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits) {
void fullWData(vluint32_t* oldp, const WData* newvalp, int bits) {
fullArray(oldp - this->oldp(0), newvalp, bits);
}
void fullFloat(vluint32_t* oldp, float newval) { fullFloat(oldp - this->oldp(0), newval); }
void fullDouble(vluint32_t* oldp, double newval) { fullDouble(oldp - this->oldp(0), newval); }
//=========================================================================
// Check previous value and emit if changed
void chgBit(vluint32_t* oldp, vluint32_t newval) { chgBit(oldp - this->oldp(0), newval); }
void chgBus(vluint32_t* oldp, vluint32_t newval, int bits) {
inline void chgBit(vluint32_t* oldp, CData newval) { chgBit(oldp - this->oldp(0), newval); }
inline void chgCData(vluint32_t* oldp, CData newval, int bits) {
chgBus(oldp - this->oldp(0), newval, bits);
}
void chgQuad(vluint32_t* oldp, vluint64_t newval, int bits) {
inline void chgSData(vluint32_t* oldp, SData newval, int bits) {
chgBus(oldp - this->oldp(0), newval, bits);
}
inline void chgIData(vluint32_t* oldp, IData newval, int bits) {
chgBus(oldp - this->oldp(0), newval, bits);
}
inline void chgQData(vluint32_t* oldp, QData newval, int bits) {
chgQuad(oldp - this->oldp(0), newval, bits);
}
void chgArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits) {
inline void chgWData(vluint32_t* oldp, const WData* newvalp, int bits) {
chgArray(oldp - this->oldp(0), newvalp, bits);
}
void chgFloat(vluint32_t* oldp, float newval) { chgFloat(oldp - this->oldp(0), newval); }
void chgDouble(vluint32_t* oldp, double newval) { chgDouble(oldp - this->oldp(0), newval); }
inline void chgFloat(vluint32_t* oldp, float newval) {
chgFloat(oldp - this->oldp(0), newval);
}
inline void chgDouble(vluint32_t* oldp, double newval) {
chgDouble(oldp - this->oldp(0), newval);
}
/// Inside dumping routines, dump one signal, faster when not inlined
/// due to code size reduction.

View File

@ -475,6 +475,16 @@ typedef unsigned long long vluint64_t; ///< 64-bit unsigned type
#else
# define VL_STRCASECMP strcasecmp
#endif
//=========================================================================
// Macros controlling target specific optimizations
// Define VL_PORTABLE_ONLY to disable all target specific optimizations
#ifndef VL_PORTABLE_ONLY
# ifdef __x86_64__
# define VL_X86_64 1
# endif
#endif // VL_PORTABLE_ONLY
// clang-format on
//=========================================================================

View File

@ -3550,20 +3550,23 @@ class EmitCTrace : EmitCStmts {
const bool full = (m_funcp->funcType() == AstCFuncType::TRACE_FULL
|| m_funcp->funcType() == AstCFuncType::TRACE_FULL_SUB);
const string func = full ? "full" : "chg";
bool emitWidth = false;
bool emitWidth = true;
if (nodep->dtypep()->basicp()->isDouble()) {
puts("vcdp->" + func + "Double");
emitWidth = false;
} else if (nodep->isWide() || emitTraceIsScBv(nodep) || emitTraceIsScBigUint(nodep)) {
puts("vcdp->" + func + "Array");
emitWidth = true;
puts("vcdp->" + func + "WData");
} else if (nodep->isQuad()) {
puts("vcdp->" + func + "Quad");
emitWidth = true;
puts("vcdp->" + func + "QData");
} else if (nodep->declp()->widthMin() > 16) {
puts("vcdp->" + func + "IData");
} else if (nodep->declp()->widthMin() > 8) {
puts("vcdp->" + func + "SData");
} else if (nodep->declp()->widthMin() > 1) {
puts("vcdp->" + func + "Bus");
emitWidth = true;
puts("vcdp->" + func + "CData");
} else {
puts("vcdp->" + func + "Bit");
emitWidth = false;
}
const uint32_t offset = (arrayindex < 0) ? 0 : (arrayindex * nodep->declp()->widthWords());

View File

@ -0,0 +1,28 @@
#!/usr/bin/perl
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2003-2009 by Wilson Snyder. This program is free software; you
# can redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
scenarios(vlt => 1);
top_filename("t/t_trace_array.v");
$Self->{golden_filename} = "t/t_trace_array_fst.out";
compile(
verilator_flags2 => ['--cc --trace-fst --trace-structs',
'-CFLAGS -DVL_PORTABLE_ONLY'],
);
execute(
check_finished => 1,
);
fst_identical($Self->trace_filename, $Self->{golden_filename});
ok(1);
1;

View File

@ -0,0 +1,38 @@
#!/usr/bin/perl
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2003-2009 by Wilson Snyder. This program is free software; you
# can redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
# Same test as t_trace_complex, but exercising the old VCD tracing API
scenarios(vlt => 1);
top_filename("t/t_trace_complex.v");
compile(
verilator_flags2 => ['--cc --trace -CFLAGS -DVL_PORTABLE_ONLY'],
);
execute(
check_finished => 1,
);
file_grep ("$Self->{obj_dir}/simx.vcd", qr/ v_strp /);
file_grep ("$Self->{obj_dir}/simx.vcd", qr/ v_strp_strp /);
file_grep ("$Self->{obj_dir}/simx.vcd", qr/ v_arrp /);
file_grep ("$Self->{obj_dir}/simx.vcd", qr/ v_arrp_arrp /);
file_grep ("$Self->{obj_dir}/simx.vcd", qr/ v_arrp_strp /);
file_grep ("$Self->{obj_dir}/simx.vcd", qr/ v_arru\(/);
file_grep ("$Self->{obj_dir}/simx.vcd", qr/ v_arru_arru\(/);
file_grep ("$Self->{obj_dir}/simx.vcd", qr/ v_arru_arrp\(/);
file_grep ("$Self->{obj_dir}/simx.vcd", qr/ v_arru_strp\(/);
vcd_identical ("$Self->{obj_dir}/simx.vcd", "t/t_trace_complex.out");
ok(1);
1;