Improve trace buffer memory utilization and performance.

Convert trace buffer to 32-bit entries, rather than a union containing a
pointer type. Also tweaked trace entry layouts for a bit more
performance. This gains another 10% on SweRV EH1 CoreMark.
This commit is contained in:
Geza Lore 2020-04-25 22:38:25 +01:00
parent b79ef672e1
commit dd967f7769
5 changed files with 147 additions and 140 deletions

View File

@ -81,36 +81,26 @@ public:
}
};
// Commands used by thread tracing. Note that the bottom 8 bits of all these
// values are empty and are used to store small amounts of additional command
// parameters. Anonymous enum in class, as we want it scoped, but we also
// want the automatic conversion to integer types.
// Commands used by thread tracing. Anonymous enum in class, as we want
// it scoped, but we also want the automatic conversion to integer types.
class VerilatedTraceCommand {
public:
// These must all fit in 4 bit at the moment, as the tracing routines
// pack parameters in the top bits.
enum {
CHG_BIT = 0x0000,
CHG_BUS = 0x0100,
CHG_QUAD = 0x0200,
CHG_ARRAY = 0x0300,
CHG_FLOAT = 0x0400,
CHG_DOUBLE = 0x0500,
TIME_CHANGE = 0x8000,
END = 0xf000, // End of buffer
SHUTDOWN = 0xf200 // Shutdown worker thread, also marks end of buffer
CHG_BIT_0 = 0x0,
CHG_BIT_1 = 0x1,
CHG_BUS = 0x2,
CHG_QUAD = 0x3,
CHG_ARRAY = 0x4,
CHG_FLOAT = 0x5,
CHG_DOUBLE = 0x6,
// TODO: full..
TIME_CHANGE = 0xd,
END = 0xe, // End of buffer
SHUTDOWN = 0xf // Shutdown worker thread, also marks end of buffer
};
};
typedef union {
vluint32_t cmd; // Command code + params in bottom 8 bits
vluint32_t* oldp; // Pointer to previous value buffer to consult/update
// Note: These are 64-bit wide, as this union already has a pointer type in it.
vluint64_t params; // Command parameter
// New signal value in various forms
vluint64_t newBits;
float newFloat;
double newDouble;
vluint64_t timeui;
} VerilatedTraceEntry;
#endif
class VerilatedTraceCallInfo;
@ -149,18 +139,18 @@ private:
size_t m_traceBufferSize;
// Buffers handed to worker for processing
VerilatedThreadQueue<VerilatedTraceEntry*> m_buffersToWorker;
VerilatedThreadQueue<vluint32_t*> m_buffersToWorker;
// Buffers returned from worker after processing
VerilatedThreadQueue<VerilatedTraceEntry*> m_buffersFromWorker;
VerilatedThreadQueue<vluint32_t*> m_buffersFromWorker;
// Get a new trace buffer that can be populated. May block if none available
VerilatedTraceEntry* getTraceBuffer();
vluint32_t* getTraceBuffer();
// Write pointer into current buffer
VerilatedTraceEntry* m_traceBufferWritep;
vluint32_t* m_traceBufferWritep;
// End of trace buffer
VerilatedTraceEntry* m_traceBufferEndp;
vluint32_t* m_traceBufferEndp;
// The worker thread itself
std::unique_ptr<std::thread> m_workerThread;
@ -169,7 +159,7 @@ private:
void workerThreadMain();
// Wait until given buffer is placed in m_buffersFromWorker
void waitForBuffer(const VerilatedTraceEntry* bufferp);
void waitForBuffer(const vluint32_t* bufferp);
// Shut down and join worker, if it's running, otherwise do nothing
void shutdownWorker();
@ -280,48 +270,47 @@ public:
#ifdef VL_TRACE_THREADED
// Threaded tracing. Just dump everything in the trace buffer
inline void chgBit(vluint32_t* oldp, vluint32_t newval) {
m_traceBufferWritep[0].cmd = VerilatedTraceCommand::CHG_BIT | newval;
m_traceBufferWritep[1].oldp = oldp;
inline void chgBit(vluint32_t code, vluint32_t newval) {
m_traceBufferWritep[0] = VerilatedTraceCommand::CHG_BIT_0 | newval;
m_traceBufferWritep[1] = code;
m_traceBufferWritep += 2;
VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
}
inline void chgBus(vluint32_t* oldp, vluint32_t newval, int bits) {
m_traceBufferWritep[0].cmd = VerilatedTraceCommand::CHG_BUS | bits;
m_traceBufferWritep[1].oldp = oldp;
m_traceBufferWritep[2].newBits = newval;
inline void chgBus(vluint32_t code, vluint32_t newval, int bits) {
m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_BUS;
m_traceBufferWritep[1] = code;
m_traceBufferWritep[2] = newval;
m_traceBufferWritep += 3;
VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
}
inline void chgQuad(vluint32_t* oldp, vluint64_t newval, int bits) {
m_traceBufferWritep[0].cmd = VerilatedTraceCommand::CHG_QUAD | bits;
m_traceBufferWritep[1].oldp = oldp;
m_traceBufferWritep[2].newBits = newval;
inline void chgQuad(vluint32_t code, vluint64_t newval, int bits) {
m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_QUAD;
m_traceBufferWritep[1] = code;
*reinterpret_cast<vluint64_t*>(m_traceBufferWritep + 2) = newval;
m_traceBufferWritep += 4;
VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
}
inline void chgArray(vluint32_t code, const vluint32_t* newvalp, int bits) {
m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_ARRAY;
m_traceBufferWritep[1] = code;
m_traceBufferWritep += 2;
for (int i = 0; i < (bits + 31) / 32; ++i) { *m_traceBufferWritep++ = newvalp[i]; }
VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
}
inline void chgFloat(vluint32_t code, float newval) {
m_traceBufferWritep[0] = VerilatedTraceCommand::CHG_FLOAT;
m_traceBufferWritep[1] = code;
// cppcheck-suppress invalidPointerCast
*reinterpret_cast<float*>(m_traceBufferWritep + 2) = newval;
m_traceBufferWritep += 3;
VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
}
inline void chgArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits) {
m_traceBufferWritep[0].cmd = VerilatedTraceCommand::CHG_ARRAY;
m_traceBufferWritep[1].oldp = oldp;
m_traceBufferWritep[2].params = bits;
m_traceBufferWritep += 3;
vluint32_t* const wp = reinterpret_cast<vluint32_t*>(m_traceBufferWritep);
for (int i = 0; i < (bits + 31) / 32; ++i) { wp[i] = newvalp[i]; }
m_traceBufferWritep += (bits + 63) / 64;
VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
}
inline void chgFloat(vluint32_t* oldp, float newval) {
m_traceBufferWritep[0].cmd = VerilatedTraceCommand::CHG_FLOAT;
m_traceBufferWritep[1].oldp = oldp;
m_traceBufferWritep[2].newFloat = newval;
m_traceBufferWritep += 3;
VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
}
inline void chgDouble(vluint32_t* oldp, double newval) {
m_traceBufferWritep[0].cmd = VerilatedTraceCommand::CHG_DOUBLE;
m_traceBufferWritep[1].oldp = oldp;
m_traceBufferWritep[2].newDouble = newval;
m_traceBufferWritep += 3;
inline void chgDouble(vluint32_t code, double newval) {
m_traceBufferWritep[0] = VerilatedTraceCommand::CHG_DOUBLE;
m_traceBufferWritep[1] = code;
// cppcheck-suppress invalidPointerCast
*reinterpret_cast<double*>(m_traceBufferWritep + 2) = newval;
m_traceBufferWritep += 4;
VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
}

View File

@ -101,8 +101,8 @@ public: // This is in .cpp file so is not widely visible
//=========================================================================
// Buffer management
template <> VerilatedTraceEntry* VerilatedTrace<VL_DERIVED_T>::getTraceBuffer() {
VerilatedTraceEntry* bufferp;
template <> vluint32_t* VerilatedTrace<VL_DERIVED_T>::getTraceBuffer() {
vluint32_t* bufferp;
// Some jitter is expected, so some number of alternative trace buffers are
// required, but don't allocate more than 8 buffers.
if (m_numTraceBuffers < 8) {
@ -111,7 +111,7 @@ template <> VerilatedTraceEntry* VerilatedTrace<VL_DERIVED_T>::getTraceBuffer()
++m_numTraceBuffers;
// Note: over allocate a bit so pointer comparison is well defined
// if we overflow only by a small amount
bufferp = new VerilatedTraceEntry[m_traceBufferSize + 16];
bufferp = new vluint32_t[m_traceBufferSize + 16];
}
} else {
// Block until a buffer becomes available
@ -120,10 +120,10 @@ template <> VerilatedTraceEntry* VerilatedTrace<VL_DERIVED_T>::getTraceBuffer()
return bufferp;
}
template <> void VerilatedTrace<VL_DERIVED_T>::waitForBuffer(const VerilatedTraceEntry* buffp) {
template <> void VerilatedTrace<VL_DERIVED_T>::waitForBuffer(const vluint32_t* buffp) {
// Slow path code only called on flush/shutdown, so use a simple algorithm.
// Collect buffers from worker and stash them until we get the one we want.
std::deque<VerilatedTraceEntry*> stash;
std::deque<vluint32_t*> stash;
do { stash.push_back(m_buffersFromWorker.get()); } while (stash.back() != buffp);
// Now put them back in the queue, in the original order.
while (!stash.empty()) {
@ -136,65 +136,71 @@ template <> void VerilatedTrace<VL_DERIVED_T>::waitForBuffer(const VerilatedTrac
// Worker thread
template <> void VerilatedTrace<VL_DERIVED_T>::workerThreadMain() {
while (true) {
VerilatedTraceEntry* const bufferp = m_buffersToWorker.get();
bool shutdown = false;
do {
vluint32_t* const bufferp = m_buffersToWorker.get();
VL_TRACE_THREAD_DEBUG("");
VL_TRACE_THREAD_DEBUG("Got buffer");
VL_TRACE_THREAD_DEBUG("Got buffer: " << bufferp);
const VerilatedTraceEntry* readp = bufferp;
vluint32_t cmd;
unsigned bits;
vluint32_t* oldp;
vluint64_t newBits;
const vluint32_t* readp = bufferp;
while (true) {
cmd = (readp++)->cmd;
const vluint32_t cmd = readp[0];
const vluint32_t top = cmd >> 4;
// Always set this up, as it is almost always needed
vluint32_t* const oldp = m_sigs_oldvalp + readp[1];
// Note this increment needs to be undone on commands which do not
// actually contain a code, but those are the rare cases.
readp += 2;
switch (cmd & ~0xFFU) {
switch (cmd & 0xF) {
//===
// CHG_* commands
case VerilatedTraceCommand::CHG_BIT:
VL_TRACE_THREAD_DEBUG("Command CHG_BIT");
chgBitImpl((readp++)->oldp, cmd & 1);
case VerilatedTraceCommand::CHG_BIT_0:
VL_TRACE_THREAD_DEBUG("Command CHG_BIT_0 " << top);
chgBitImpl(oldp, 0);
continue;
case VerilatedTraceCommand::CHG_BIT_1:
VL_TRACE_THREAD_DEBUG("Command CHG_BIT_1 " << top);
chgBitImpl(oldp, 1);
continue;
case VerilatedTraceCommand::CHG_BUS:
VL_TRACE_THREAD_DEBUG("Command CHG_BUS");
VL_TRACE_THREAD_DEBUG("Command CHG_BUS " << top);
// Bits stored in bottom byte of command
chgBusImpl(readp[0].oldp, readp[1].newBits, cmd & 0xFFULL);
readp += 2;
VL_TRACE_THREAD_DEBUG("Command CHG_BUS DONE");
chgBusImpl(oldp, *readp, top);
readp += 1;
continue;
case VerilatedTraceCommand::CHG_QUAD:
VL_TRACE_THREAD_DEBUG("Command CHG_QUAD");
VL_TRACE_THREAD_DEBUG("Command CHG_QUAD " << top);
// Bits stored in bottom byte of command
chgQuadImpl(readp[0].oldp, readp[1].newBits, cmd & 0xFFULL);
chgQuadImpl(oldp, *reinterpret_cast<const vluint64_t*>(readp), top);
readp += 2;
continue;
case VerilatedTraceCommand::CHG_ARRAY:
VL_TRACE_THREAD_DEBUG("Command CHG_CHG_ARRAY");
oldp = (readp++)->oldp;
bits = (readp++)->params;
chgArrayImpl(oldp, reinterpret_cast<const vluint32_t*>(readp), bits);
readp += (bits + 63) / 64;
VL_TRACE_THREAD_DEBUG("Command CHG_ARRAY " << top);
chgArrayImpl(oldp, readp, top);
readp += (top + 31) / 32;
continue;
case VerilatedTraceCommand::CHG_FLOAT:
VL_TRACE_THREAD_DEBUG("Command CHG_FLOAT");
chgFloatImpl(readp[0].oldp, readp[1].newFloat);
readp += 2;
VL_TRACE_THREAD_DEBUG("Command CHG_FLOAT " << top);
chgFloatImpl(oldp, *reinterpret_cast<const float*>(readp));
readp += 1;
continue;
case VerilatedTraceCommand::CHG_DOUBLE:
VL_TRACE_THREAD_DEBUG("Command CHG_DOUBLE");
chgDoubleImpl(readp[0].oldp, readp[1].newDouble);
VL_TRACE_THREAD_DEBUG("Command CHG_DOUBLE " << top);
chgDoubleImpl(oldp, *reinterpret_cast<const double*>(readp));
readp += 2;
continue;
//===
// Rare commands
case VerilatedTraceCommand::TIME_CHANGE:
VL_TRACE_THREAD_DEBUG("Command TIME_CHANGE");
emitTimeChange((readp++)->timeui);
VL_TRACE_THREAD_DEBUG("Command TIME_CHANGE " << top);
readp -= 1; // No code in this command, undo increment
emitTimeChange(*reinterpret_cast<const vluint64_t*>(readp));
readp += 2;
continue;
//===
@ -202,11 +208,13 @@ template <> void VerilatedTrace<VL_DERIVED_T>::workerThreadMain() {
case VerilatedTraceCommand::END: VL_TRACE_THREAD_DEBUG("Command END"); break;
case VerilatedTraceCommand::SHUTDOWN:
VL_TRACE_THREAD_DEBUG("Command SHUTDOWN");
shutdown = true;
break;
//===
// Unknown command
default:
VL_TRACE_THREAD_DEBUG("Command UNKNOWN");
VL_PRINTF_MT("Trace command: 0x%08x\n", cmd);
VL_FATAL_MT(__FILE__, __LINE__, "", "Unknown trace command");
break;
@ -221,10 +229,7 @@ template <> void VerilatedTrace<VL_DERIVED_T>::workerThreadMain() {
// Return buffer
m_buffersFromWorker.put(bufferp);
// Shut down if required
if (VL_UNLIKELY(cmd == VerilatedTraceCommand::SHUTDOWN)) return;
}
} while (VL_LIKELY(!shutdown));
}
template <> void VerilatedTrace<VL_DERIVED_T>::shutdownWorker() {
@ -232,8 +237,8 @@ template <> void VerilatedTrace<VL_DERIVED_T>::shutdownWorker() {
if (!m_workerThread) return;
// Hand an buffer with a shutdown command to the worker thread
VerilatedTraceEntry* const bufferp = getTraceBuffer();
bufferp[0].cmd = VerilatedTraceCommand::SHUTDOWN;
vluint32_t* const bufferp = getTraceBuffer();
bufferp[0] = VerilatedTraceCommand::SHUTDOWN;
m_buffersToWorker.put(bufferp);
// Wait for it to return
waitForBuffer(bufferp);
@ -260,8 +265,8 @@ template <> void VerilatedTrace<VL_DERIVED_T>::close() {
template <> void VerilatedTrace<VL_DERIVED_T>::flush() {
#ifdef VL_TRACE_THREADED
// Hand an empty buffer to the worker thread
VerilatedTraceEntry* const bufferp = getTraceBuffer();
bufferp[0].cmd = VerilatedTraceCommand::END;
vluint32_t* const bufferp = getTraceBuffer();
*bufferp = VerilatedTraceCommand::END;
m_buffersToWorker.put(bufferp);
// Wait for it to be returned. As the processing is in-order,
// this ensures all previous buffers have been processed.
@ -333,10 +338,10 @@ template <> void VerilatedTrace<VL_DERIVED_T>::traceInit() VL_MT_UNSAFE {
#ifdef VL_TRACE_THREADED
// Compute trace buffer size. we need to be able to store a new value for
// each signal, which is 'nextCode()' entries after the init callbacks
// above have been run, plus up to 3 more words of metadata per signal,
// plus fixed overhead of 1 for a termination flag and 2 for a time stamp
// above have been run, plus up to 2 more words of metadata per signal,
// plus fixed overhead of 1 for a termination flag and 3 for a time stamp
// update.
m_traceBufferSize = nextCode() + numSignals() * 3 + 3;
m_traceBufferSize = nextCode() + numSignals() * 2 + 4;
// Start the worker thread
m_workerThread.reset(new std::thread(&VerilatedTrace<VL_DERIVED_T>::workerThreadMain, this));
@ -406,18 +411,21 @@ template <> void VerilatedTrace<VL_DERIVED_T>::dump(vluint64_t timeui) {
}
#ifdef VL_TRACE_THREADED
// Get the trace buffer we are about to fill
VerilatedTraceEntry* const bufferp = getTraceBuffer();
m_traceBufferWritep = bufferp;
m_traceBufferEndp = bufferp + m_traceBufferSize;
// Currently only incremental dumps run on the worker thread
vluint32_t* bufferp = nullptr;
if (VL_LIKELY(!m_fullDump)) {
// Get the trace buffer we are about to fill
bufferp = getTraceBuffer();
m_traceBufferWritep = bufferp;
m_traceBufferEndp = bufferp + m_traceBufferSize;
// Tell worker to update time point
(m_traceBufferWritep++)->cmd = VerilatedTraceCommand::TIME_CHANGE;
(m_traceBufferWritep++)->timeui = timeui;
m_traceBufferWritep[0] = VerilatedTraceCommand::TIME_CHANGE;
*reinterpret_cast<vluint64_t*>(m_traceBufferWritep + 1) = timeui;
m_traceBufferWritep += 3;
} else {
// Update time point
flush();
emitTimeChange(timeui);
}
#else
@ -440,14 +448,16 @@ template <> void VerilatedTrace<VL_DERIVED_T>::dump(vluint64_t timeui) {
}
#ifdef VL_TRACE_THREADED
// Mark end of the trace buffer we just filled
(m_traceBufferWritep++)->cmd = VerilatedTraceCommand::END;
if (VL_LIKELY(bufferp)) {
// Mark end of the trace buffer we just filled
*m_traceBufferWritep++ = VerilatedTraceCommand::END;
// Assert no buffer overflow
assert(m_traceBufferWritep - bufferp <= m_traceBufferSize);
// Assert no buffer overflow
assert(m_traceBufferWritep - bufferp <= m_traceBufferSize);
// Pass it to the worker thread
m_buffersToWorker.put(bufferp);
// Pass it to the worker thread
m_buffersToWorker.put(bufferp);
}
#endif
}

View File

@ -3547,29 +3547,30 @@ class EmitCTrace : EmitCStmts {
void emitTraceChangeOne(AstTraceInc* nodep, int arrayindex) {
iterateAndNextNull(nodep->precondsp());
string full = ((m_funcp->funcType() == AstCFuncType::TRACE_FULL
|| m_funcp->funcType() == AstCFuncType::TRACE_FULL_SUB)
? "full"
: "chg");
const bool full = (m_funcp->funcType() == AstCFuncType::TRACE_FULL
|| m_funcp->funcType() == AstCFuncType::TRACE_FULL_SUB);
const string func = full ? "full" : "chg";
bool emitWidth = false;
if (nodep->dtypep()->basicp()->isDouble()) {
puts("vcdp->" + full + "Double");
puts("vcdp->" + func + "Double");
} else if (nodep->isWide() || emitTraceIsScBv(nodep) || emitTraceIsScBigUint(nodep)) {
puts("vcdp->" + full + "Array");
puts("vcdp->" + func + "Array");
emitWidth = true;
} else if (nodep->isQuad()) {
puts("vcdp->" + full + "Quad");
puts("vcdp->" + func + "Quad");
emitWidth = true;
} else if (nodep->declp()->widthMin() > 1) {
puts("vcdp->" + full + "Bus");
puts("vcdp->" + func + "Bus");
emitWidth = true;
} else {
puts("vcdp->" + full + "Bit");
puts("vcdp->" + func + "Bit");
}
const uint32_t offset = (arrayindex < 0) ? 0 : (arrayindex * nodep->declp()->widthWords());
const uint32_t code = nodep->declp()->code() + offset;
puts("(oldp+" + cvtToStr(code - m_baseCode) + ",");
puts(v3Global.opt.trueTraceThreads() && !full ? "(base+" : "(oldp+");
puts(cvtToStr(code - m_baseCode));
puts(",");
emitTraceValue(nodep, arrayindex);
if (emitWidth) puts("," + cvtToStr(nodep->declp()->widthMin()));
puts(");\n");
@ -3648,8 +3649,14 @@ class EmitCTrace : EmitCStmts {
nodep->stmtsp()->v3fatalSrc("Trace sub function should contain AstTraceInc");
}
m_baseCode = stmtp->declp()->code();
puts("vluint32_t* oldp = vcdp->oldp(code+" + cvtToStr(m_baseCode) + ");\n");
puts("if (false && vcdp && oldp) {} // Prevent unused\n");
if (v3Global.opt.trueTraceThreads()
&& nodep->funcType() == AstCFuncType::TRACE_CHANGE_SUB) {
puts("vluint32_t base = code+" + cvtToStr(m_baseCode) + ";\n");
puts("if (false && vcdp && base) {} // Prevent unused\n");
} else {
puts("vluint32_t* oldp = vcdp->oldp(code+" + cvtToStr(m_baseCode) + ");\n");
puts("if (false && vcdp && oldp) {} // Prevent unused\n");
}
} else if (nodep->funcType() == AstCFuncType::TRACE_INIT_SUB) {
puts("int c = code;\n");
puts("if (false && vcdp && c) {} // Prevent unused\n");

View File

@ -69,9 +69,7 @@ public:
of.puts("\n");
of.puts("# Tracing threaded output mode? 0/1/N threads (from --trace-thread)\n");
of.puts("VM_TRACE_THREADS = ");
of.puts(!v3Global.opt.traceThreads()
? "0"
: cvtToStr(v3Global.opt.traceThreads() - v3Global.opt.traceFormat().fst()));
of.puts(cvtToStr(v3Global.opt.trueTraceThreads()));
of.puts("\n");
of.puts("# Separate FST writer thread? 0/1 (from --trace-fst with --trace-thread > 0)\n");
of.puts("VM_TRACE_FST_WRITER_THREAD = ");

View File

@ -490,6 +490,9 @@ public:
int traceMaxArray() const { return m_traceMaxArray; }
int traceMaxWidth() const { return m_traceMaxWidth; }
int traceThreads() const { return m_traceThreads; }
bool trueTraceThreads() const {
return traceThreads() == 0 ? 0 : traceThreads() - traceFormat().fst();
}
int unrollCount() const { return m_unrollCount; }
int unrollStmts() const { return m_unrollStmts; }