forked from github/verilator
c52f3349d1
The --trace-threads option can now be used to perform tracing on a thread separate from the main thread when using VCD tracing (with --trace-threads 1). For FST tracing --trace-threads can be 1 or 2, and --trace-fst --trace-threads 1 is the same a what --trace-fst-threads used to be (which is now deprecated). Performance numbers on SweRV EH1 CoreMark, clang 6.0.0, Intel i7-3770 @ 3.40GHz, IO to ramdisk, with numactl set to schedule threads on different physical cores. Relative speedup: --trace -> --trace --trace-threads 1 +22% --trace-fst -> --trace-fst --trace-threads 1 +38% (as --trace-fst-thread) --trace-fst -> --trace-fst --trace-threads 2 +93% Speed relative to --trace with no threaded tracing: --trace 1.00 x --trace --trace-threads 1 0.82 x --trace-fst 1.79 x --trace-fst --trace-threads 1 1.23 x --trace-fst --trace-threads 2 0.87 x This means FST tracing with 2 extra threads is now faster than single threaded VCD tracing, and is on par with threaded VCD tracing. You do pay for it in total compute though as --trace-fst --trace-threads 2 uses about 240% CPU vs 150% for --trace-fst --trace-threads 1, and 155% for --trace --trace threads 1. Still for interactive use it should be helpful with large designs.
370 lines
14 KiB
C++
370 lines
14 KiB
C++
// -*- mode: C++; c-file-style: "cc-mode" -*-
|
|
//=============================================================================
|
|
//
|
|
// THIS MODULE IS PUBLICLY LICENSED
|
|
//
|
|
// Copyright 2001-2020 by Wilson Snyder. This program is free software; you
|
|
// can redistribute it and/or modify it under the terms of either the GNU
|
|
// Lesser General Public License Version 3 or the Perl Artistic License
|
|
// Version 2.0.
|
|
// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
|
|
//
|
|
//=============================================================================
|
|
///
|
|
/// \file
|
|
/// \brief Tracing functionality common to all formats
|
|
///
|
|
//=============================================================================
|
|
// SPDIFF_OFF
|
|
|
|
#ifndef _VERILATED_TRACE_H_
|
|
#define _VERILATED_TRACE_H_ 1
|
|
|
|
// clang-format off
|
|
|
|
#include "verilated.h"
|
|
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#ifdef VL_TRACE_THREADED
|
|
# include <condition_variable>
|
|
# include <deque>
|
|
# include <thread>
|
|
#endif
|
|
|
|
// clang-format on
|
|
|
|
#ifdef VL_TRACE_THREADED
|
|
//=============================================================================
|
|
// Threaded tracing
|
|
|
|
// A simple synchronized first in first out queue
|
|
template <class T> class VerilatedThreadQueue {
|
|
private:
|
|
VerilatedMutex m_mutex; // Protects m_queue
|
|
std::condition_variable_any m_cv;
|
|
std::deque<T> m_queue VL_GUARDED_BY(m_mutex);
|
|
|
|
public:
|
|
// Put an element at the back of the queue
|
|
void put(T value) {
|
|
VerilatedLockGuard lock(m_mutex);
|
|
m_queue.push_back(value);
|
|
m_cv.notify_one();
|
|
}
|
|
|
|
// Put an element at the front of the queue
|
|
void put_front(T value) {
|
|
VerilatedLockGuard lock(m_mutex);
|
|
m_queue.push_front(value);
|
|
m_cv.notify_one();
|
|
}
|
|
|
|
// Get an element from the front of the queue. Blocks if none available
|
|
T get() {
|
|
VerilatedLockGuard lock(m_mutex);
|
|
m_cv.wait(lock, [this]() VL_REQUIRES(m_mutex) { return !m_queue.empty(); });
|
|
assert(!m_queue.empty());
|
|
T value = m_queue.front();
|
|
m_queue.pop_front();
|
|
return value;
|
|
}
|
|
|
|
// Non blocking get
|
|
bool tryGet(T& result) {
|
|
VerilatedLockGuard lockGuard(m_mutex);
|
|
if (m_queue.empty()) { return false; }
|
|
result = m_queue.front();
|
|
m_queue.pop_front();
|
|
return true;
|
|
}
|
|
};
|
|
|
|
// Commands used by thread tracing. Note that the bottom 8 bits of all these
|
|
// values are empty and are used to store small amounts of additional command
|
|
// parameters. Anonymous enum in class, as we want it scoped, but we also
|
|
// want the automatic conversion to integer types.
|
|
class VerilatedTraceCommand {
|
|
public:
|
|
enum {
|
|
CHG_BIT = 0x0000,
|
|
CHG_BUS = 0x0100,
|
|
CHG_QUAD = 0x0200,
|
|
CHG_ARRAY = 0x0300,
|
|
CHG_FLOAT = 0x0400,
|
|
CHG_DOUBLE = 0x0500,
|
|
TIME_CHANGE = 0x8000,
|
|
END = 0xf000, // End of buffer
|
|
SHUTDOWN = 0xf200 // Shutdown worker thread, also marks end of buffer
|
|
};
|
|
};
|
|
|
|
typedef union {
|
|
vluint32_t cmd; // Command code + params in bottom 8 bits
|
|
vluint32_t* oldp; // Pointer to previous value buffer to consult/update
|
|
// Note: These are 64-bit wide, as this union already has a pointer type in it.
|
|
vluint64_t params; // Command parameter
|
|
// New signal value in various forms
|
|
vluint64_t newBits;
|
|
float newFloat;
|
|
double newDouble;
|
|
vluint64_t timeui;
|
|
} VerilatedTraceEntry;
|
|
#endif
|
|
|
|
class VerilatedTraceCallInfo;
|
|
|
|
//=============================================================================
|
|
// VerilatedTrace
|
|
|
|
// VerilatedTrace uses F-bounded polymorphism to access duck-typed
|
|
// implementations in the format specific derived class, which must be passed
|
|
// as the type parameter T_Derived
|
|
template <class T_Derived> class VerilatedTrace {
|
|
private:
|
|
//=========================================================================
|
|
// Generic tracing internals
|
|
|
|
vluint32_t* m_sigs_oldvalp; ///< Old value store
|
|
vluint64_t m_timeLastDump; ///< Last time we did a dump
|
|
std::vector<VerilatedTraceCallInfo*> m_callbacks; ///< Routines to perform dumping
|
|
bool m_fullDump; ///< Whether a full dump is required on the next call to 'dump'
|
|
vluint32_t m_nextCode; ///< Next code number to assign
|
|
vluint32_t m_numSignals; ///< Number of distinct signals
|
|
std::string m_moduleName; ///< Name of module being trace initialized now
|
|
char m_scopeEscape;
|
|
double m_timeRes; ///< Time resolution (ns/ms etc)
|
|
double m_timeUnit; ///< Time units (ns/ms etc)
|
|
|
|
// Equivalent to 'this' but is of the sub-type 'T_Derived*'. Use 'self()->'
|
|
// to access duck-typed functions to avoid a virtual function call.
|
|
T_Derived* self() { return static_cast<T_Derived*>(this); }
|
|
|
|
#ifdef VL_TRACE_THREADED
|
|
// Number of total trace buffers that have been allocated
|
|
vluint32_t m_numTraceBuffers;
|
|
|
|
// Size of trace buffers
|
|
size_t m_traceBufferSize;
|
|
|
|
// Buffers handed to worker for processing
|
|
VerilatedThreadQueue<VerilatedTraceEntry*> m_buffersToWorker;
|
|
// Buffers returned from worker after processing
|
|
VerilatedThreadQueue<VerilatedTraceEntry*> m_buffersFromWorker;
|
|
|
|
// Get a new trace buffer that can be populated. May block if none available
|
|
VerilatedTraceEntry* getTraceBuffer();
|
|
|
|
// Write pointer into current buffer
|
|
VerilatedTraceEntry* m_traceBufferWritep;
|
|
|
|
// End of trace buffer
|
|
VerilatedTraceEntry* m_traceBufferEndp;
|
|
|
|
// The worker thread itself
|
|
std::unique_ptr<std::thread> m_workerThread;
|
|
|
|
// The function executed by the worker thread
|
|
void workerThreadMain();
|
|
|
|
// Wait until given buffer is placed in m_buffersFromWorker
|
|
void waitForBuffer(const VerilatedTraceEntry* bufferp);
|
|
|
|
// Shut down and join worker, if it's running, otherwise do nothing
|
|
void shutdownWorker();
|
|
#endif
|
|
|
|
// CONSTRUCTORS
|
|
VL_UNCOPYABLE(VerilatedTrace);
|
|
|
|
protected:
|
|
//=========================================================================
|
|
// Internals available to format specific implementations
|
|
|
|
VerilatedAssertOneThread m_assertOne; ///< Assert only called from single thread
|
|
|
|
vluint32_t nextCode() const { return m_nextCode; }
|
|
vluint32_t numSignals() const { return m_numSignals; }
|
|
const std::string& moduleName() const { return m_moduleName; }
|
|
void fullDump(bool value) { m_fullDump = value; }
|
|
vluint64_t timeLastDump() { return m_timeLastDump; }
|
|
|
|
double timeRes() const { return m_timeRes; }
|
|
double timeUnit() const { return m_timeUnit; }
|
|
std::string timeResStr() const;
|
|
std::string timeUnitStr() const;
|
|
|
|
void traceInit() VL_MT_UNSAFE;
|
|
|
|
void declCode(vluint32_t code, vluint32_t bits, bool tri);
|
|
|
|
/// Is this an escape?
|
|
bool isScopeEscape(char c) { return isspace(c) || c == m_scopeEscape; }
|
|
/// Character that splits scopes. Note whitespace are ALWAYS escapes.
|
|
char scopeEscape() { return m_scopeEscape; }
|
|
|
|
void close();
|
|
void flush();
|
|
|
|
//=========================================================================
|
|
// Virtual functions to be provided by the format specific implementation
|
|
|
|
// Called when the trace moves forward to a new time point
|
|
virtual void emitTimeChange(vluint64_t timeui) = 0;
|
|
|
|
// These hooks are called before a full or change based dump is produced.
|
|
// The return value indicates whether to proceed with the dump.
|
|
virtual bool preFullDump() { return true; }
|
|
virtual bool preChangeDump() { return true; }
|
|
|
|
public:
|
|
//=========================================================================
|
|
// External interface to client code
|
|
|
|
explicit VerilatedTrace();
|
|
~VerilatedTrace();
|
|
|
|
// Set time units (s/ms, defaults to ns)
|
|
void set_time_unit(const char* unitp);
|
|
void set_time_unit(const std::string& unit);
|
|
// Set time resolution (s/ms, defaults to ns)
|
|
void set_time_resolution(const char* unitp);
|
|
void set_time_resolution(const std::string& unit);
|
|
|
|
// Call
|
|
void dump(vluint64_t timeui);
|
|
|
|
//=========================================================================
|
|
// Non-hot path internal interface to Verilator generated code
|
|
|
|
typedef void (*callback_t)(T_Derived* tracep, void* userthis, vluint32_t code);
|
|
|
|
void changeThread() { m_assertOne.changeThread(); }
|
|
|
|
void addCallback(callback_t initcb, callback_t fullcb, callback_t changecb,
|
|
void* userthis) VL_MT_UNSAFE_ONE;
|
|
|
|
void module(const std::string& name) VL_MT_UNSAFE_ONE {
|
|
m_assertOne.check();
|
|
m_moduleName = name;
|
|
}
|
|
|
|
void scopeEscape(char flag) { m_scopeEscape = flag; }
|
|
|
|
//=========================================================================
|
|
// Hot path internal interface to Verilator generated code
|
|
|
|
// Implementation note: We rely on the following duck-typed implementations
|
|
// in the derived class T_Derived. These emit* functions record a format
|
|
// specific trace entry. Normally one would use pure virtual functions for
|
|
// these here, but we cannot afford dynamic dispatch for calling these as
|
|
// this is very hot code during tracing.
|
|
|
|
// duck-typed void emitBit(vluint32_t code, vluint32_t newval) = 0;
|
|
// duck-typed template <int T_Bits> void emitBus(vluint32_t code, vluint32_t newval) = 0;
|
|
// duck-typed void emitQuad(vluint32_t code, vluint64_t newval, int bits) = 0;
|
|
// duck-typed void emitArray(vluint32_t code, const vluint32_t* newvalp, int bits) = 0;
|
|
// duck-typed void emitFloat(vluint32_t code, float newval) = 0;
|
|
// duck-typed void emitDouble(vluint32_t code, double newval) = 0;
|
|
|
|
vluint32_t* oldp(vluint32_t code) { return m_sigs_oldvalp + code; }
|
|
|
|
// Write to previous value buffer value and emit trace entry.
|
|
void fullBit(vluint32_t* oldp, vluint32_t newval);
|
|
template <int T_Bits> void fullBus(vluint32_t* oldp, vluint32_t newval);
|
|
void fullQuad(vluint32_t* oldp, vluint64_t newval, int bits);
|
|
void fullArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits);
|
|
void fullFloat(vluint32_t* oldp, float newval);
|
|
void fullDouble(vluint32_t* oldp, double newval);
|
|
|
|
#ifdef VL_TRACE_THREADED
|
|
// Threaded tracing. Just dump everything in the trace buffer
|
|
inline void chgBit(vluint32_t* oldp, vluint32_t newval) {
|
|
m_traceBufferWritep[0].cmd = VerilatedTraceCommand::CHG_BIT | newval;
|
|
m_traceBufferWritep[1].oldp = oldp;
|
|
m_traceBufferWritep += 2;
|
|
VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
|
|
}
|
|
template <int T_Bits> inline void chgBus(vluint32_t* oldp, vluint32_t newval) {
|
|
m_traceBufferWritep[0].cmd = VerilatedTraceCommand::CHG_BUS | T_Bits;
|
|
m_traceBufferWritep[1].oldp = oldp;
|
|
m_traceBufferWritep[2].newBits = newval;
|
|
m_traceBufferWritep += 3;
|
|
VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
|
|
}
|
|
inline void chgQuad(vluint32_t* oldp, vluint64_t newval, int bits) {
|
|
m_traceBufferWritep[0].cmd = VerilatedTraceCommand::CHG_QUAD | bits;
|
|
m_traceBufferWritep[1].oldp = oldp;
|
|
m_traceBufferWritep[2].newBits = newval;
|
|
m_traceBufferWritep += 3;
|
|
VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
|
|
}
|
|
inline void chgArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits) {
|
|
m_traceBufferWritep[0].cmd = VerilatedTraceCommand::CHG_ARRAY;
|
|
m_traceBufferWritep[1].oldp = oldp;
|
|
m_traceBufferWritep[2].params = bits;
|
|
m_traceBufferWritep += 3;
|
|
vluint32_t* const wp = reinterpret_cast<vluint32_t*>(m_traceBufferWritep);
|
|
for (int i = 0; i < (bits + 31) / 32; ++i) { wp[i] = newvalp[i]; }
|
|
m_traceBufferWritep += (bits + 63) / 64;
|
|
VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
|
|
}
|
|
inline void chgFloat(vluint32_t* oldp, float newval) {
|
|
m_traceBufferWritep[0].cmd = VerilatedTraceCommand::CHG_FLOAT;
|
|
m_traceBufferWritep[1].oldp = oldp;
|
|
m_traceBufferWritep[2].newFloat = newval;
|
|
m_traceBufferWritep += 3;
|
|
VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
|
|
}
|
|
inline void chgDouble(vluint32_t* oldp, double newval) {
|
|
m_traceBufferWritep[0].cmd = VerilatedTraceCommand::CHG_DOUBLE;
|
|
m_traceBufferWritep[1].oldp = oldp;
|
|
m_traceBufferWritep[2].newDouble = newval;
|
|
m_traceBufferWritep += 3;
|
|
VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
|
|
}
|
|
|
|
#define CHG(name) chg##name##Impl
|
|
#else
|
|
#define CHG(name) chg##name
|
|
#endif
|
|
|
|
// In non-threaded mode, these are called directly by the trace callbacks,
|
|
// and are called chg*. In threaded mode, they are called by the worker
|
|
// thread and are called chg*Impl
|
|
|
|
// Check previous dumped value of signal. If changed, then emit trace entry
|
|
inline void CHG(Bit)(vluint32_t* oldp, vluint32_t newval) {
|
|
const vluint32_t diff = *oldp ^ newval;
|
|
if (VL_UNLIKELY(diff)) fullBit(oldp, newval);
|
|
}
|
|
template <int T_Bits> inline void CHG(Bus)(vluint32_t* oldp, vluint32_t newval) {
|
|
const vluint32_t diff = *oldp ^ newval;
|
|
if (VL_UNLIKELY(diff)) fullBus<T_Bits>(oldp, newval);
|
|
}
|
|
inline void CHG(Quad)(vluint32_t* oldp, vluint64_t newval, int bits) {
|
|
const vluint64_t diff = *reinterpret_cast<vluint64_t*>(oldp) ^ newval;
|
|
if (VL_UNLIKELY(diff)) fullQuad(oldp, newval, bits);
|
|
}
|
|
inline void CHG(Array)(vluint32_t* oldp, const vluint32_t* newvalp, int bits) {
|
|
for (int i = 0; i < (bits + 31) / 32; ++i) {
|
|
if (VL_UNLIKELY(oldp[i] ^ newvalp[i])) {
|
|
fullArray(oldp, newvalp, bits);
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
inline void CHG(Float)(vluint32_t* oldp, float newval) {
|
|
// cppcheck-suppress invalidPointerCast
|
|
if (VL_UNLIKELY(*reinterpret_cast<float*>(oldp) != newval)) fullFloat(oldp, newval);
|
|
}
|
|
inline void CHG(Double)(vluint32_t* oldp, double newval) {
|
|
// cppcheck-suppress invalidPointerCast
|
|
if (VL_UNLIKELY(*reinterpret_cast<double*>(oldp) != newval)) fullDouble(oldp, newval);
|
|
}
|
|
|
|
#undef CHG
|
|
};
|
|
#endif // guard
|