Make parallel tracing switchable at run-time

Geza Lore 2022-07-19 17:06:26 +01:00
parent efb5caad22
commit a4ed3c2086
4 changed files with 138 additions and 116 deletions

View File

@ -182,7 +182,7 @@ private:
const bool m_offload; // Whether to use the offload thread (ignored if !VL_THREADED)
#ifdef VL_TRACE_PARALLEL
#ifdef VL_THREADED
struct ParallelWorkerData {
const dumpCb_t m_cb; // The callback
void* const m_userp; // The user pointer to pass to the callback
@ -317,6 +317,14 @@ protected:
static constexpr bool offload() { return false; }
#endif
inline bool parallel() const {
#ifdef VL_TRACE_PARALLEL
return true;
#else
return false;
#endif
}
//=========================================================================
// Virtual functions to be provided by the format specific implementation

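The hunk above turns the compile-time VL_TRACE_PARALLEL choice into a run-time query: call sites ask parallel(), and only the genuinely thread-dependent code stays behind #ifdef VL_THREADED. Below is a minimal sketch of that pattern, with hypothetical names (HAVE_THREADS, Tracer) and a constructor flag standing in for wherever the real decision comes from; in this hunk parallel() still derives its value from VL_TRACE_PARALLEL, but every use now goes through the predicate.

#include <cstdio>

// Hypothetical stand-in for VL_THREADED: does this build have thread support at all?
#define HAVE_THREADS 1

class Tracer {
    const bool m_parallel;  // Decided when the tracer is constructed, not when it is compiled

public:
    explicit Tracer(bool parallel)
#if HAVE_THREADS
        : m_parallel{parallel} {}
#else
        : m_parallel{false} {}  // Without thread support, parallel tracing is never available
#endif

    bool parallel() const { return m_parallel; }

    void dump() {
#if HAVE_THREADS
        if (parallel()) {  // Run-time branch replaces a per-site #ifdef
            std::printf("parallel dump path\n");
            return;
        }
#endif
        std::printf("sequential dump path\n");
    }
};

int main() {
    Tracer{true}.dump();   // Uses the parallel path when the build supports threads
    Tracer{false}.dump();  // Always uses the sequential path
    return 0;
}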
View File

@ -26,7 +26,7 @@
#include "verilated_intrinsics.h"
#include "verilated_trace.h"
#ifdef VL_TRACE_PARALLEL
#ifdef VL_THREADED
# include "verilated_threads.h"
# include <list>
#endif
@ -462,7 +462,7 @@ void VerilatedTrace<VL_SUB_T, VL_BUF_T>::dumpvars(int level, const std::string&
}
}
#ifdef VL_TRACE_PARALLEL
#ifdef VL_THREADED
template <> //
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::parallelWorkerTask(void* datap, bool) {
ParallelWorkerData* const wdp = reinterpret_cast<ParallelWorkerData*>(datap);
@ -490,45 +490,47 @@ template <> VL_ATTR_NOINLINE void VerilatedTrace<VL_SUB_T, VL_BUF_T>::ParallelWo
template <>
void VerilatedTrace<VL_SUB_T, VL_BUF_T>::runCallbacks(const std::vector<CallbackRecord>& cbVec) {
#ifdef VL_TRACE_PARALLEL
// If tracing in parallel, dispatch to the thread pool
VlThreadPool* threadPoolp = static_cast<VlThreadPool*>(m_contextp->threadPoolp());
// List of work items for thread (std::list, as ParallelWorkerData is not movable)
std::list<ParallelWorkerData> workerData;
// We use the whole pool + the main thread
const unsigned threads = threadPoolp->numThreads() + 1;
// Main thread executes all jobs with index % threads == 0
std::vector<ParallelWorkerData*> mainThreadWorkerData;
// Enqueue all the jobs
for (unsigned i = 0; i < cbVec.size(); ++i) {
const CallbackRecord& cbr = cbVec[i];
// Always get the trace buffer on the main thread
Buffer* const bufp = getTraceBuffer();
// Create new work item
workerData.emplace_back(cbr.m_dumpCb, cbr.m_userp, bufp);
// Grab the new work item
ParallelWorkerData* const itemp = &workerData.back();
// Enqueue task to thread pool, or main thread
if (unsigned rem = i % threads) {
threadPoolp->workerp(rem - 1)->addTask(parallelWorkerTask, itemp);
} else {
mainThreadWorkerData.push_back(itemp);
#ifdef VL_THREADED
if (parallel()) {
// If tracing in parallel, dispatch to the thread pool
VlThreadPool* threadPoolp = static_cast<VlThreadPool*>(m_contextp->threadPoolp());
// List of work items for thread (std::list, as ParallelWorkerData is not movable)
std::list<ParallelWorkerData> workerData;
// We use the whole pool + the main thread
const unsigned threads = threadPoolp->numThreads() + 1;
// Main thread executes all jobs with index % threads == 0
std::vector<ParallelWorkerData*> mainThreadWorkerData;
// Enqueue all the jobs
for (unsigned i = 0; i < cbVec.size(); ++i) {
const CallbackRecord& cbr = cbVec[i];
// Always get the trace buffer on the main thread
Buffer* const bufp = getTraceBuffer();
// Create new work item
workerData.emplace_back(cbr.m_dumpCb, cbr.m_userp, bufp);
// Grab the new work item
ParallelWorkerData* const itemp = &workerData.back();
// Enqueue task to thread pool, or main thread
if (unsigned rem = i % threads) {
threadPoolp->workerp(rem - 1)->addTask(parallelWorkerTask, itemp);
} else {
mainThreadWorkerData.push_back(itemp);
}
}
// Execute main thread jobs
for (ParallelWorkerData* const itemp : mainThreadWorkerData) {
parallelWorkerTask(itemp, false);
}
// Commit all trace buffers in order
for (ParallelWorkerData& item : workerData) {
// Wait until ready
item.wait();
// Commit the buffer
commitTraceBuffer(item.m_bufp);
}
}
// Execute main thread jobs
for (ParallelWorkerData* const itemp : mainThreadWorkerData) {
parallelWorkerTask(itemp, false);
}
// Commit all trace buffers in order
for (ParallelWorkerData& item : workerData) {
// Wait until ready
item.wait();
// Commit the buffer
commitTraceBuffer(item.m_bufp);
}
// Done
return;
// Done
return;
}
#endif
// Fall back on sequential execution
for (const CallbackRecord& cbr : cbVec) {

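The new runCallbacks body above distributes one work item per callback round-robin over the pool workers plus the main thread (item i stays on the main thread when i % threads == 0), then commits the resulting buffers strictly in submission order. Below is a self-contained sketch of that scheme; plain std::thread and an atomic flag are stand-ins for the real VlThreadPool and ParallelWorkerData::wait(), which work differently.

#include <atomic>
#include <cstdio>
#include <list>
#include <string>
#include <thread>
#include <vector>

// One unit of work: "run a trace callback into a private buffer".
// Hypothetical stand-in for ParallelWorkerData; the flag mirrors its wait()/notify pair.
struct WorkItem {
    int m_id;
    std::string m_result;
    std::atomic<bool> m_done{false};
    explicit WorkItem(int id) : m_id{id} {}
};

static void runItem(WorkItem* itemp) {
    itemp->m_result = "buffer for callback " + std::to_string(itemp->m_id);
    itemp->m_done.store(true, std::memory_order_release);
}

int main() {
    constexpr unsigned poolWorkers = 3;        // Stand-in for threadPoolp->numThreads()
    const unsigned threads = poolWorkers + 1;  // Whole pool + the main thread

    std::list<WorkItem> items;                 // std::list: items must not move once enqueued
    std::vector<std::vector<WorkItem*>> perWorker(poolWorkers);
    std::vector<WorkItem*> mainThreadItems;

    for (int i = 0; i < 10; ++i) {
        items.emplace_back(i);
        WorkItem* const itemp = &items.back();
        if (const unsigned rem = i % threads) {
            perWorker[rem - 1].push_back(itemp);  // Items with i % threads != 0 go to the pool
        } else {
            mainThreadItems.push_back(itemp);     // Every 'threads'-th item stays on the main thread
        }
    }

    std::vector<std::thread> pool;             // Stand-in for the persistent VlThreadPool workers
    for (auto& bucket : perWorker)
        pool.emplace_back([&bucket] { for (WorkItem* itemp : bucket) runItem(itemp); });
    for (WorkItem* itemp : mainThreadItems) runItem(itemp);  // Main thread does its share too

    // Commit results strictly in submission order, waiting for each item to finish.
    for (WorkItem& item : items) {
        while (!item.m_done.load(std::memory_order_acquire)) std::this_thread::yield();
        std::printf("commit %s\n", item.m_result.c_str());
    }
    for (auto& t : pool) t.join();
    return 0;
}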
View File

@ -230,9 +230,11 @@ VerilatedVcd::~VerilatedVcd() {
if (m_wrBufp) VL_DO_CLEAR(delete[] m_wrBufp, m_wrBufp = nullptr);
deleteNameMap();
if (m_filep && m_fileNewed) VL_DO_CLEAR(delete m_filep, m_filep = nullptr);
#ifdef VL_TRACE_PARALLEL
assert(m_numBuffers == m_freeBuffers.size());
for (auto& pair : m_freeBuffers) VL_DO_CLEAR(delete[] pair.first, pair.first = nullptr);
#ifdef VL_THREADED
if (parallel()) {
assert(m_numBuffers == m_freeBuffers.size());
for (auto& pair : m_freeBuffers) VL_DO_CLEAR(delete[] pair.first, pair.first = nullptr);
}
#endif
}
@ -572,49 +574,55 @@ void VerilatedVcd::declDouble(uint32_t code, const char* name, bool array, int a
VerilatedVcd::Buffer* VerilatedVcd::getTraceBuffer() {
VerilatedVcd::Buffer* const bufp = new Buffer{*this};
#ifdef VL_TRACE_PARALLEL
// Note: This is called from VerilatedVcd::dump, which already holds the lock
// If no buffer available, allocate a new one
if (m_freeBuffers.empty()) {
constexpr size_t pageSize = 4096;
// 4 * m_maxSignalBytes, so we can reserve 2 * m_maxSignalBytes at the end for safety
size_t startingSize = roundUpToMultipleOf<pageSize>(4 * m_maxSignalBytes);
m_freeBuffers.emplace_back(new char[startingSize], startingSize);
++m_numBuffers;
#ifdef VL_THREADED
if (parallel()) {
// Note: This is called from VerilatedVcd::dump, which already holds the lock
// If no buffer available, allocate a new one
if (m_freeBuffers.empty()) {
constexpr size_t pageSize = 4096;
// 4 * m_maxSignalBytes, so we can reserve 2 * m_maxSignalBytes at the end for safety
size_t startingSize = roundUpToMultipleOf<pageSize>(4 * m_maxSignalBytes);
m_freeBuffers.emplace_back(new char[startingSize], startingSize);
++m_numBuffers;
}
// Grab a buffer
const auto pair = m_freeBuffers.back();
m_freeBuffers.pop_back();
// Initialize
bufp->m_writep = bufp->m_bufp = pair.first;
bufp->m_size = pair.second;
bufp->adjustGrowp();
}
// Grab a buffer
const auto pair = m_freeBuffers.back();
m_freeBuffers.pop_back();
// Initialize
bufp->m_writep = bufp->m_bufp = pair.first;
bufp->m_size = pair.second;
bufp->adjustGrowp();
#endif
// Return the buffer
return bufp;
}
void VerilatedVcd::commitTraceBuffer(VerilatedVcd::Buffer* bufp) {
#ifdef VL_TRACE_PARALLEL
// Note: This is called from VerilatedVcd::dump, which already holds the lock
// Resize output buffer. Note, we use the full size of the trace buffer, as
// this is a lot more stable than the actual occupancy of the trace buffer.
// This helps us to avoid re-allocations due to small size changes.
bufferResize(bufp->m_size);
// Compute occupancy of buffer
const size_t usedSize = bufp->m_writep - bufp->m_bufp;
// Copy to output buffer
std::memcpy(m_writep, bufp->m_bufp, usedSize);
// Adjust write pointer
m_writep += usedSize;
// Flush if necessary
bufferCheck();
// Put buffer back on free list
m_freeBuffers.emplace_back(bufp->m_bufp, bufp->m_size);
if (parallel()) {
#if VL_THREADED
// Note: This is called from VerilatedVcd::dump, which already holds the lock
// Resize output buffer. Note, we use the full size of the trace buffer, as
// this is a lot more stable than the actual occupancy of the trace buffer.
// This helps us to avoid re-allocations due to small size changes.
bufferResize(bufp->m_size);
// Compute occupancy of buffer
const size_t usedSize = bufp->m_writep - bufp->m_bufp;
// Copy to output buffer
std::memcpy(m_writep, bufp->m_bufp, usedSize);
// Adjust write pointer
m_writep += usedSize;
// Flush if necessary
bufferCheck();
// Put buffer back on free list
m_freeBuffers.emplace_back(bufp->m_bufp, bufp->m_size);
#else
// Needs adjusting for emitTimeChange
m_writep = bufp->m_writep;
VL_FATAL_MT(__FILE__, __LINE__, "", "Unreachable");
#endif
} else {
// Needs adjusting for emitTimeChange
m_writep = bufp->m_writep;
}
delete bufp;
}
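getTraceBuffer and commitTraceBuffer above implement a simple free-list: storage is handed out as (pointer, size) pairs, the used portion is copied into the main output buffer on commit, and the storage goes back on the list for reuse, with the destructor asserting that every buffer came back. Below is a condensed sketch under those assumptions; BufferPool and its method names are illustrative only, and the real code additionally rounds the first allocation up to a 4096-byte page multiple and runs under the dump lock.

#include <cassert>
#include <cstdio>
#include <utility>
#include <vector>

class BufferPool {
    std::vector<std::pair<char*, size_t>> m_free;  // (storage, capacity) pairs ready for reuse
    size_t m_numBuffers = 0;                       // Total buffers ever allocated
    const size_t m_maxSignalBytes;

public:
    explicit BufferPool(size_t maxSignalBytes) : m_maxSignalBytes{maxSignalBytes} {}

    std::pair<char*, size_t> get() {
        if (m_free.empty()) {
            // 4x the largest single emission, so ~2x can be kept as end-of-buffer slack
            const size_t size = 4 * m_maxSignalBytes;
            m_free.emplace_back(new char[size], size);
            ++m_numBuffers;
        }
        const auto pair = m_free.back();  // Grab a buffer off the free list
        m_free.pop_back();
        return pair;
    }

    // Copy the used portion into the main output and put the storage back on the free list.
    void commit(char* bufp, size_t capacity, size_t usedSize, std::vector<char>& output) {
        output.insert(output.end(), bufp, bufp + usedSize);
        m_free.emplace_back(bufp, capacity);
    }

    ~BufferPool() {
        assert(m_numBuffers == m_free.size());  // Every buffer must have been committed back
        for (auto& pair : m_free) delete[] pair.first;
    }
};

int main() {
    std::vector<char> output;
    BufferPool pool{/*maxSignalBytes=*/64};
    auto buf = pool.get();
    const size_t used = static_cast<size_t>(std::snprintf(buf.first, buf.second, "#100\n"));
    pool.commit(buf.first, buf.second, used, output);
    return 0;
}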
@ -656,35 +664,39 @@ void VerilatedVcdBuffer::finishLine(uint32_t code, char* writep) {
// suffix, which was stored in the last byte of the suffix buffer entry.
m_writep = writep + suffixp[VL_TRACE_SUFFIX_ENTRY_SIZE - 1];
#ifdef VL_TRACE_PARALLEL
// Double the size of the buffer if necessary
if (VL_UNLIKELY(m_writep >= m_growp)) {
// Compute occupied size of current buffer
const size_t usedSize = m_writep - m_bufp;
// We are always doubling the size
m_size *= 2;
// Allocate the new buffer
char* const newBufp = new char[m_size];
// Copy from current buffer to new buffer
std::memcpy(newBufp, m_bufp, usedSize);
// Delete current buffer
delete[] m_bufp;
// Make new buffer the current buffer
m_bufp = newBufp;
// Adjust write pointer
m_writep = m_bufp + usedSize;
// Adjust resize limit
adjustGrowp();
}
if (m_owner.parallel()) {
#ifdef VL_THREADED
// Double the size of the buffer if necessary
if (VL_UNLIKELY(m_writep >= m_growp)) {
// Compute occupied size of current buffer
const size_t usedSize = m_writep - m_bufp;
// We are always doubling the size
m_size *= 2;
// Allocate the new buffer
char* const newBufp = new char[m_size];
// Copy from current buffer to new buffer
std::memcpy(newBufp, m_bufp, usedSize);
// Delete current buffer
delete[] m_bufp;
// Make new buffer the current buffer
m_bufp = newBufp;
// Adjust write pointer
m_writep = m_bufp + usedSize;
// Adjust resize limit
adjustGrowp();
}
#else
// Flush the write buffer if there's not enough space left for new information
// We only call this once per vector, so we need enough slop for a very wide "b###" line
if (VL_UNLIKELY(m_writep > m_wrFlushp)) {
m_owner.m_writep = m_writep;
m_owner.bufferFlush();
m_writep = m_owner.m_writep;
}
VL_FATAL_MT(__FILE__, __LINE__, "", "Unreachable");
#endif
} else {
// Flush the write buffer if there's not enough space left for new information
// We only call this once per vector, so we need enough slop for a very wide "b###" line
if (VL_UNLIKELY(m_writep > m_wrFlushp)) {
m_owner.m_writep = m_writep;
m_owner.bufferFlush();
m_writep = m_owner.m_writep;
}
}
}
//=============================================================================

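finishLine's parallel path above grows the private buffer by doubling whenever the write pointer crosses a watermark placed 2 * m_maxSignalBytes before the end, so a maximally wide signal can always be emitted before the size is rechecked. Below is a stand-alone sketch of that policy; GrowBuffer and maybeGrow are illustrative names, not the real API, and the sketch assumes no single emission exceeds maxSignalBytes.

#include <cassert>
#include <cstring>

struct GrowBuffer {
    char* m_bufp;    // Start of storage
    char* m_writep;  // Next byte to write
    char* m_growp;   // Watermark: once m_writep reaches this, grow before the next signal
    size_t m_size;
    const size_t m_maxSignalBytes;

    explicit GrowBuffer(size_t maxSignalBytes)
        : m_size{4 * maxSignalBytes}, m_maxSignalBytes{maxSignalBytes} {
        m_writep = m_bufp = new char[m_size];
        adjustGrowp();
    }
    ~GrowBuffer() { delete[] m_bufp; }

    void adjustGrowp() {
        // Keep 2 * maxSignalBytes of slack past the watermark, so a full-sized signal
        // plus its line terminator can always be written before the size is rechecked.
        m_growp = (m_bufp + m_size) - 2 * m_maxSignalBytes;
        assert(m_growp >= m_bufp + m_maxSignalBytes);
    }

    // Called after each signal has been emitted (the analogue of finishLine).
    void maybeGrow() {
        if (m_writep < m_growp) return;          // Fast path: plenty of room left
        const size_t usedSize = m_writep - m_bufp;
        m_size *= 2;                             // Always double, to amortize the copies
        char* const newBufp = new char[m_size];
        std::memcpy(newBufp, m_bufp, usedSize);  // Preserve what has been written so far
        delete[] m_bufp;
        m_bufp = newBufp;
        m_writep = m_bufp + usedSize;
        adjustGrowp();
    }
};

int main() {
    GrowBuffer buf{/*maxSignalBytes=*/16};
    for (int i = 0; i < 100; ++i) {  // Emit 100 fake 8-byte records
        std::memcpy(buf.m_writep, "bXXXX 1\n", 8);
        buf.m_writep += 8;
        buf.maybeGrow();
    }
    return 0;
}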
View File

@ -65,7 +65,7 @@ private:
using NameMap = std::map<const std::string, const std::string>;
NameMap* m_namemapp = nullptr; // List of names for the header
#ifdef VL_TRACE_PARALLEL
#ifdef VL_THREADED
// Vector of free trace buffers as (pointer, size) pairs.
std::vector<std::pair<char*, size_t>> m_freeBuffers;
size_t m_numBuffers = 0; // Number of trace buffers allocated
@ -168,30 +168,30 @@ class VerilatedVcdBuffer VL_NOT_FINAL {
VerilatedVcd& m_owner; // Trace file owning this buffer. Required by subclasses.
#ifdef VL_TRACE_PARALLEL
char* m_writep = nullptr; // Write pointer into m_bufp
char* m_bufp = nullptr; // The beginning of the trace buffer
size_t m_size = 0; // The size of the buffer at m_bufp
char* m_growp = nullptr; // Resize limit pointer
#else
char* m_writep = m_owner.m_writep; // Write pointer into output buffer
char* const m_wrFlushp = m_owner.m_wrFlushp; // Output buffer flush trigger location
#endif
// Write pointer into output buffer (in parallel mode, this is set up in 'getTraceBuffer')
char* m_writep = m_owner.parallel() ? nullptr : m_owner.m_writep;
// Output buffer flush trigger location (only used when not parallel)
char* const m_wrFlushp = m_owner.parallel() ? nullptr : m_owner.m_wrFlushp;
// VCD line end string codes + metadata
const char* const m_suffixes = m_owner.m_suffixes.data();
// The maximum number of bytes a single signal can emit
const size_t m_maxSignalBytes = m_owner.m_maxSignalBytes;
void finishLine(uint32_t code, char* writep);
#ifdef VL_THREADED
// Additional data for parallel tracing only
char* m_bufp = nullptr; // The beginning of the trace buffer
size_t m_size = 0; // The size of the buffer at m_bufp
char* m_growp = nullptr; // Resize limit pointer
#ifdef VL_TRACE_PARALLEL
void adjustGrowp() {
m_growp = (m_bufp + m_size) - (2 * m_maxSignalBytes);
assert(m_growp >= m_bufp + m_maxSignalBytes);
}
#endif
void finishLine(uint32_t code, char* writep);
// CONSTRUCTOR
explicit VerilatedVcdBuffer(VerilatedVcd& owner)
: m_owner{owner} {}
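The header changes above make VerilatedVcdBuffer carry the members for both modes and select between them per instance: serial-mode pointers are copied from the owner at construction, while the parallel-mode write pointer starts null and is filled in later by getTraceBuffer. Below is a small sketch of that member-initialization pattern, with illustrative Owner/TraceBuffer types rather than the real classes.

#include <cstdio>

struct Owner {
    bool m_parallel;
    char m_output[256];
    char* m_writep = m_output;  // Shared write cursor, used by serial mode only
    bool parallel() const { return m_parallel; }
};

struct TraceBuffer {
    Owner& m_owner;
    // Serial mode writes straight into the owner's output buffer; parallel mode gets
    // private storage later, so the pointer intentionally starts out null.
    char* m_writep = m_owner.parallel() ? nullptr : m_owner.m_writep;

    explicit TraceBuffer(Owner& owner) : m_owner{owner} {}
};

int main() {
    Owner serial{false};
    Owner parallel{true};
    std::printf("serial buffer starts at owner cursor: %d\n",
                TraceBuffer{serial}.m_writep == serial.m_writep);
    std::printf("parallel buffer starts detached:      %d\n",
                TraceBuffer{parallel}.m_writep == nullptr);
    return 0;
}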