mirror of
https://github.com/verilator/verilator.git
synced 2025-01-19 12:54:02 +00:00
341 lines
12 KiB
C++
341 lines
12 KiB
C++
// -*- mode: C++; c-file-style: "cc-mode" -*-
|
|
//=============================================================================
|
|
//
|
|
// THIS MODULE IS PUBLICLY LICENSED
|
|
//
|
|
// Copyright 2012-2019 by Wilson Snyder. This program is free software;
|
|
// you can redistribute it and/or modify it under the terms of either the GNU
|
|
// Lesser General Public License Version 3 or the Perl Artistic License Version 2.0.
|
|
//
|
|
// This is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
// for more details.
|
|
//
|
|
//=============================================================================
|
|
///
|
|
/// \file
|
|
/// \brief Thread pool and profiling for Verilated modules
|
|
///
|
|
//=============================================================================
|
|
|
|
#ifndef _VERILATED_THREADS_H_
|
|
#define _VERILATED_THREADS_H_
|
|
|
|
#include "verilatedos.h"
|
|
#include "verilated.h" // for VerilatedMutex and clang annotations
|
|
|
|
#include <sched.h> // For sched_getcpu()
|
|
#include <set>
|
|
#include <vector>
|
|
#if defined(__APPLE__)
|
|
# include <cpuid.h> // For __cpuid_count()
|
|
#endif
|
|
|
|
// VlMTaskVertex and VlThreadPool will work with multiple symbol table types.
|
|
// Since the type is opaque to VlMTaskVertex and VlThreadPool, represent it
|
|
// as a void* here.
|
|
using VlThrSymTab = void*;  // Opaque handle to a model's symbol table
|
|
|
|
class VlNotification {
|
|
// MEMBERS
|
|
std::atomic<bool> m_notified; // Notification pending
|
|
static std::atomic<vluint64_t> s_yields; // Statistics
|
|
|
|
public:
|
|
// CONSTRUCTORS
|
|
VlNotification()
|
|
: m_notified(false) {
|
|
assert(atomic_is_lock_free(&m_notified));
|
|
}
|
|
~VlNotification() {}
|
|
|
|
// METHODS
|
|
static vluint64_t yields() { return s_yields; }
|
|
static void yieldThread() {
|
|
++s_yields; // Statistics
|
|
std::this_thread::yield();
|
|
}
|
|
|
|
// Block until notify() has occurred, then return.
|
|
// If notify() has already occurred, return immediately.
|
|
//
|
|
// This is logically const: the object will remain in notified state
|
|
// after WaitForNotification() returns, so you could notify more than
|
|
// one thread of the same event.
|
|
inline void waitForNotification() {
|
|
unsigned ct = 0;
|
|
while (VL_UNLIKELY(!notified())) {
|
|
VL_CPU_RELAX();
|
|
++ct;
|
|
if (VL_UNLIKELY(ct > VL_LOCK_SPINS)) {
|
|
ct = 0;
|
|
yieldThread();
|
|
}
|
|
}
|
|
}
|
|
|
|
// The 'inline' keyword here means nothing to the compiler, it's
|
|
// implicit on methods defined within the class body anyway.
|
|
//
|
|
// 'inline' is attached the this method, and others in this file,
|
|
// to remind humans that some routines in this file are called many
|
|
// times per cycle in threaded mode. Such routines should be
|
|
// inlinable; that's why they're declared in the .h and not the .cpp.
|
|
inline bool notified() {
|
|
return m_notified.load(std::memory_order_acquire);
|
|
}
|
|
// Set notified state. If state is already notified,
|
|
// it remains so.
|
|
inline void notify() {
|
|
m_notified.store(true, std::memory_order_release);
|
|
}
|
|
// Reset the state to un-notified state, which is also the
|
|
// state of a new Notification object.
|
|
inline void reset() {
|
|
m_notified.store(false, std::memory_order_relaxed);
|
|
}
|
|
};
|
|
|
|
using VlExecFnp = void (*)(bool, VlThrSymTab);  // Pointer to a function taking (bool, symbol table)
|
|
|
|
/// Track dependencies for a single MTask.
|
|
class VlMTaskVertex {
|
|
// MEMBERS
|
|
|
|
// On even cycles, _upstreamDepsDone increases as upstream
|
|
// dependencies complete. When it reaches _upstreamDepCount,
|
|
// this MTaskVertex is ready.
|
|
//
|
|
// On odd cycles, _upstreamDepsDone decreases as upstream
|
|
// dependencies complete, and when it reaches zero this MTaskVertex
|
|
// is ready.
|
|
//
|
|
// An atomic is smaller than a mutex, and lock-free.
|
|
//
|
|
// (Why does the size of this class matter? If an mtask has many
|
|
// downstream mtasks to notify, we hope these will pack into a
|
|
// small number of cache lines to reduce the cost of pointer chasing
|
|
// during done-notification. Nobody's quantified that cost though.
|
|
// If we were really serious about shrinking this class, we could
|
|
// use 16-bit types here...)
|
|
std::atomic<vluint32_t> m_upstreamDepsDone;
|
|
const vluint32_t m_upstreamDepCount;
|
|
|
|
public:
|
|
// CONSTRUCTORS
|
|
|
|
// 'upstreamDepCount' is the number of upstream MTaskVertex's
|
|
// that must notify this MTaskVertex before it will become ready
|
|
// to run.
|
|
explicit VlMTaskVertex(vluint32_t upstreamDepCount);
|
|
~VlMTaskVertex() {}
|
|
|
|
// Upstream mtasks must call this when they complete.
|
|
// Returns true when the current MTaskVertex becomes ready to execute,
|
|
// false while it's still waiting on more dependencies.
|
|
inline bool signalUpstreamDone(bool evenCycle) {
|
|
if (evenCycle) {
|
|
vluint32_t upstreamDepsDone
|
|
= 1 + m_upstreamDepsDone.fetch_add(1, std::memory_order_release);
|
|
assert(upstreamDepsDone <= m_upstreamDepCount);
|
|
return (upstreamDepsDone == m_upstreamDepCount);
|
|
} else {
|
|
vluint32_t upstreamDepsDone_prev
|
|
= m_upstreamDepsDone.fetch_sub(1, std::memory_order_release);
|
|
assert(upstreamDepsDone_prev > 0);
|
|
return (upstreamDepsDone_prev == 1);
|
|
}
|
|
}
|
|
inline bool areUpstreamDepsDone(bool evenCycle) const {
|
|
vluint32_t target = evenCycle ? m_upstreamDepCount : 0;
|
|
return m_upstreamDepsDone.load(std::memory_order_acquire) == target;
|
|
}
|
|
inline void waitUntilUpstreamDone(bool evenCycle) const {
|
|
unsigned ct = 0;
|
|
while (VL_UNLIKELY(!areUpstreamDepsDone(evenCycle))) {
|
|
VL_CPU_RELAX();
|
|
++ct;
|
|
if (VL_UNLIKELY(ct > VL_LOCK_SPINS)) {
|
|
ct = 0;
|
|
VlNotification::yieldThread();
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
// Profiling support
|
|
class VlProfileRec {
|
|
protected:
|
|
friend class VlThreadPool;
|
|
enum VlProfileE {
|
|
TYPE_MTASK_RUN,
|
|
TYPE_BARRIER
|
|
};
|
|
VlProfileE m_type; // Record type
|
|
vluint32_t m_mtaskId; // Mtask we're logging
|
|
vluint32_t m_predictTime; // How long scheduler predicted would take
|
|
vluint64_t m_startTime; // Tick at start of execution
|
|
vluint64_t m_endTime; // Tick at end of execution
|
|
unsigned m_cpu; // Execution CPU number (at start anyways)
|
|
public:
|
|
class Barrier {};
|
|
VlProfileRec() {}
|
|
explicit VlProfileRec(Barrier) {
|
|
m_type = TYPE_BARRIER;
|
|
m_mtaskId = 0;
|
|
m_predictTime = 0;
|
|
m_startTime = 0;
|
|
m_endTime = 0;
|
|
m_cpu = getcpu();
|
|
}
|
|
void startRecord(vluint64_t time, uint32_t mtask, uint32_t predict) {
|
|
m_type = VlProfileRec::TYPE_MTASK_RUN;
|
|
m_mtaskId = mtask;
|
|
m_predictTime = predict;
|
|
m_startTime = time;
|
|
m_cpu = getcpu();
|
|
}
|
|
void endRecord(vluint64_t time) {
|
|
m_endTime = time;
|
|
}
|
|
static int getcpu() { // Return current executing CPU
|
|
#if defined(__linux)
|
|
return sched_getcpu();
|
|
#elif defined(__APPLE__)
|
|
vluint32_t info[4];
|
|
__cpuid_count(1, 0, info[0], info[1], info[2], info[3]);
|
|
/* info[1] is EBX, bits 24-31 are APIC ID */
|
|
if ((info[3] & (1 << 9)) == 0) {
|
|
return -1; /* no APIC on chip */
|
|
} else {
|
|
return (unsigned)info[1] >> 24;
|
|
}
|
|
#else
|
|
return 0;
|
|
#endif
|
|
}
|
|
|
|
};
|
|
|
|
class VlThreadPool;
|
|
|
|
class VlWorkerThread {
|
|
private:
|
|
// TYPES
|
|
struct ExecRec {
|
|
VlExecFnp m_fnp; // Function to execute
|
|
VlThrSymTab m_sym; // Symbol table to execute
|
|
bool m_evenCycle; // Even/odd for flag alternation
|
|
ExecRec() : m_fnp(NULL), m_sym(NULL), m_evenCycle(false) {}
|
|
ExecRec(VlExecFnp fnp, bool evenCycle, VlThrSymTab sym)
|
|
: m_fnp(fnp), m_sym(sym), m_evenCycle(evenCycle) {}
|
|
};
|
|
|
|
// MEMBERS
|
|
VerilatedMutex m_mutex;
|
|
|
|
// Why a vector? We expect the pending list to be very short, typically
|
|
// 0 or 1 or 2, so popping from the front shouldn't be
|
|
// expensive. Revisit if we ever have longer queues...
|
|
std::vector<ExecRec> m_ready VL_GUARDED_BY(m_mutex);
|
|
|
|
VlThreadPool* m_poolp; // Our associated thread pool
|
|
|
|
// If values stored are non-NULL, the thread is asleep pending new
|
|
// work. If the thread is not asleep, both parts of m_sleepAlarm must
|
|
// be NULL.
|
|
std::pair<VlNotification*, ExecRec*> m_sleepAlarm VL_GUARDED_BY(m_mutex);
|
|
|
|
bool m_profiling; // Is profiling enabled?
|
|
std::atomic<bool> m_exiting; // Worker thread should exit
|
|
std::thread m_cthread; // Underlying C++ thread record
|
|
|
|
VL_UNCOPYABLE(VlWorkerThread);
|
|
|
|
public:
|
|
// CONSTRUCTORS
|
|
explicit VlWorkerThread(VlThreadPool* poolp, bool profiling);
|
|
~VlWorkerThread();
|
|
|
|
// METHODS
|
|
inline void dequeWork(ExecRec* workp) VL_REQUIRES(m_mutex) {
|
|
// As noted above this is inefficient if our ready list is ever
|
|
// long (but it shouldn't be)
|
|
*workp = m_ready.front();
|
|
m_ready.erase(m_ready.begin());
|
|
}
|
|
inline void wakeUp() VL_REQUIRES(m_mutex) {
|
|
VlNotification* notifyp = m_sleepAlarm.first;
|
|
m_sleepAlarm.first = NULL; // NULL+NULL means wake
|
|
m_sleepAlarm.second = NULL;
|
|
notifyp->notify();
|
|
}
|
|
inline bool sleeping() VL_REQUIRES(m_mutex) {
|
|
return (m_sleepAlarm.first != NULL);
|
|
}
|
|
inline void addTask(VlExecFnp fnp, bool evenCycle, VlThrSymTab sym) {
|
|
VerilatedLockGuard lk(m_mutex);
|
|
m_ready.emplace_back(fnp, evenCycle, sym);
|
|
if (VL_LIKELY(sleeping())) { // Generally queue is waiting for work
|
|
// Awaken thread
|
|
dequeWork(m_sleepAlarm.second);
|
|
wakeUp();
|
|
}
|
|
}
|
|
void workerLoop();
|
|
static void startWorker(VlWorkerThread* workerp);
|
|
};
|
|
|
|
class VlThreadPool {
|
|
// TYPES
|
|
typedef std::vector<VlProfileRec> ProfileTrace;
|
|
typedef std::set<ProfileTrace*> ProfileSet;
|
|
|
|
// MEMBERS
|
|
std::vector<VlWorkerThread*> m_workers; // our workers
|
|
bool m_profiling; // is profiling enabled?
|
|
|
|
// Support profiling -- we can append records of profiling events
|
|
// to this vector with very low overhead, and then dump them out
|
|
// later. This prevents the overhead of printf/malloc/IO from
|
|
// corrupting the profiling data. It's super cheap to append
|
|
// a VlProfileRec struct on the end of a pre-allocated vector;
|
|
// this is the only cost we pay in real-time during a profiling cycle.
|
|
static VL_THREAD_LOCAL ProfileTrace* t_profilep;
|
|
ProfileSet m_allProfiles VL_GUARDED_BY(m_mutex);
|
|
VerilatedMutex m_mutex;
|
|
|
|
public:
|
|
// CONSTRUCTORS
|
|
// Construct a thread pool with 'nThreads' dedicated threads. The thread
|
|
// pool will create these threads and make them available to execute tasks
|
|
// via this->workerp(index)->addTask(...)
|
|
VlThreadPool(int nThreads, bool profiling);
|
|
~VlThreadPool();
|
|
|
|
// METHODS
|
|
inline int numThreads() const {
|
|
return m_workers.size();
|
|
}
|
|
inline VlWorkerThread* workerp(int index) {
|
|
assert(index >= 0);
|
|
assert(index < m_workers.size());
|
|
return m_workers[index];
|
|
}
|
|
inline VlProfileRec* profileAppend() {
|
|
t_profilep->emplace_back();
|
|
return &(t_profilep->back());
|
|
}
|
|
void profileAppendAll(const VlProfileRec& rec);
|
|
void profileDump(const char* filenamep, vluint64_t ticksElapsed);
|
|
// In profiling mode, each executing thread must call
|
|
// this once to setup profiling state:
|
|
void setupProfilingClientThread();
|
|
void tearDownProfilingClientThread();
|
|
private:
|
|
VL_UNCOPYABLE(VlThreadPool);
|
|
};
|
|
|
|
#endif
|