Fix multithreaded yield behavior when no work.

Signed-off-by: Wilson Snyder <wsnyder@wsnyder.org>
Patrick Stewart 2019-10-07 19:27:31 -04:00 committed by Wilson Snyder
parent 8e6d68147c
commit 0ed58a4217
4 changed files with 52 additions and 113 deletions

View File

@@ -12,6 +12,8 @@ The contributors that suggested a given feature are shown in []. Thanks!
**** Increase case duplicate/incomplete to 16 bit tables, bug1545. [Yossi Nivin]
**** Fix multithreaded yield behavior when no work. [Patrick Stewart]
* Verilator 4.020 2019-10-06

View File

@@ -141,7 +141,7 @@ class VL_CAPABILITY("mutex") VerilatedMutex {
bool try_lock() VL_TRY_ACQUIRE(true) { return m_mutex.try_lock(); }
};
/// Lock guard for mutex (ala std::lock_guard), wrapped to allow -fthread_safety checks
/// Lock guard for mutex (ala std::unique_lock), wrapped to allow -fthread_safety checks
class VL_SCOPED_CAPABILITY VerilatedLockGuard {
VL_UNCOPYABLE(VerilatedLockGuard);
private:
@@ -154,6 +154,8 @@ class VL_SCOPED_CAPABILITY VerilatedLockGuard {
~VerilatedLockGuard() VL_RELEASE() {
m_mutexr.unlock();
}
void lock() VL_ACQUIRE(mutexr) { m_mutexr.lock(); }
void unlock() VL_RELEASE() { m_mutexr.unlock(); }
};
#else // !VL_THREADED
@@ -171,6 +173,8 @@ class VerilatedLockGuard {
public:
explicit VerilatedLockGuard(VerilatedMutex&) {}
~VerilatedLockGuard() {}
void lock() {}
void unlock() {}
};
#endif // VL_THREADED
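
The lock() and unlock() methods added here make VerilatedLockGuard usable as a BasicLockable, which the reworked worker queue below relies on when it hands the guard to std::condition_variable_any::wait(). A minimal standalone sketch of that requirement, using plain std::mutex and std::unique_lock rather than the Verilated wrappers (names are illustrative only, not Verilator code):

#include <condition_variable>
#include <mutex>

std::mutex g_mutex;
std::condition_variable_any g_cv;
bool g_ready = false;

void waitForReady() {
    // wait() calls unlock() on the guard while sleeping and lock()
    // before returning, so the guard type must expose both methods.
    std::unique_lock<std::mutex> lk(g_mutex);
    while (!g_ready) g_cv.wait(lk);
}

void setReady() {
    { std::unique_lock<std::mutex> lk(g_mutex); g_ready = true; }
    g_cv.notify_one();
}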

View File

@@ -24,7 +24,7 @@
#include <cstdio>
std::atomic<vluint64_t> VlNotification::s_yields;
std::atomic<vluint64_t> VlMTaskVertex::s_yields;
VL_THREAD_LOCAL VlThreadPool::ProfileTrace* VlThreadPool::t_profilep = NULL;
@@ -41,7 +41,8 @@ VlMTaskVertex::VlMTaskVertex(vluint32_t upstreamDepCount)
// VlWorkerThread
VlWorkerThread::VlWorkerThread(VlThreadPool* poolp, bool profiling)
: m_poolp(poolp)
: m_ready_size(0)
, m_poolp(poolp)
, m_profiling(profiling)
, m_exiting(false)
// Must init this last -- after setting up fields that it might read:
@@ -49,12 +50,7 @@ VlWorkerThread::VlWorkerThread(VlThreadPool* poolp, bool profiling)
VlWorkerThread::~VlWorkerThread() {
m_exiting.store(true, std::memory_order_release);
{
VerilatedLockGuard lk(m_mutex);
if (sleeping()) {
wakeUp();
}
}
wakeUp();
// The thread should exit; join it.
m_cthread.join();
}
@@ -64,38 +60,18 @@ void VlWorkerThread::workerLoop() {
m_poolp->setupProfilingClientThread();
}
VlNotification alarm;
ExecRec work;
work.m_fnp = NULL;
while (1) {
bool sleep = false;
if (VL_UNLIKELY(!work.m_fnp)) {
// Look for work
VerilatedLockGuard lk(m_mutex);
if (VL_LIKELY(!m_ready.empty())) {
dequeWork(&work);
} else {
// No work available, prepare to sleep. Pass alarm/work
// into m_sleepAlarm so wakeUp will tell this function.
//
// Must modify m_sleepAlarm in the same critical section as
// the check for ready work, otherwise we could race with
// another thread enqueueing work and never be awoken.
m_sleepAlarm.first = &alarm;
m_sleepAlarm.second = &work;
sleep = true;
}
if (VL_LIKELY(!work.m_fnp)) {
dequeWork(&work);
}
// Do this here, not above, to avoid a race with the destructor.
if (VL_UNLIKELY(m_exiting.load(std::memory_order_acquire)))
break;
if (VL_UNLIKELY(sleep)) {
alarm.waitForNotification(); // ZZZzzzzz
alarm.reset();
}
if (VL_LIKELY(work.m_fnp)) {
work.m_fnp(work.m_evenCycle, work.m_sym);
work.m_fnp = NULL;
@@ -194,7 +170,7 @@ void VlThreadPool::profileDump(const char* filenamep, vluint64_t ticksElapsed) {
fprintf(fp, "VLPROF arg +verilator+prof+threads+window+%u\n",
Verilated::profThreadsWindow());
fprintf(fp, "VLPROF stat yields %" VL_PRI64 "u\n",
VlNotification::yields());
VlMTaskVertex::yields());
vluint32_t thread_id = 0;
for (ProfileSet::iterator pit = m_allProfiles.begin();

View File

@@ -25,6 +25,7 @@
#include "verilatedos.h"
#include "verilated.h" // for VerilatedMutex and clang annotations
#include <condition_variable>
#include <set>
#include <vector>
#if defined(__linux)
@@ -39,71 +40,12 @@
// as a void* here.
typedef void* VlThrSymTab;
class VlNotification {
// MEMBERS
std::atomic<bool> m_notified; // Notification pending
static std::atomic<vluint64_t> s_yields; // Statistics
public:
// CONSTRUCTORS
VlNotification()
: m_notified(false) {
assert(atomic_is_lock_free(&m_notified));
}
~VlNotification() {}
// METHODS
static vluint64_t yields() { return s_yields; }
static void yieldThread() {
++s_yields; // Statistics
std::this_thread::yield();
}
// Block until notify() has occurred, then return.
// If notify() has already occurred, return immediately.
//
// This is logically const: the object will remain in notified state
// after WaitForNotification() returns, so you could notify more than
// one thread of the same event.
inline void waitForNotification() {
unsigned ct = 0;
while (VL_UNLIKELY(!notified())) {
VL_CPU_RELAX();
++ct;
if (VL_UNLIKELY(ct > VL_LOCK_SPINS)) {
ct = 0;
yieldThread();
}
}
}
// The 'inline' keyword here means nothing to the compiler, it's
// implicit on methods defined within the class body anyway.
//
// 'inline' is attached to this method, and others in this file,
// to remind humans that some routines in this file are called many
// times per cycle in threaded mode. Such routines should be
// inlinable; that's why they're declared in the .h and not the .cpp.
inline bool notified() {
return m_notified.load(std::memory_order_acquire);
}
// Set notified state. If state is already notified,
// it remains so.
inline void notify() {
m_notified.store(true, std::memory_order_release);
}
// Reset the state to un-notified state, which is also the
// state of a new Notification object.
inline void reset() {
m_notified.store(false, std::memory_order_relaxed);
}
};
typedef void (*VlExecFnp)(bool, VlThrSymTab);
/// Track dependencies for a single MTask.
class VlMTaskVertex {
// MEMBERS
static std::atomic<vluint64_t> s_yields; // Statistics
// On even cycles, _upstreamDepsDone increases as upstream
// dependencies complete. When it reaches _upstreamDepCount,
@@ -133,6 +75,12 @@ public:
explicit VlMTaskVertex(vluint32_t upstreamDepCount);
~VlMTaskVertex() {}
static vluint64_t yields() { return s_yields; }
static void yieldThread() {
++s_yields; // Statistics
std::this_thread::yield();
}
// Upstream mtasks must call this when they complete.
// Returns true when the current MTaskVertex becomes ready to execute,
// false while it's still waiting on more dependencies.
@@ -160,7 +108,7 @@ public:
++ct;
if (VL_UNLIKELY(ct > VL_LOCK_SPINS)) {
ct = 0;
VlNotification::yieldThread();
yieldThread();
}
}
}
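
The yields counter and the yieldThread() helper move from the deleted VlNotification class onto VlMTaskVertex, so the dependency spin-wait above keeps its bounded spin-then-yield behavior. A simplified sketch of that pattern (kLockSpins and the atomic flag are illustrative stand-ins for VL_LOCK_SPINS and the real dependency counter):

#include <atomic>
#include <thread>

constexpr unsigned kLockSpins = 1000;  // Stand-in for VL_LOCK_SPINS

void spinUntilDone(const std::atomic<bool>& done) {
    unsigned ct = 0;
    while (!done.load(std::memory_order_acquire)) {
        // A real implementation would issue VL_CPU_RELAX() here.
        if (++ct > kLockSpins) {
            ct = 0;
            std::this_thread::yield();  // Counted via s_yields in Verilator
        }
    }
}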
@@ -238,19 +186,19 @@ private:
// MEMBERS
VerilatedMutex m_mutex;
std::condition_variable_any m_cv;
// Only notify the condition_variable if the worker is waiting
bool m_waiting VL_GUARDED_BY(m_mutex);
// Why a vector? We expect the pending list to be very short, typically
// 0 or 1 or 2, so popping from the front shouldn't be
// expensive. Revisit if we ever have longer queues...
std::vector<ExecRec> m_ready VL_GUARDED_BY(m_mutex);
// Store the size atomically, so we can spin wait
std::atomic<size_t> m_ready_size;
VlThreadPool* m_poolp; // Our associated thread pool
// If values stored are non-NULL, the thread is asleep pending new
// work. If the thread is not asleep, both parts of m_sleepAlarm must
// be NULL.
std::pair<VlNotification*, ExecRec*> m_sleepAlarm VL_GUARDED_BY(m_mutex);
bool m_profiling; // Is profiling enabled?
std::atomic<bool> m_exiting; // Worker thread should exit
std::thread m_cthread; // Underlying C++ thread record
@@ -263,29 +211,38 @@ public:
~VlWorkerThread();
// METHODS
inline void dequeWork(ExecRec* workp) VL_REQUIRES(m_mutex) {
inline void dequeWork(ExecRec* workp) {
// Spin for a while, waiting for new data
for (int i = 0; i < VL_LOCK_SPINS; ++i) {
if (VL_LIKELY(m_ready_size.load(std::memory_order_relaxed))) {
break;
}
VL_CPU_RELAX();
}
VerilatedLockGuard lk(m_mutex);
while (m_ready.empty()) {
m_waiting = true;
m_cv.wait(lk);
}
m_waiting = false;
// As noted above this is inefficient if our ready list is ever
// long (but it shouldn't be)
*workp = m_ready.front();
m_ready.erase(m_ready.begin());
m_ready_size.fetch_sub(1, std::memory_order_relaxed);
}
inline void wakeUp() VL_REQUIRES(m_mutex) {
VlNotification* notifyp = m_sleepAlarm.first;
m_sleepAlarm.first = NULL; // NULL+NULL means wake
m_sleepAlarm.second = NULL;
notifyp->notify();
}
inline bool sleeping() VL_REQUIRES(m_mutex) {
return (m_sleepAlarm.first != NULL);
inline void wakeUp() {
addTask(nullptr, false, nullptr);
}
inline void addTask(VlExecFnp fnp, bool evenCycle, VlThrSymTab sym) {
VerilatedLockGuard lk(m_mutex);
m_ready.emplace_back(fnp, evenCycle, sym);
if (VL_LIKELY(sleeping())) { // Generally queue is waiting for work
// Awaken thread
dequeWork(m_sleepAlarm.second);
wakeUp();
bool notify;
{
VerilatedLockGuard lk(m_mutex);
m_ready.emplace_back(fnp, evenCycle, sym);
m_ready_size.fetch_add(1, std::memory_order_relaxed);
notify = m_waiting;
}
if (notify) m_cv.notify_one();
}
void workerLoop();
static void startWorker(VlWorkerThread* workerp);
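
Taken together, dequeWork(), wakeUp() and addTask() now form a single-consumer work queue: the worker spins briefly on the atomic m_ready_size, then blocks on the condition variable, while the producer notifies outside the lock and only when the worker is actually waiting. A self-contained sketch of that pattern (simplified, single consumer; the class and member names are illustrative, not Verilator's):

#include <atomic>
#include <condition_variable>
#include <mutex>
#include <vector>

template <typename T> class SpinThenBlockQueue {
    std::mutex m_mutex;
    std::condition_variable m_cv;
    std::vector<T> m_ready;              // Expected to stay very short
    std::atomic<size_t> m_readySize{0};  // Lets the consumer spin lock-free
    bool m_waiting = false;              // Only notify when consumer sleeps
public:
    void push(const T& item) {            // Producer side (cf. addTask)
        bool notify;
        {
            std::lock_guard<std::mutex> lk(m_mutex);
            m_ready.push_back(item);
            m_readySize.fetch_add(1, std::memory_order_relaxed);
            notify = m_waiting;
        }
        if (notify) m_cv.notify_one();    // Notify after releasing the lock
    }
    T pop() {                             // Consumer side (cf. dequeWork)
        for (int i = 0; i < 1000; ++i) {  // Spin budget, like VL_LOCK_SPINS
            if (m_readySize.load(std::memory_order_relaxed)) break;
        }
        std::unique_lock<std::mutex> lk(m_mutex);
        while (m_ready.empty()) {         // Loop guards against spurious wakeups
            m_waiting = true;
            m_cv.wait(lk);
        }
        m_waiting = false;
        T item = m_ready.front();
        m_ready.erase(m_ready.begin());   // Cheap while the queue stays short
        m_readySize.fetch_sub(1, std::memory_order_relaxed);
        return item;
    }
};

Notifying outside the critical section avoids waking the worker only to have it block again on the mutex the producer still holds.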