Fix multithreaded yield behavior when no work.

Signed-off-by: Wilson Snyder <wsnyder@wsnyder.org>
Patrick Stewart 2019-10-07 19:27:31 -04:00 committed by Wilson Snyder
parent 8e6d68147c
commit 0ed58a4217
4 changed files with 52 additions and 113 deletions

View File

@@ -12,6 +12,8 @@ The contributors that suggested a given feature are shown in []. Thanks!
**** Increase case duplicate/incomplete to 16 bit tables, bug1545. [Yossi Nivin]
**** Fix multithreaded yield behavior when no work. [Patrick Stewart]
* Verilator 4.020 2019-10-06

View File

@@ -141,7 +141,7 @@ class VL_CAPABILITY("mutex") VerilatedMutex {
bool try_lock() VL_TRY_ACQUIRE(true) { return m_mutex.try_lock(); }
};
/// Lock guard for mutex (ala std::lock_guard), wrapped to allow -fthread_safety checks
/// Lock guard for mutex (ala std::unique_lock), wrapped to allow -fthread_safety checks
class VL_SCOPED_CAPABILITY VerilatedLockGuard {
VL_UNCOPYABLE(VerilatedLockGuard);
private:
@@ -154,6 +154,8 @@ class VL_SCOPED_CAPABILITY VerilatedLockGuard {
~VerilatedLockGuard() VL_RELEASE() {
m_mutexr.unlock();
}
void lock() VL_ACQUIRE(mutexr) { m_mutexr.lock(); }
void unlock() VL_RELEASE() { m_mutexr.unlock(); }
};
#else // !VL_THREADED
@@ -171,6 +173,8 @@ class VerilatedLockGuard {
public:
explicit VerilatedLockGuard(VerilatedMutex&) {}
~VerilatedLockGuard() {}
void lock() {}
void unlock() {}
};
#endif // VL_THREADED
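
The lock() and unlock() methods added here make VerilatedLockGuard usable as a BasicLockable, which the reworked worker queue below relies on when it hands the guard to std::condition_variable_any::wait(). A minimal standalone sketch of that requirement, using plain std::mutex and std::unique_lock rather than the Verilated wrappers (names are illustrative only, not Verilator code):

#include <condition_variable>
#include <mutex>

std::mutex g_mutex;
std::condition_variable_any g_cv;
bool g_ready = false;

void waitForReady() {
    // wait() calls unlock() on the guard while sleeping and lock()
    // before returning, so the guard type must expose both methods.
    std::unique_lock<std::mutex> lk(g_mutex);
    while (!g_ready) g_cv.wait(lk);
}

void setReady() {
    { std::unique_lock<std::mutex> lk(g_mutex); g_ready = true; }
    g_cv.notify_one();
}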

View File

@@ -24,7 +24,7 @@
#include <cstdio>
std::atomic<vluint64_t> VlNotification::s_yields;
std::atomic<vluint64_t> VlMTaskVertex::s_yields;
VL_THREAD_LOCAL VlThreadPool::ProfileTrace* VlThreadPool::t_profilep = NULL;
@@ -41,7 +41,8 @@ VlMTaskVertex::VlMTaskVertex(vluint32_t upstreamDepCount)
// VlWorkerThread
VlWorkerThread::VlWorkerThread(VlThreadPool* poolp, bool profiling)
: m_poolp(poolp)
: m_ready_size(0)
, m_poolp(poolp)
, m_profiling(profiling)
, m_exiting(false)
// Must init this last -- after setting up fields that it might read:
@@ -49,12 +50,7 @@ VlWorkerThread::VlWorkerThread(VlThreadPool* poolp, bool profiling)
VlWorkerThread::~VlWorkerThread() {
m_exiting.store(true, std::memory_order_release);
{
VerilatedLockGuard lk(m_mutex);
if (sleeping()) {
wakeUp();
}
}
wakeUp();
// The thread should exit; join it.
m_cthread.join();
}
@@ -64,38 +60,18 @@ void VlWorkerThread::workerLoop() {
m_poolp->setupProfilingClientThread();
}
VlNotification alarm;
ExecRec work;
work.m_fnp = NULL;
while (1) {
bool sleep = false;
if (VL_UNLIKELY(!work.m_fnp)) {
// Look for work
VerilatedLockGuard lk(m_mutex);
if (VL_LIKELY(!m_ready.empty())) {
dequeWork(&work);
} else {
// No work available, prepare to sleep. Pass alarm/work
// into m_sleepAlarm so wakeUp will tell this function.
//
// Must modify m_sleepAlarm in the same critical section as
// the check for ready work, otherwise we could race with
// another thread enqueueing work and never be awoken.
m_sleepAlarm.first = &alarm;
m_sleepAlarm.second = &work;
sleep = true;
}
if (VL_LIKELY(!work.m_fnp)) {
dequeWork(&work);
}
// Do this here, not above, to avoid a race with the destructor.
if (VL_UNLIKELY(m_exiting.load(std::memory_order_acquire)))
break;
if (VL_UNLIKELY(sleep)) {
alarm.waitForNotification(); // ZZZzzzzz
alarm.reset();
}
if (VL_LIKELY(work.m_fnp)) {
work.m_fnp(work.m_evenCycle, work.m_sym);
work.m_fnp = NULL;
@@ -194,7 +170,7 @@ void VlThreadPool::profileDump(const char* filenamep, vluint64_t ticksElapsed) {
fprintf(fp, "VLPROF arg +verilator+prof+threads+window+%u\n",
Verilated::profThreadsWindow());
fprintf(fp, "VLPROF stat yields %" VL_PRI64 "u\n",
VlNotification::yields());
VlMTaskVertex::yields());
vluint32_t thread_id = 0;
for (ProfileSet::iterator pit = m_allProfiles.begin();

View File

@@ -25,6 +25,7 @@
#include "verilatedos.h"
#include "verilated.h" // for VerilatedMutex and clang annotations
#include <condition_variable>
#include <set>
#include <vector>
#if defined(__linux)
@@ -39,71 +40,12 @@
// as a void* here.
typedef void* VlThrSymTab;
class VlNotification {
// MEMBERS
std::atomic<bool> m_notified; // Notification pending
static std::atomic<vluint64_t> s_yields; // Statistics
public:
// CONSTRUCTORS
VlNotification()
: m_notified(false) {
assert(atomic_is_lock_free(&m_notified));
}
~VlNotification() {}
// METHODS
static vluint64_t yields() { return s_yields; }
static void yieldThread() {
++s_yields; // Statistics
std::this_thread::yield();
}
// Block until notify() has occurred, then return.
// If notify() has already occurred, return immediately.
//
// This is logically const: the object will remain in notified state
// after WaitForNotification() returns, so you could notify more than
// one thread of the same event.
inline void waitForNotification() {
unsigned ct = 0;
while (VL_UNLIKELY(!notified())) {
VL_CPU_RELAX();
++ct;
if (VL_UNLIKELY(ct > VL_LOCK_SPINS)) {
ct = 0;
yieldThread();
}
}
}
// The 'inline' keyword here means nothing to the compiler, it's
// implicit on methods defined within the class body anyway.
//
// 'inline' is attached to this method, and others in this file,
// to remind humans that some routines in this file are called many
// times per cycle in threaded mode. Such routines should be
// inlinable; that's why they're declared in the .h and not the .cpp.
inline bool notified() {
return m_notified.load(std::memory_order_acquire);
}
// Set notified state. If state is already notified,
// it remains so.
inline void notify() {
m_notified.store(true, std::memory_order_release);
}
// Reset the state to un-notified state, which is also the
// state of a new Notification object.
inline void reset() {
m_notified.store(false, std::memory_order_relaxed);
}
};
typedef void (*VlExecFnp)(bool, VlThrSymTab);
/// Track dependencies for a single MTask.
class VlMTaskVertex {
// MEMBERS
static std::atomic<vluint64_t> s_yields; // Statistics
// On even cycles, _upstreamDepsDone increases as upstream
// dependencies complete. When it reaches _upstreamDepCount,
@@ -133,6 +75,12 @@ public:
explicit VlMTaskVertex(vluint32_t upstreamDepCount);
~VlMTaskVertex() {}
static vluint64_t yields() { return s_yields; }
static void yieldThread() {
++s_yields; // Statistics
std::this_thread::yield();
}
// Upstream mtasks must call this when they complete.
// Returns true when the current MTaskVertex becomes ready to execute,
// false while it's still waiting on more dependencies.
@@ -160,7 +108,7 @@ public:
++ct;
if (VL_UNLIKELY(ct > VL_LOCK_SPINS)) {
ct = 0;
VlNotification::yieldThread();
yieldThread();
}
}
}
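
The yields counter and the yieldThread() helper move from the deleted VlNotification class onto VlMTaskVertex, so the dependency spin-wait above keeps its bounded spin-then-yield behavior. A simplified sketch of that pattern (kLockSpins and the atomic flag are illustrative stand-ins for VL_LOCK_SPINS and the real dependency counter):

#include <atomic>
#include <thread>

constexpr unsigned kLockSpins = 1000;  // Stand-in for VL_LOCK_SPINS

void spinUntilDone(const std::atomic<bool>& done) {
    unsigned ct = 0;
    while (!done.load(std::memory_order_acquire)) {
        // A real implementation would issue VL_CPU_RELAX() here.
        if (++ct > kLockSpins) {
            ct = 0;
            std::this_thread::yield();  // Counted via s_yields in Verilator
        }
    }
}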
@@ -238,19 +186,19 @@ private:
// MEMBERS
VerilatedMutex m_mutex;
std::condition_variable_any m_cv;
// Only notify the condition_variable if the worker is waiting
bool m_waiting VL_GUARDED_BY(m_mutex);
// Why a vector? We expect the pending list to be very short, typically
// 0 or 1 or 2, so popping from the front shouldn't be
// expensive. Revisit if we ever have longer queues...
std::vector<ExecRec> m_ready VL_GUARDED_BY(m_mutex);
// Store the size atomically, so we can spin wait
std::atomic<size_t> m_ready_size;
VlThreadPool* m_poolp; // Our associated thread pool
// If values stored are non-NULL, the thread is asleep pending new
// work. If the thread is not asleep, both parts of m_sleepAlarm must
// be NULL.
std::pair<VlNotification*, ExecRec*> m_sleepAlarm VL_GUARDED_BY(m_mutex);
bool m_profiling; // Is profiling enabled?
std::atomic<bool> m_exiting; // Worker thread should exit
std::thread m_cthread; // Underlying C++ thread record
@@ -263,29 +211,38 @@ public:
~VlWorkerThread();
// METHODS
inline void dequeWork(ExecRec* workp) VL_REQUIRES(m_mutex) {
inline void dequeWork(ExecRec* workp) {
// Spin for a while, waiting for new data
for (int i = 0; i < VL_LOCK_SPINS; ++i) {
if (VL_LIKELY(m_ready_size.load(std::memory_order_relaxed))) {
break;
}
VL_CPU_RELAX();
}
VerilatedLockGuard lk(m_mutex);
while (m_ready.empty()) {
m_waiting = true;
m_cv.wait(lk);
}
m_waiting = false;
// As noted above this is inefficient if our ready list is ever
// long (but it shouldn't be)
*workp = m_ready.front();
m_ready.erase(m_ready.begin());
m_ready_size.fetch_sub(1, std::memory_order_relaxed);
}
inline void wakeUp() VL_REQUIRES(m_mutex) {
VlNotification* notifyp = m_sleepAlarm.first;
m_sleepAlarm.first = NULL; // NULL+NULL means wake
m_sleepAlarm.second = NULL;
notifyp->notify();
}
inline bool sleeping() VL_REQUIRES(m_mutex) {
return (m_sleepAlarm.first != NULL);
inline void wakeUp() {
addTask(nullptr, false, nullptr);
}
inline void addTask(VlExecFnp fnp, bool evenCycle, VlThrSymTab sym) {
VerilatedLockGuard lk(m_mutex);
m_ready.emplace_back(fnp, evenCycle, sym);
if (VL_LIKELY(sleeping())) { // Generally queue is waiting for work
// Awaken thread
dequeWork(m_sleepAlarm.second);
wakeUp();
bool notify;
{
VerilatedLockGuard lk(m_mutex);
m_ready.emplace_back(fnp, evenCycle, sym);
m_ready_size.fetch_add(1, std::memory_order_relaxed);
notify = m_waiting;
}
if (notify) m_cv.notify_one();
}
void workerLoop();
static void startWorker(VlWorkerThread* workerp);
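
Taken together, dequeWork(), wakeUp() and addTask() now form a single-consumer work queue: the worker spins briefly on the atomic m_ready_size, then blocks on the condition variable, while the producer notifies outside the lock and only when the worker is actually waiting. A self-contained sketch of that pattern (simplified, single consumer; the class and member names are illustrative, not Verilator's):

#include <atomic>
#include <condition_variable>
#include <mutex>
#include <vector>

template <typename T> class SpinThenBlockQueue {
    std::mutex m_mutex;
    std::condition_variable m_cv;
    std::vector<T> m_ready;              // Expected to stay very short
    std::atomic<size_t> m_readySize{0};  // Lets the consumer spin lock-free
    bool m_waiting = false;              // Only notify when consumer sleeps
public:
    void push(const T& item) {            // Producer side (cf. addTask)
        bool notify;
        {
            std::lock_guard<std::mutex> lk(m_mutex);
            m_ready.push_back(item);
            m_readySize.fetch_add(1, std::memory_order_relaxed);
            notify = m_waiting;
        }
        if (notify) m_cv.notify_one();    // Notify after releasing the lock
    }
    T pop() {                             // Consumer side (cf. dequeWork)
        for (int i = 0; i < 1000; ++i) {  // Spin budget, like VL_LOCK_SPINS
            if (m_readySize.load(std::memory_order_relaxed)) break;
        }
        std::unique_lock<std::mutex> lk(m_mutex);
        while (m_ready.empty()) {         // Loop guards against spurious wakeups
            m_waiting = true;
            m_cv.wait(lk);
        }
        m_waiting = false;
        T item = m_ready.front();
        m_ready.erase(m_ready.begin());   // Cheap while the queue stays short
        m_readySize.fetch_sub(1, std::memory_order_relaxed);
        return item;
    }
};

Notifying outside the critical section avoids waking the worker only to have it block again on the mutex the producer still holds.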