From 0ef3c1093172dfb08cae95112d27b6f439e77044 Mon Sep 17 00:00:00 2001 From: Wilson Snyder Date: Tue, 8 May 2018 21:43:32 -0400 Subject: [PATCH] Pull some thread include changes from thread branch. --- include/verilated.cpp | 13 +++++++--- include/verilated.h | 57 +++++++++++++++++++++++++++++++---------- include/verilated_imp.h | 31 +++++++++++++--------- include/verilatedos.h | 35 +++++++++++++++++++++++++ src/V3EmitC.cpp | 12 ++++----- 5 files changed, 113 insertions(+), 35 deletions(-) diff --git a/include/verilated.cpp b/include/verilated.cpp index 172197598..23c51b7eb 100644 --- a/include/verilated.cpp +++ b/include/verilated.cpp @@ -159,7 +159,11 @@ void VL_DBG_MSGF(const char* formatp, ...) VL_MT_SAFE { std::string out = _vl_string_vprintf(formatp, ap); va_end(ap); // printf("-imm-V{t%d,%" VL_PRI64 "d}%s", VL_THREAD_ID(), _vl_dbg_sequence_number(), out.c_str()); - VL_PRINTF_MT("-V{t%d,%" VL_PRI64 "d}%s", VL_THREAD_ID(), _vl_dbg_sequence_number(), out.c_str()); + + // Using VL_PRINTF not VL_PRINTF_MT so that we can call VL_DBG_MSGF + // from within the guts of the thread execution machinery (and it goes + // to the screen and not into the queues we're debugging) + VL_PRINTF("-V{t%d,%" VL_PRI64 "d}%s", VL_THREAD_ID(), _vl_dbg_sequence_number(), out.c_str()); } #ifdef VL_THREADED @@ -1599,7 +1603,8 @@ std::string VL_CVT_PACK_STR_NW(int lwords, WDataInP lwp) VL_MT_SAFE { Verilated::ThreadLocal::ThreadLocal() : #ifdef VL_THREADED - t_trainId(0), + t_mtaskId(0), + t_endOfEvalReqd(0), #endif t_dpiScopep(NULL), t_dpiFilename(0), t_dpiLineno(0) { } @@ -1734,8 +1739,8 @@ const VerilatedScopeNameMap* Verilated::scopeNameMap() VL_MT_SAFE { } #ifdef VL_THREADED -void Verilated::endOfThreadTrainGuts(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE { - VL_DEBUG_IF(VL_DBG_MSGF("End of thread train\n");); +void Verilated::endOfThreadMTaskGuts(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE { + VL_DEBUG_IF(VL_DBG_MSGF("End of thread mtask\n");); 
VerilatedThreadMsgQueue::flush(evalMsgQp); } diff --git a/include/verilated.h b/include/verilated.h index ef3a7fd13..e47b6ee14 100644 --- a/include/verilated.h +++ b/include/verilated.h @@ -111,6 +111,8 @@ extern vluint32_t VL_THREAD_ID() VL_MT_SAFE; #if VL_THREADED +#define VL_LOCK_SPINS 50000 /// Number of times to spin for a mutex before relaxing + /// Mutex, wrapped to allow -fthread_safety checks class VL_CAPABILITY("mutex") VerilatedMutex { private: @@ -119,9 +121,19 @@ class VL_CAPABILITY("mutex") VerilatedMutex { VerilatedMutex() {} ~VerilatedMutex() {} /// Acquire/lock mutex - void lock() VL_ACQUIRE() { m_mutex.lock(); } + void lock() VL_ACQUIRE() { + // Try to acquire the lock by spinning. If the wait is short, + // avoids a trap to the OS plus OS scheduler overhead. + if (VL_LIKELY(try_lock())) return; // Short circuit loop + for (int i = 0; i < VL_LOCK_SPINS; ++i) { + if (VL_LIKELY(try_lock())) return; + VL_CPU_RELAX(); + } + // Spinning hasn't worked, pay the cost of blocking. + m_mutex.lock(); + } /// Release/unlock mutex - void unlock() VL_RELEASE() { m_mutex.unlock(); } + void unlock() VL_RELEASE() { m_mutex.unlock(); } /// Try to acquire mutex. Returns true on success, and false on failure. 
bool try_lock() VL_TRY_ACQUIRE(true) { return m_mutex.try_lock(); } }; @@ -143,14 +155,21 @@ class VL_SCOPED_CAPABILITY VerilatedLockGuard { #else // !VL_THREADED -// Empty classes to avoid #ifdefs everywhere -class VerilatedMutex {}; +/// Empty non-threaded mutex to avoid #ifdefs in consuming code +class VerilatedMutex { +public: + void lock() {} + void unlock() {} +}; + +/// Empty non-threaded lock guard to avoid #ifdefs in consuming code class VerilatedLockGuard { VL_UNCOPYABLE(VerilatedLockGuard); public: explicit VerilatedLockGuard(VerilatedMutex&) {} ~VerilatedLockGuard() {} }; + #endif // VL_THREADED /// Remember the calling thread at construction time, and make sure later calls use same thread @@ -336,7 +355,7 @@ class Verilated { // Not covered by mutex, as per-thread static VL_THREAD_LOCAL struct ThreadLocal { #ifdef VL_THREADED - vluint32_t t_trainId; ///< Current train# executing on this thread + vluint32_t t_mtaskId; ///< Current mtask# executing on this thread vluint32_t t_endOfEvalReqd; ///< Messages may be pending, thread needs endOf-eval calls #endif const VerilatedScope* t_dpiScopep; ///< DPI context scope @@ -455,22 +474,29 @@ public: static size_t serializedSize() VL_PURE { return sizeof(s_s); } static void* serializedPtr() VL_MT_UNSAFE { return &s_s; } // Unsafe, for Serialize only #ifdef VL_THREADED - /// Set the trainId, called when a train starts - static void trainId(vluint32_t id) VL_MT_SAFE { t_s.t_trainId = id; } - static vluint32_t trainId() VL_MT_SAFE { return t_s.t_trainId; } + /// Set the mtaskId, called when an mtask starts + static void mtaskId(vluint32_t id) VL_MT_SAFE { t_s.t_mtaskId = id; } + static vluint32_t mtaskId() VL_MT_SAFE { return t_s.t_mtaskId; } static void endOfEvalReqdInc() VL_MT_SAFE { ++t_s.t_endOfEvalReqd; } static void endOfEvalReqdDec() VL_MT_SAFE { --t_s.t_endOfEvalReqd; } - /// Called at end of each thread train, before finishing eval - static void endOfThreadTrain(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE 
{ - if (VL_UNLIKELY(t_s.t_endOfEvalReqd)) { endOfThreadTrainGuts(evalMsgQp); } } + + /// Called at end of each thread mtask, before finishing eval + static void endOfThreadMTask(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE { + if (VL_UNLIKELY(t_s.t_endOfEvalReqd)) { endOfThreadMTaskGuts(evalMsgQp); } + } /// Called at end of eval loop static void endOfEval(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE { - if (VL_UNLIKELY(t_s.t_endOfEvalReqd)) { endOfEvalGuts(evalMsgQp); } } + // It doesn't work to set endOfEvalReqd on the threadpool thread + // and then check it on the eval thread since it's thread local. + // It should be ok to call into endOfEvalGuts, it returns immediately + // if there are no transactions. + endOfEvalGuts(evalMsgQp); + } #endif private: #ifdef VL_THREADED - static void endOfThreadTrainGuts(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE; + static void endOfThreadMTaskGuts(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE; static void endOfEvalGuts(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE; #endif }; @@ -527,6 +553,11 @@ extern QData VL_RAND_RESET_Q(int obits); ///< Random reset a signal extern WDataOutP VL_RAND_RESET_W(int obits, WDataOutP outwp); ///< Random reset a signal extern WDataOutP VL_ZERO_RESET_W(int obits, WDataOutP outwp); ///< Zero reset a signal (slow - else use VL_ZERO_W) +#if VL_THREADED +/// Return high-precision counter for profiling, or 0x0 if not available +inline QData VL_RDTSC_Q() { vluint64_t val; VL_RDTSC(val); return val; } +#endif + /// Math extern WDataOutP _vl_moddiv_w(int lbits, WDataOutP owp, WDataInP lwp, WDataInP rwp, bool is_modulus); diff --git a/include/verilated_imp.h b/include/verilated_imp.h index ae130f8d2..fa5eac60b 100644 --- a/include/verilated_imp.h +++ b/include/verilated_imp.h @@ -49,25 +49,25 @@ class VerilatedScope; // Threaded message passing #ifdef VL_THREADED -/// Message, enqueued on a train, and consumed on the main eval thread +/// Message, enqueued on an mtask, and consumed on the main eval thread 
class VerilatedMsg { public: // TYPES struct Cmp { bool operator() (const VerilatedMsg& a, const VerilatedMsg& b) const { - return a.trainId() < b.trainId(); } + return a.mtaskId() < b.mtaskId(); } }; private: // MEMBERS - vluint32_t m_trainId; ///< Train that did enqueue + vluint32_t m_mtaskId; ///< MTask that did enqueue std::function m_cb; ///< Lambda to execute when message received public: // CONSTRUCTORS VerilatedMsg(const std::function& cb) - : m_trainId(Verilated::trainId()), m_cb(cb) {} + : m_mtaskId(Verilated::mtaskId()), m_cb(cb) {} ~VerilatedMsg() {} // METHODS - vluint32_t trainId() const { return m_trainId; } + vluint32_t mtaskId() const { return m_mtaskId; } /// Execute the lambda function void run() const { m_cb(); } }; @@ -84,7 +84,9 @@ class VerilatedEvalMsgQueue { VerilatedThreadQueue m_queue VL_GUARDED_BY(m_mutex); ///< Message queue public: // CONSTRUCTORS - VerilatedEvalMsgQueue() : m_depth(0) { } + VerilatedEvalMsgQueue() : m_depth(0) { + assert(atomic_is_lock_free(&m_depth)); + } ~VerilatedEvalMsgQueue() { } private: VL_UNCOPYABLE(VerilatedEvalMsgQueue); @@ -92,7 +94,6 @@ public: // METHODS //// Add message to queue (called by producer) void post(const VerilatedMsg& msg) VL_EXCLUDES(m_mutex) { - Verilated::endOfEvalReqdInc(); // No mutex, threadsafe VerilatedLockGuard guard(m_mutex); m_queue.insert(msg); // Pass by value to copy the message into queue ++m_depth; @@ -114,10 +115,9 @@ public: m_queue.erase(it); m_mutex.unlock(); m_depth--; // Ok if outside critical section as only this code checks the value - Verilated::endOfEvalReqdDec(); // No mutex, threadsafe { - VL_DEBUG_IF(VL_DBG_MSGF("Executing callback from trainId=%d\n", msg.trainId());); - msg.run(); + VL_DEBUG_IF(VL_DBG_MSGF("Executing callback from mtaskId=%d\n", msg.mtaskId());); + msg.run(); } } } @@ -143,8 +143,15 @@ private: public: /// Add message to queue, called by producer static void post(const VerilatedMsg& msg) VL_MT_SAFE { - Verilated::endOfEvalReqdInc(); - 
threadton().m_queue.push(msg); // Pass by value to copy the message into queue + // Handle calls to threaded routines outside + // of any mtask -- if an initial block calls $finish, say. + if (Verilated::mtaskId() == 0) { + // No queueing, just do the action immediately + msg.run(); + } else { + Verilated::endOfEvalReqdInc(); + threadton().m_queue.push(msg); // Pass by value to copy the message into queue + } } /// Push all messages to the eval's queue static void flush(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE { diff --git a/include/verilatedos.h b/include/verilatedos.h index f8bcca553..ee10a6983 100644 --- a/include/verilatedos.h +++ b/include/verilatedos.h @@ -353,6 +353,41 @@ typedef unsigned long long vluint64_t; ///< 64-bit unsigned type # define VL_ROUND(n) round(n) #endif +//========================================================================= +// Performance counters + +#if VL_THREADED +# if defined(__i386__) || defined(__x86_64__) +/// The vluint64_t argument is loaded with a high-performance counter for profiling +/// or 0x0 if not implemented on this platform (x86 reads EDX:EAX separately; "=A" is not a register pair on x86-64) +# define VL_RDTSC(val) do { vluint32_t vl_tsc_lo, vl_tsc_hi; asm volatile("rdtsc" : "=a" (vl_tsc_lo), "=d" (vl_tsc_hi)); (val) = (vluint64_t(vl_tsc_hi) << 32) | vl_tsc_lo; } while (0) +# elif defined(__aarch64__) +# define VL_RDTSC(val) asm volatile("mrs %[rt],PMCCNTR_EL0" : [rt] "=r" (val)); +# else +// We just silently ignore unknown OSes, as only leads to missing statistics +# define VL_RDTSC(val) (val) = 0; +# endif +#endif + +//========================================================================= +// Threading related OS-specific functions + +#if VL_THREADED +# if defined(__i386__) || defined(__x86_64__) +/// For more efficient busy waiting on SMT CPUs, let the processor know +/// we're just waiting so it can let another thread run +# define VL_CPU_RELAX() asm volatile("rep; nop" ::: "memory") +# elif defined(__ia64__) +# define VL_CPU_RELAX() asm volatile("hint @pause" ::: "memory") +# elif defined(__aarch64__) +# define VL_CPU_RELAX() asm volatile("yield" ::: "memory") +# elif defined(__powerpc64__) +# 
define VL_CPU_RELAX() asm volatile("or 1, 1, 1; or 2, 2, 2;" ::: "memory") +# else +# error "Missing VL_CPU_RELAX() definition. Or, don't use VL_THREADED" +# endif +#endif + //========================================================================= #endif /*guard*/ diff --git a/src/V3EmitC.cpp b/src/V3EmitC.cpp index 490b805cd..e0586e991 100644 --- a/src/V3EmitC.cpp +++ b/src/V3EmitC.cpp @@ -1780,17 +1780,17 @@ void EmitCImp::emitWrapEval(AstNodeModule* modp) { } if (v3Global.opt.threads()) { // THREADED-TODO move to per-train - uint32_t trainId = 0; - putsDecoration("// Train "+cvtToStr(trainId)+" start\n"); - puts("VL_DEBUG_IF(VL_DBG_MSGF(\"Train starting, trainId="+cvtToStr(trainId)+"\\n\"););\n"); - puts("Verilated::trainId("+cvtToStr(trainId)+");\n"); + uint32_t mtaskId = 0; + putsDecoration("// MTask "+cvtToStr(mtaskId)+" start\n"); + puts("VL_DEBUG_IF(VL_DBG_MSGF(\"MTask starting, mtaskId="+cvtToStr(mtaskId)+"\\n\"););\n"); + puts("Verilated::mtaskId("+cvtToStr(mtaskId)+");\n"); } emitSettleLoop( (string("VL_DEBUG_IF(VL_DBG_MSGF(\"+ Clock loop\\n\"););\n") + (v3Global.opt.trace() ? "vlSymsp->__Vm_activity = true;\n" : "") + "_eval(vlSymsp);"), false); - if (v3Global.opt.threads()) { // THREADED-TODO move to end of all trains on thread - puts("Verilated::endOfThreadTrain(vlSymsp->__Vm_evalMsgQp);\n"); + if (v3Global.opt.threads()) { // THREADED-TODO move to end of all mtasks on thread + puts("Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);\n"); } if (v3Global.opt.threads()) { puts("Verilated::endOfEval(vlSymsp->__Vm_evalMsgQp);\n");