Merge from master

This commit is contained in:
Wilson Snyder 2018-05-08 21:43:55 -04:00
commit 489f58011b
5 changed files with 113 additions and 35 deletions

View File

@@ -159,7 +159,11 @@ void VL_DBG_MSGF(const char* formatp, ...) VL_MT_SAFE {
std::string out = _vl_string_vprintf(formatp, ap);
va_end(ap);
// printf("-imm-V{t%d,%" VL_PRI64 "d}%s", VL_THREAD_ID(), _vl_dbg_sequence_number(), out.c_str());
VL_PRINTF_MT("-V{t%d,%" VL_PRI64 "d}%s", VL_THREAD_ID(), _vl_dbg_sequence_number(), out.c_str());
// Using VL_PRINTF not VL_PRINTF_MT so that we can call VL_DBG_MSGF
// from within the guts of the thread execution machinery (and it goes
// to the screen and not into the queues we're debugging)
VL_PRINTF("-V{t%d,%" VL_PRI64 "d}%s", VL_THREAD_ID(), _vl_dbg_sequence_number(), out.c_str());
}
#ifdef VL_THREADED
@@ -1599,7 +1603,8 @@ std::string VL_CVT_PACK_STR_NW(int lwords, WDataInP lwp) VL_MT_SAFE {
Verilated::ThreadLocal::ThreadLocal()
:
#ifdef VL_THREADED
t_trainId(0),
t_mtaskId(0),
t_endOfEvalReqd(0),
#endif
t_dpiScopep(NULL), t_dpiFilename(0), t_dpiLineno(0) {
}
@@ -1734,8 +1739,8 @@ const VerilatedScopeNameMap* Verilated::scopeNameMap() VL_MT_SAFE {
}
#ifdef VL_THREADED
void Verilated::endOfThreadTrainGuts(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE {
VL_DEBUG_IF(VL_DBG_MSGF("End of thread train\n"););
void Verilated::endOfThreadMTaskGuts(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE {
VL_DEBUG_IF(VL_DBG_MSGF("End of thread mtask\n"););
VerilatedThreadMsgQueue::flush(evalMsgQp);
}

View File

@@ -111,6 +111,8 @@ extern vluint32_t VL_THREAD_ID() VL_MT_SAFE;
#if VL_THREADED
#define VL_LOCK_SPINS 50000 /// Number of times to spin for a mutex before blocking
/// Mutex, wrapped to allow -fthread_safety checks
class VL_CAPABILITY("mutex") VerilatedMutex {
private:
@@ -119,9 +121,19 @@ class VL_CAPABILITY("mutex") VerilatedMutex {
VerilatedMutex() {}
~VerilatedMutex() {}
/// Acquire/lock mutex
void lock() VL_ACQUIRE() { m_mutex.lock(); }
void lock() VL_ACQUIRE() {
// Try to acquire the lock by spinning. If the wait is short, this
// avoids a trap into the OS plus the OS scheduler overhead.
if (VL_LIKELY(try_lock())) return; // Short circuit loop
for (int i = 0; i < VL_LOCK_SPINS; ++i) {
if (VL_LIKELY(try_lock())) return;
VL_CPU_RELAX();
}
// Spinning hasn't worked, pay the cost of blocking.
m_mutex.lock();
}
/// Release/unlock mutex
void unlock() VL_RELEASE() { m_mutex.unlock(); }
void unlock() VL_RELEASE() { m_mutex.unlock(); }
/// Try to acquire mutex. Returns true on success, and false on failure.
bool try_lock() VL_TRY_ACQUIRE(true) { return m_mutex.try_lock(); }
};
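
For illustration, the same spin-then-block idea as a self-contained sketch. It assumes std::mutex and a hypothetical cpu_relax() placeholder standing in for VL_CPU_RELAX(); this is just the pattern the hunk above applies, not the Verilator implementation itself.

#include <mutex>

// Hypothetical stand-in for VL_CPU_RELAX(); on x86 this would be "rep; nop" (pause).
static inline void cpu_relax() {}

class SpinThenBlockMutex {
    enum { SPINS = 50000 };  // Mirrors VL_LOCK_SPINS
    std::mutex m_mutex;
public:
    void lock() {
        // Fast path: if the holder releases quickly, spinning avoids a
        // syscall and a trip through the OS scheduler.
        if (m_mutex.try_lock()) return;
        for (int i = 0; i < SPINS; ++i) {
            if (m_mutex.try_lock()) return;
            cpu_relax();
        }
        // Spinning did not pay off; fall back to a blocking acquire.
        m_mutex.lock();
    }
    void unlock() { m_mutex.unlock(); }
    bool try_lock() { return m_mutex.try_lock(); }
};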
@@ -143,14 +155,21 @@ class VL_SCOPED_CAPABILITY VerilatedLockGuard {
#else // !VL_THREADED
// Empty classes to avoid #ifdefs everywhere
class VerilatedMutex {};
/// Empty non-threaded mutex to avoid #ifdefs in consuming code
class VerilatedMutex {
public:
void lock() {}
void unlock() {}
};
/// Empty non-threaded lock guard to avoid #ifdefs in consuming code
class VerilatedLockGuard {
VL_UNCOPYABLE(VerilatedLockGuard);
public:
explicit VerilatedLockGuard(VerilatedMutex&) {}
~VerilatedLockGuard() {}
};
#endif // VL_THREADED
/// Remember the calling thread at construction time, and make sure later calls use same thread
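
The point of the non-threaded stubs in the hunk above is that consuming code can lock unconditionally. A hedged sketch of such a call site (ExampleModel and its members are illustrative, not part of this change):

// Compiles the same way with or without VL_THREADED: in the non-threaded
// build the mutex and guard are empty and optimize away.
class ExampleModel {
    VerilatedMutex m_mutex;  // Real mutex if VL_THREADED, no-op stub otherwise
    int m_count;
public:
    ExampleModel() : m_count(0) {}
    void increment() {
        VerilatedLockGuard guard(m_mutex);  // No #ifdef needed at the call site
        ++m_count;
    }
};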
@@ -336,7 +355,7 @@ class Verilated {
// Not covered by mutex, as per-thread
static VL_THREAD_LOCAL struct ThreadLocal {
#ifdef VL_THREADED
vluint32_t t_trainId; ///< Current train# executing on this thread
vluint32_t t_mtaskId; ///< Current mtask# executing on this thread
vluint32_t t_endOfEvalReqd; ///< Messages may be pending, thread needs endOf-eval calls
#endif
const VerilatedScope* t_dpiScopep; ///< DPI context scope
@@ -455,22 +474,29 @@ public:
static size_t serializedSize() VL_PURE { return sizeof(s_s); }
static void* serializedPtr() VL_MT_UNSAFE { return &s_s; } // Unsafe, for Serialize only
#ifdef VL_THREADED
/// Set the trainId, called when a train starts
static void trainId(vluint32_t id) VL_MT_SAFE { t_s.t_trainId = id; }
static vluint32_t trainId() VL_MT_SAFE { return t_s.t_trainId; }
/// Set the mtaskId, called when an mtask starts
static void mtaskId(vluint32_t id) VL_MT_SAFE { t_s.t_mtaskId = id; }
static vluint32_t mtaskId() VL_MT_SAFE { return t_s.t_mtaskId; }
static void endOfEvalReqdInc() VL_MT_SAFE { ++t_s.t_endOfEvalReqd; }
static void endOfEvalReqdDec() VL_MT_SAFE { --t_s.t_endOfEvalReqd; }
/// Called at end of each thread train, before finishing eval
static void endOfThreadTrain(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE {
if (VL_UNLIKELY(t_s.t_endOfEvalReqd)) { endOfThreadTrainGuts(evalMsgQp); } }
/// Called at end of each thread mtask, before finishing eval
static void endOfThreadMTask(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE {
if (VL_UNLIKELY(t_s.t_endOfEvalReqd)) { endOfThreadMTaskGuts(evalMsgQp); }
}
/// Called at end of eval loop
static void endOfEval(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE {
if (VL_UNLIKELY(t_s.t_endOfEvalReqd)) { endOfEvalGuts(evalMsgQp); } }
// It doesn't work to set endOfEvalReqd on the threadpool thread
// and then check it on the eval thread since it's thread local.
// It should be ok to call into endOfEvalGuts; it returns immediately
// if there are no transactions.
endOfEvalGuts(evalMsgQp);
}
#endif
private:
#ifdef VL_THREADED
static void endOfThreadTrainGuts(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE;
static void endOfThreadMTaskGuts(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE;
static void endOfEvalGuts(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE;
#endif
};
@@ -527,6 +553,11 @@ extern QData VL_RAND_RESET_Q(int obits); ///< Random reset a signal
extern WDataOutP VL_RAND_RESET_W(int obits, WDataOutP outwp); ///< Random reset a signal
extern WDataOutP VL_ZERO_RESET_W(int obits, WDataOutP outwp); ///< Zero reset a signal (slow - else use VL_ZERO_W)
#if VL_THREADED
/// Return high-precision counter for profiling, or 0x0 if not available
inline QData VL_RDTSC_Q() { vluint64_t val; VL_RDTSC(val); return val; }
#endif
/// Math
extern WDataOutP _vl_moddiv_w(int lbits, WDataOutP owp, WDataInP lwp, WDataInP rwp, bool is_modulus);
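
As a usage sketch for the VL_RDTSC_Q() helper added above (the work() call and profile_region() wrapper are hypothetical), a raw cycle delta can bracket a region of interest; on platforms without an implemented counter both reads return 0, so the delta is only meaningful where a counter exists.

// Sketch: cycle-count a region of interest; work() is a hypothetical workload.
extern void work();
inline void profile_region() {
    QData start = VL_RDTSC_Q();
    work();
    QData cycles = VL_RDTSC_Q() - start;  // Stays 0 on platforms without a counter
    VL_PRINTF("region took %" VL_PRI64 "u raw cycles\n", cycles);
}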

View File

@@ -49,25 +49,25 @@ class VerilatedScope;
// Threaded message passing
#ifdef VL_THREADED
/// Message, enqueued on a train, and consumed on the main eval thread
/// Message, enqueued on an mtask, and consumed on the main eval thread
class VerilatedMsg {
public:
// TYPES
struct Cmp {
bool operator() (const VerilatedMsg& a, const VerilatedMsg& b) const {
return a.trainId() < b.trainId(); }
return a.mtaskId() < b.mtaskId(); }
};
private:
// MEMBERS
vluint32_t m_trainId; ///< Train that did enqueue
vluint32_t m_mtaskId; ///< MTask that did enqueue
std::function<void()> m_cb; ///< Lambda to execute when message received
public:
// CONSTRUCTORS
VerilatedMsg(const std::function<void()>& cb)
: m_trainId(Verilated::trainId()), m_cb(cb) {}
: m_mtaskId(Verilated::mtaskId()), m_cb(cb) {}
~VerilatedMsg() {}
// METHODS
vluint32_t trainId() const { return m_trainId; }
vluint32_t mtaskId() const { return m_mtaskId; }
/// Execute the lambda function
void run() const { m_cb(); }
};
@@ -84,7 +84,9 @@ class VerilatedEvalMsgQueue {
VerilatedThreadQueue m_queue VL_GUARDED_BY(m_mutex); ///< Message queue
public:
// CONSTRUCTORS
VerilatedEvalMsgQueue() : m_depth(0) { }
VerilatedEvalMsgQueue() : m_depth(0) {
assert(atomic_is_lock_free(&m_depth));
}
~VerilatedEvalMsgQueue() { }
private:
VL_UNCOPYABLE(VerilatedEvalMsgQueue);
@@ -92,7 +94,6 @@ public:
// METHODS
//// Add message to queue (called by producer)
void post(const VerilatedMsg& msg) VL_EXCLUDES(m_mutex) {
Verilated::endOfEvalReqdInc(); // No mutex, threadsafe
VerilatedLockGuard guard(m_mutex);
m_queue.insert(msg); // Pass by value to copy the message into queue
++m_depth;
@@ -114,10 +115,9 @@ public:
m_queue.erase(it);
m_mutex.unlock();
m_depth--; // Ok if outside critical section as only this code checks the value
Verilated::endOfEvalReqdDec(); // No mutex, threadsafe
{
VL_DEBUG_IF(VL_DBG_MSGF("Executing callback from trainId=%d\n", msg.trainId()););
msg.run();
VL_DEBUG_IF(VL_DBG_MSGF("Executing callback from mtaskId=%d\n", msg.mtaskId()););
msg.run();
}
}
}
@@ -143,8 +143,15 @@ private:
public:
/// Add message to queue, called by producer
static void post(const VerilatedMsg& msg) VL_MT_SAFE {
Verilated::endOfEvalReqdInc();
threadton().m_queue.push(msg); // Pass by value to copy the message into queue
// Handle calls to threaded routines outside
// of any mtask -- if an initial block calls $finish, say.
if (Verilated::mtaskId() == 0) {
// No queueing, just do the action immediately
msg.run();
} else {
Verilated::endOfEvalReqdInc();
threadton().m_queue.push(msg); // Pass by value to copy the message into queue
}
}
/// Push all messages to the eval's queue
static void flush(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE {
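
To illustrate how the message plumbing in this file is meant to be used, here is a hedged sketch of a worker-side call that defers an action to the main eval thread (example_deferred_print() is illustrative; real call sites live elsewhere in the runtime):

// Sketch: wrap the action in a VerilatedMsg and post it to the calling
// thread's queue; endOfEval() on the main thread later runs the lambda.
// If no mtask is active (mtaskId()==0), post() above runs it immediately.
void example_deferred_print() {
    VerilatedThreadMsgQueue::post(VerilatedMsg(std::function<void()>([]() {
        VL_PRINTF("ran on the eval thread\n");
    })));
}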

View File

@@ -353,6 +353,41 @@ typedef unsigned long long vluint64_t; ///< 64-bit unsigned type
# define VL_ROUND(n) round(n)
#endif
//=========================================================================
// Performance counters
#if VL_THREADED
# if defined(__i386__) || defined(__x86_64__)
/// The vluint64_t argument is loaded with a high-performance counter for profiling
/// or 0x0 if not implemented on this platform
# define VL_RDTSC(val) asm volatile("rdtsc" : "=A" (val))
# elif defined(__aarch64__)
# define VL_RDTSC(val) asm volatile("mrs %[rt],PMCCNTR_EL0" : [rt] "=r" (val));
# else
// We just silently ignore unknown architectures, as this only leads to missing statistics
# define VL_RDTSC(val) (val) = 0;
# endif
#endif
//=========================================================================
// Threading related OS-specific functions
#if VL_THREADED
# if defined(__i386__) || defined(__x86_64__)
/// For more efficient busy waiting on SMT CPUs, let the processor know
/// we're just waiting so it can let another thread run
# define VL_CPU_RELAX() asm volatile("rep; nop" ::: "memory")
# elif defined(__ia64__)
# define VL_CPU_RELAX() asm volatile("hint @pause" ::: "memory")
# elif defined(__aarch64__)
# define VL_CPU_RELAX() asm volatile("yield" ::: "memory")
# elif defined(__powerpc64__)
# define VL_CPU_RELAX() asm volatile("or 1, 1, 1; or 2, 2, 2;" ::: "memory")
# else
# error "Missing VL_CPU_RELAX() definition. Or, don't use VL_THREADED"
# endif
#endif
//=========================================================================
#endif /*guard*/
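
As a usage sketch for VL_CPU_RELAX() (the flag-waiting helper below is illustrative and assumes a threaded build where the macro is defined):

#include <atomic>
#include <thread>

// Sketch: bounded busy-wait for a flag set by another thread. VL_CPU_RELAX()
// keeps the spin polite to a sibling hardware thread; after enough spins we
// fall back to yielding to the OS scheduler.
inline void wait_for_flag(const std::atomic<bool>& ready) {
    for (int spins = 0; !ready.load(std::memory_order_acquire); ++spins) {
        if (spins < 50000) VL_CPU_RELAX();
        else std::this_thread::yield();
    }
}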

View File

@@ -1783,17 +1783,17 @@ void EmitCImp::emitWrapEval(AstNodeModule* modp) {
}
if (v3Global.opt.threads()) { // THREADED-TODO move to per-train
uint32_t trainId = 0;
putsDecoration("// Train "+cvtToStr(trainId)+" start\n");
puts("VL_DEBUG_IF(VL_DBG_MSGF(\"Train starting, trainId="+cvtToStr(trainId)+"\\n\"););\n");
puts("Verilated::trainId("+cvtToStr(trainId)+");\n");
uint32_t mtaskId = 0;
putsDecoration("// MTask "+cvtToStr(mtaskId)+" start\n");
puts("VL_DEBUG_IF(VL_DBG_MSGF(\"MTask starting, mtaskId="+cvtToStr(mtaskId)+"\\n\"););\n");
puts("Verilated::mtaskId("+cvtToStr(mtaskId)+");\n");
}
emitSettleLoop(
(string("VL_DEBUG_IF(VL_DBG_MSGF(\"+ Clock loop\\n\"););\n")
+ (v3Global.opt.trace() ? "vlSymsp->__Vm_activity = true;\n" : "")
+ "_eval(vlSymsp);"), false);
if (v3Global.opt.threads()) { // THREADED-TODO move to end of all trains on thread
puts("Verilated::endOfThreadTrain(vlSymsp->__Vm_evalMsgQp);\n");
if (v3Global.opt.threads()) { // THREADED-TODO move to end of all mtasks on thread
puts("Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);\n");
}
if (v3Global.opt.threads()) {
puts("Verilated::endOfEval(vlSymsp->__Vm_evalMsgQp);\n");