Merge from master

Wilson Snyder 2018-05-08 21:43:55 -04:00
commit 489f58011b
5 changed files with 113 additions and 35 deletions

View File

@@ -159,7 +159,11 @@ void VL_DBG_MSGF(const char* formatp, ...) VL_MT_SAFE {
     std::string out = _vl_string_vprintf(formatp, ap);
     va_end(ap);
     // printf("-imm-V{t%d,%" VL_PRI64 "d}%s", VL_THREAD_ID(), _vl_dbg_sequence_number(), out.c_str());
-    VL_PRINTF_MT("-V{t%d,%" VL_PRI64 "d}%s", VL_THREAD_ID(), _vl_dbg_sequence_number(), out.c_str());
+    // Using VL_PRINTF not VL_PRINTF_MT so that we can call VL_DBG_MSGF
+    // from within the guts of the thread execution machinery (and it goes
+    // to the screen and not into the queues we're debugging)
+    VL_PRINTF("-V{t%d,%" VL_PRI64 "d}%s", VL_THREAD_ID(), _vl_dbg_sequence_number(), out.c_str());
 }
 #ifdef VL_THREADED
@@ -1599,7 +1603,8 @@ std::string VL_CVT_PACK_STR_NW(int lwords, WDataInP lwp) VL_MT_SAFE {
 Verilated::ThreadLocal::ThreadLocal()
     :
 #ifdef VL_THREADED
-    t_trainId(0),
+    t_mtaskId(0),
+    t_endOfEvalReqd(0),
 #endif
     t_dpiScopep(NULL), t_dpiFilename(0), t_dpiLineno(0) {
 }
@@ -1734,8 +1739,8 @@ const VerilatedScopeNameMap* Verilated::scopeNameMap() VL_MT_SAFE {
 }
 #ifdef VL_THREADED
-void Verilated::endOfThreadTrainGuts(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE {
-    VL_DEBUG_IF(VL_DBG_MSGF("End of thread train\n"););
+void Verilated::endOfThreadMTaskGuts(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE {
+    VL_DEBUG_IF(VL_DBG_MSGF("End of thread mtask\n"););
     VerilatedThreadMsgQueue::flush(evalMsgQp);
 }

View File

@@ -111,6 +111,8 @@ extern vluint32_t VL_THREAD_ID() VL_MT_SAFE;
 #if VL_THREADED
+#define VL_LOCK_SPINS 50000  /// Number of times to spin for a mutex before relaxing
+
 /// Mutex, wrapped to allow -fthread_safety checks
 class VL_CAPABILITY("mutex") VerilatedMutex {
   private:
@@ -119,9 +121,19 @@ class VL_CAPABILITY("mutex") VerilatedMutex {
     VerilatedMutex() {}
     ~VerilatedMutex() {}
     /// Acquire/lock mutex
-    void lock() VL_ACQUIRE() { m_mutex.lock(); }
+    void lock() VL_ACQUIRE() {
+        // Try to acquire the lock by spinning.  If the wait is short,
+        // avoids a trap to the OS plus OS scheduler overhead.
+        if (VL_LIKELY(try_lock())) return;  // Short circuit loop
+        for (int i = 0; i < VL_LOCK_SPINS; ++i) {
+            if (VL_LIKELY(try_lock())) return;
+            VL_CPU_RELAX();
+        }
+        // Spinning hasn't worked, pay the cost of blocking.
+        m_mutex.lock();
+    }
     /// Release/unlock mutex
     void unlock() VL_RELEASE() { m_mutex.unlock(); }
     /// Try to acquire mutex.  Returns true on success, and false on failure.
     bool try_lock() VL_TRY_ACQUIRE(true) { return m_mutex.try_lock(); }
 };
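
The new lock() follows a spin-then-block pattern: poll try_lock() up to VL_LOCK_SPINS times, issuing a CPU relax hint between attempts, and only then fall back to the blocking OS lock. A minimal standalone sketch of the same idea, using plain std::mutex and a hypothetical SPINS constant rather than Verilator's actual classes:

    #include <mutex>

    // Sketch only: spin briefly on try_lock() before blocking, so short
    // critical sections avoid a syscall plus a trip through the OS scheduler.
    class SpinThenBlockMutex {
        std::mutex m_mutex;
        static const int SPINS = 50000;  // assumed spin budget, mirroring VL_LOCK_SPINS
    public:
        void lock() {
            for (int i = 0; i < SPINS; ++i) {
                if (m_mutex.try_lock()) return;  // short wait: acquired without blocking
                // a real implementation would issue a pause/yield hint here (VL_CPU_RELAX)
            }
            m_mutex.lock();  // long wait: pay the cost of blocking in the OS
        }
        void unlock() { m_mutex.unlock(); }
    };
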
@@ -143,14 +155,21 @@ class VL_SCOPED_CAPABILITY VerilatedLockGuard {
 #else  // !VL_THREADED
-// Empty classes to avoid #ifdefs everywhere
-class VerilatedMutex {};
+/// Empty non-threaded mutex to avoid #ifdefs in consuming code
+class VerilatedMutex {
+  public:
+    void lock() {}
+    void unlock() {}
+};
+
+/// Empty non-threaded lock guard to avoid #ifdefs in consuming code
 class VerilatedLockGuard {
     VL_UNCOPYABLE(VerilatedLockGuard);
   public:
     explicit VerilatedLockGuard(VerilatedMutex&) {}
     ~VerilatedLockGuard() {}
 };
 #endif  // VL_THREADED

 /// Remember the calling thread at construction time, and make sure later calls use same thread
@@ -336,7 +355,7 @@ class Verilated {
     // Not covered by mutex, as per-thread
     static VL_THREAD_LOCAL struct ThreadLocal {
 #ifdef VL_THREADED
-        vluint32_t t_trainId;  ///< Current train# executing on this thread
+        vluint32_t t_mtaskId;  ///< Current mtask# executing on this thread
         vluint32_t t_endOfEvalReqd;  ///< Messages may be pending, thread needs endOf-eval calls
 #endif
         const VerilatedScope* t_dpiScopep;  ///< DPI context scope
@@ -455,22 +474,29 @@ public:
     static size_t serializedSize() VL_PURE { return sizeof(s_s); }
     static void* serializedPtr() VL_MT_UNSAFE { return &s_s; }  // Unsafe, for Serialize only
 #ifdef VL_THREADED
-    /// Set the trainId, called when a train starts
-    static void trainId(vluint32_t id) VL_MT_SAFE { t_s.t_trainId = id; }
-    static vluint32_t trainId() VL_MT_SAFE { return t_s.t_trainId; }
+    /// Set the mtaskId, called when an mtask starts
+    static void mtaskId(vluint32_t id) VL_MT_SAFE { t_s.t_mtaskId = id; }
+    static vluint32_t mtaskId() VL_MT_SAFE { return t_s.t_mtaskId; }
     static void endOfEvalReqdInc() VL_MT_SAFE { ++t_s.t_endOfEvalReqd; }
     static void endOfEvalReqdDec() VL_MT_SAFE { --t_s.t_endOfEvalReqd; }
-    /// Called at end of each thread train, before finishing eval
-    static void endOfThreadTrain(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE {
-        if (VL_UNLIKELY(t_s.t_endOfEvalReqd)) { endOfThreadTrainGuts(evalMsgQp); } }
+
+    /// Called at end of each thread mtask, before finishing eval
+    static void endOfThreadMTask(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE {
+        if (VL_UNLIKELY(t_s.t_endOfEvalReqd)) { endOfThreadMTaskGuts(evalMsgQp); }
+    }
     /// Called at end of eval loop
     static void endOfEval(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE {
-        if (VL_UNLIKELY(t_s.t_endOfEvalReqd)) { endOfEvalGuts(evalMsgQp); } }
+        // It doesn't work to set endOfEvalReqd on the threadpool thread
+        // and then check it on the eval thread since it's thread local.
+        // It should be ok to call into endOfEvalGuts, it returns immediately
+        // if there are no transactions.
+        endOfEvalGuts(evalMsgQp);
+    }
 #endif
   private:
 #ifdef VL_THREADED
-    static void endOfThreadTrainGuts(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE;
+    static void endOfThreadMTaskGuts(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE;
     static void endOfEvalGuts(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE;
 #endif
 };
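
The comment in endOfEval() comes down to thread_local storage: a counter incremented on a worker thread lives in that thread's copy of the variable, so the eval thread's copy never changes, which is why endOfEvalGuts() is now called unconditionally. A small illustration of that behaviour (not Verilator code):

    #include <cstdio>
    #include <thread>

    thread_local unsigned t_endOfEvalReqd = 0;  // each thread gets its own copy

    int main() {
        std::thread worker([] { ++t_endOfEvalReqd; });  // bumps the worker's copy only
        worker.join();
        std::printf("%u\n", t_endOfEvalReqd);  // prints 0: the main thread's copy is untouched
        return 0;
    }
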
@@ -527,6 +553,11 @@ extern QData VL_RAND_RESET_Q(int obits);  ///< Random reset a signal
 extern WDataOutP VL_RAND_RESET_W(int obits, WDataOutP outwp);  ///< Random reset a signal
 extern WDataOutP VL_ZERO_RESET_W(int obits, WDataOutP outwp);  ///< Zero reset a signal (slow - else use VL_ZERO_W)

+#if VL_THREADED
+/// Return high-precision counter for profiling, or 0x0 if not available
+inline QData VL_RDTSC_Q() { vluint64_t val; VL_RDTSC(val); return val; }
+#endif
+
 /// Math
 extern WDataOutP _vl_moddiv_w(int lbits, WDataOutP owp, WDataInP lwp, WDataInP rwp, bool is_modulus);
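
As a rough illustration of the new helper (not code from this commit), VL_RDTSC_Q() can bracket a region when profiling a threaded model; the units are platform-specific ticks, and a result of 0 means no counter is available. Here `topp` stands for an assumed Verilated model instance:

    // Hypothetical profiling snippet; assumes verilated.h is included
    // and the model was built with VL_THREADED.
    QData start = VL_RDTSC_Q();
    topp->eval();  // "topp" is an assumed model pointer, not part of this diff
    QData ticks = VL_RDTSC_Q() - start;
    VL_PRINTF("eval took %" VL_PRI64 "u ticks\n", ticks);
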

View File

@@ -49,25 +49,25 @@ class VerilatedScope;
 // Threaded message passing
 #ifdef VL_THREADED
-/// Message, enqueued on a train, and consumed on the main eval thread
+/// Message, enqueued on an mtask, and consumed on the main eval thread
 class VerilatedMsg {
   public:
     // TYPES
     struct Cmp {
         bool operator() (const VerilatedMsg& a, const VerilatedMsg& b) const {
-            return a.trainId() < b.trainId(); }
+            return a.mtaskId() < b.mtaskId(); }
     };
   private:
     // MEMBERS
-    vluint32_t m_trainId;  ///< Train that did enqueue
+    vluint32_t m_mtaskId;  ///< MTask that did enqueue
     std::function<void()> m_cb;  ///< Lambda to execute when message received
   public:
     // CONSTRUCTORS
     VerilatedMsg(const std::function<void()>& cb)
-        : m_trainId(Verilated::trainId()), m_cb(cb) {}
+        : m_mtaskId(Verilated::mtaskId()), m_cb(cb) {}
     ~VerilatedMsg() {}
     // METHODS
-    vluint32_t trainId() const { return m_trainId; }
+    vluint32_t mtaskId() const { return m_mtaskId; }
     /// Execute the lambda function
     void run() const { m_cb(); }
 };
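
A VerilatedMsg pairs the enqueuing mtask's id with a deferred callback: producers wrap work in a lambda and the eval thread later calls run(). A hedged sketch of that usage, with the surrounding queue plumbing omitted:

    // Sketch: capture work to be replayed on the eval thread.
    // Assumes VL_THREADED and that this runs inside an mtask, so
    // Verilated::mtaskId() tags the message for ordered processing (see Cmp).
    int value = 42;  // hypothetical data captured by the callback
    VerilatedMsg msg([=]() { VL_PRINTF("deferred print: %d\n", value); });
    // ... later, on the eval thread ...
    msg.run();  // executes the lambda
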
@@ -84,7 +84,9 @@ class VerilatedEvalMsgQueue {
     VerilatedThreadQueue m_queue VL_GUARDED_BY(m_mutex);  ///< Message queue
   public:
     // CONSTRUCTORS
-    VerilatedEvalMsgQueue() : m_depth(0) { }
+    VerilatedEvalMsgQueue() : m_depth(0) {
+        assert(atomic_is_lock_free(&m_depth));
+    }
     ~VerilatedEvalMsgQueue() { }
   private:
     VL_UNCOPYABLE(VerilatedEvalMsgQueue);
@@ -92,7 +94,6 @@ public:
     // METHODS
     //// Add message to queue (called by producer)
     void post(const VerilatedMsg& msg) VL_EXCLUDES(m_mutex) {
-        Verilated::endOfEvalReqdInc();  // No mutex, threadsafe
         VerilatedLockGuard guard(m_mutex);
         m_queue.insert(msg);  // Pass by value to copy the message into queue
         ++m_depth;
@@ -114,10 +115,9 @@ public:
             m_queue.erase(it);
             m_mutex.unlock();
             m_depth--;  // Ok if outside critical section as only this code checks the value
-            Verilated::endOfEvalReqdDec();  // No mutex, threadsafe
             {
-                VL_DEBUG_IF(VL_DBG_MSGF("Executing callback from trainId=%d\n", msg.trainId()););
+                VL_DEBUG_IF(VL_DBG_MSGF("Executing callback from mtaskId=%d\n", msg.mtaskId()););
                 msg.run();
             }
         }
     }
@@ -143,8 +143,15 @@ private:
   public:
     /// Add message to queue, called by producer
     static void post(const VerilatedMsg& msg) VL_MT_SAFE {
-        Verilated::endOfEvalReqdInc();
-        threadton().m_queue.push(msg);  // Pass by value to copy the message into queue
+        // Handle calls to threaded routines outside
+        // of any mtask -- if an initial block calls $finish, say.
+        if (Verilated::mtaskId() == 0) {
+            // No queueing, just do the action immediately
+            msg.run();
+        } else {
+            Verilated::endOfEvalReqdInc();
+            threadton().m_queue.push(msg);  // Pass by value to copy the message into queue
+        }
     }
     /// Push all messages to the eval's queue
     static void flush(VerilatedEvalMsgQueue* evalMsgQp) VL_MT_SAFE {
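
The new producer path therefore splits on the mtask id: code running outside any mtask (id 0, e.g. $finish from an initial block) executes its callback inline, while code inside an mtask defers it for the eval thread. A simplified model of that control flow, with the real queue classes replaced by a std::vector:

    #include <functional>
    #include <vector>

    // Simplified stand-ins for Verilated::mtaskId() and threadton().m_queue.
    unsigned g_currentMtaskId = 0;
    std::vector<std::function<void()>> g_threadQueue;

    void post(const std::function<void()>& cb) {
        if (g_currentMtaskId == 0) {
            cb();  // outside any mtask: run the action immediately
        } else {
            g_threadQueue.push_back(cb);  // inside an mtask: defer to the eval thread's flush
        }
    }
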

View File

@@ -353,6 +353,41 @@ typedef unsigned long long vluint64_t;  ///< 64-bit unsigned type
 # define VL_ROUND(n) round(n)
 #endif

+//=========================================================================
+// Performance counters
+
+#if VL_THREADED
+# if defined(__i386__) || defined(__x86_64__)
+/// The vluint64_t argument is loaded with a high-performance counter for profiling
+/// or 0x0 if not implemented on this platform
+# define VL_RDTSC(val) asm volatile("rdtsc" : "=A" (val))
+# elif defined(__aarch64__)
+# define VL_RDTSC(val) asm volatile("mrs %[rt],PMCCNTR_EL0" : [rt] "=r" (val));
+# else
+// We just silently ignore unknown architectures, as this only leads to missing statistics
+# define VL_RDTSC(val) (val) = 0;
+# endif
+#endif
+
+//=========================================================================
+// Threading related OS-specific functions
+
+#if VL_THREADED
+# if defined(__i386__) || defined(__x86_64__)
+/// For more efficient busy waiting on SMT CPUs, let the processor know
+/// we're just waiting so it can let another thread run
+# define VL_CPU_RELAX() asm volatile("rep; nop" ::: "memory")
+# elif defined(__ia64__)
+# define VL_CPU_RELAX() asm volatile("hint @pause" ::: "memory")
+# elif defined(__aarch64__)
+# define VL_CPU_RELAX() asm volatile("yield" ::: "memory")
+# elif defined(__powerpc64__)
+# define VL_CPU_RELAX() asm volatile("or 1, 1, 1; or 2, 2, 2;" ::: "memory")
+# else
+# error "Missing VL_CPU_RELAX() definition. Or, don't use VL_THREADED"
+# endif
+#endif
+
 //=========================================================================
 #endif  /*guard*/
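
VL_CPU_RELAX() is intended for the body of any busy-wait loop, as in the VerilatedMutex spin above. For instance, a wait on an atomic flag might look roughly like this (illustrative only; assumes verilatedos.h is included with VL_THREADED set):

    #include <atomic>

    // Sketch: wait for another thread to set "ready", hinting the CPU
    // (pause/yield per the table above) so a sibling hyperthread can run.
    void wait_ready(const std::atomic<bool>& ready) {
        while (!ready.load(std::memory_order_acquire)) {
            VL_CPU_RELAX();
        }
    }
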

View File

@@ -1783,17 +1783,17 @@ void EmitCImp::emitWrapEval(AstNodeModule* modp) {
     }
     if (v3Global.opt.threads()) {  // THREADED-TODO move to per-train
-        uint32_t trainId = 0;
-        putsDecoration("// Train "+cvtToStr(trainId)+" start\n");
-        puts("VL_DEBUG_IF(VL_DBG_MSGF(\"Train starting, trainId="+cvtToStr(trainId)+"\\n\"););\n");
-        puts("Verilated::trainId("+cvtToStr(trainId)+");\n");
+        uint32_t mtaskId = 0;
+        putsDecoration("// MTask "+cvtToStr(mtaskId)+" start\n");
+        puts("VL_DEBUG_IF(VL_DBG_MSGF(\"MTask starting, mtaskId="+cvtToStr(mtaskId)+"\\n\"););\n");
+        puts("Verilated::mtaskId("+cvtToStr(mtaskId)+");\n");
     }
     emitSettleLoop(
         (string("VL_DEBUG_IF(VL_DBG_MSGF(\"+ Clock loop\\n\"););\n")
          + (v3Global.opt.trace() ? "vlSymsp->__Vm_activity = true;\n" : "")
          + "_eval(vlSymsp);"), false);
-    if (v3Global.opt.threads()) {  // THREADED-TODO move to end of all trains on thread
-        puts("Verilated::endOfThreadTrain(vlSymsp->__Vm_evalMsgQp);\n");
+    if (v3Global.opt.threads()) {  // THREADED-TODO move to end of all mtasks on thread
+        puts("Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);\n");
     }
     if (v3Global.opt.threads()) {
         puts("Verilated::endOfEval(vlSymsp->__Vm_evalMsgQp);\n");