verilator/include/verilated_threads.h
// -*- mode: C++; c-file-style: "cc-mode" -*-
//=============================================================================
//
// Code available from: https://verilator.org
//
// Copyright 2012-2021 by Wilson Snyder. This program is free software; you
// can redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License
// Version 2.0.
// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
//
//=============================================================================
///
/// \file
/// \brief Verilated thread pool and profiling header
///
/// This file is not part of the Verilated public-facing API.
/// It is only for internal use by Verilated library multithreaded
/// routines.
///
//=============================================================================
#ifndef VERILATOR_VERILATED_THREADS_H_
#define VERILATOR_VERILATED_THREADS_H_

#include "verilatedos.h"
#include "verilated.h"  // for VerilatedMutex and clang annotations

#ifndef VL_THREADED
// Hitting this likely means verilated_threads.cpp is being compiled when
// 'verilator --threads' was not used. 'verilator --threads' sets
// VL_THREADED.
// Alternatively, it is always safe (though possibly slower) to define
// VL_THREADED for all compiles.
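// For example (illustrative only), VL_THREADED could be forced on for
// every compile via the compiler flags:
//   CXXFLAGS += -DVL_THREADED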
#error "verilated_threads.h/cpp expected VL_THREADED (from verilator --threads)"
#endif
#include <condition_variable>
#include <set>
#include <vector>
// clang-format off
#if defined(__linux)
# include <sched.h> // For sched_getcpu()
#endif
#if defined(__APPLE__)
# include <cpuid.h> // For __cpuid_count()
#endif
// clang-format on

// VlMTaskVertex and VlThreadPool will work with multiple model class types.
// Since the type is opaque to VlMTaskVertex and VlThreadPool, represent it
// as a void* here.
using VlSelfP = void*;
using VlExecFnp = void (*)(VlSelfP, bool);
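
// Illustrative sketch (the function and model names are placeholders, not
// real generated identifiers): a 'loose' MTask body matching VlExecFnp might
// look like
//   void Vtop__mtask_body_1(VlSelfP voidSelf, bool even_cycle);
// and would be handed to a worker via VlWorkerThread::addTask() below.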

// Track dependencies for a single MTask.
class VlMTaskVertex final {
    // MEMBERS
    static std::atomic<vluint64_t> s_yields;  // Statistics

    // On even cycles, m_upstreamDepsDone increases as upstream
    // dependencies complete. When it reaches m_upstreamDepCount,
    // this MTaskVertex is ready.
    //
    // On odd cycles, m_upstreamDepsDone decreases as upstream
    // dependencies complete, and when it reaches zero this MTaskVertex
    // is ready.
    //
    // An atomic is smaller than a mutex, and lock-free.
    //
    // (Why does the size of this class matter? If an mtask has many
    // downstream mtasks to notify, we hope these will pack into a
    // small number of cache lines to reduce the cost of pointer chasing
    // during done-notification. Nobody's quantified that cost though.
    // If we were really serious about shrinking this class, we could
    // use 16-bit types here...)
    std::atomic<vluint32_t> m_upstreamDepsDone;
    const vluint32_t m_upstreamDepCount;

public:
    // CONSTRUCTORS
    // 'upstreamDepCount' is the number of upstream MTaskVertexes
    // that must notify this MTaskVertex before it will become ready
    // to run.
    explicit VlMTaskVertex(vluint32_t upstreamDepCount);
    ~VlMTaskVertex() = default;

    static vluint64_t yields() { return s_yields; }
    static void yieldThread() {
        ++s_yields;  // Statistics
        std::this_thread::yield();
    }

    // Upstream mtasks must call this when they complete.
    // Returns true when the current MTaskVertex becomes ready to execute,
    // false while it's still waiting on more dependencies.
    inline bool signalUpstreamDone(bool evenCycle) {
        if (evenCycle) {
            vluint32_t upstreamDepsDone
                = 1 + m_upstreamDepsDone.fetch_add(1, std::memory_order_release);
            assert(upstreamDepsDone <= m_upstreamDepCount);
            return (upstreamDepsDone == m_upstreamDepCount);
        } else {
            vluint32_t upstreamDepsDone_prev
                = m_upstreamDepsDone.fetch_sub(1, std::memory_order_release);
            assert(upstreamDepsDone_prev > 0);
            return (upstreamDepsDone_prev == 1);
        }
    }
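
    // Illustrative usage (sketch, not part of this header): when an upstream
    // mtask completes, the executing thread notifies each downstream vertex:
    //   if (downstreamp->signalUpstreamDone(evenCycle)) {
    //       // All upstream dependencies are now done; the downstream mtask
    //       // can be executed or queued via VlWorkerThread::addTask().
    //   }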

    inline bool areUpstreamDepsDone(bool evenCycle) const {
        vluint32_t target = evenCycle ? m_upstreamDepCount : 0;
        return m_upstreamDepsDone.load(std::memory_order_acquire) == target;
    }
    inline void waitUntilUpstreamDone(bool evenCycle) const {
        unsigned ct = 0;
        while (VL_UNLIKELY(!areUpstreamDepsDone(evenCycle))) {
            VL_CPU_RELAX();
            ++ct;
            if (VL_UNLIKELY(ct > VL_LOCK_SPINS)) {
                ct = 0;
                yieldThread();
            }
        }
    }
};

// Profiling support
class VlProfileRec final {
protected:
    friend class VlThreadPool;
    enum VlProfileE { TYPE_MTASK_RUN, TYPE_BARRIER };
    VlProfileE m_type = TYPE_BARRIER;  // Record type
    vluint32_t m_mtaskId = 0;  // Mtask we're logging
    vluint32_t m_predictTime = 0;  // How long scheduler predicted would take
    vluint64_t m_startTime = 0;  // Tick at start of execution
    vluint64_t m_endTime = 0;  // Tick at end of execution
    unsigned m_cpu;  // Execution CPU number (at start anyways)

public:
    class Barrier {};
    VlProfileRec() = default;
    explicit VlProfileRec(Barrier) { m_cpu = getcpu(); }
    void startRecord(vluint64_t time, uint32_t mtask, uint32_t predict) {
        m_type = VlProfileRec::TYPE_MTASK_RUN;
        m_mtaskId = mtask;
        m_predictTime = predict;
        m_startTime = time;
        m_cpu = getcpu();
    }
    void endRecord(vluint64_t time) { m_endTime = time; }
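
    // Illustrative usage (sketch; 'poolp', the tick values, 'mtaskId' and
    // 'predictedCost' are placeholders): a profiling worker might wrap an
    // mtask roughly as
    //   VlProfileRec* const recp = poolp->profileAppend();
    //   recp->startRecord(startTick, mtaskId, predictedCost);
    //   // ... execute the mtask ...
    //   recp->endRecord(endTick);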

    static int getcpu() {  // Return current executing CPU
#if defined(__linux)
        return sched_getcpu();
#elif defined(__APPLE__)
        vluint32_t info[4];
        __cpuid_count(1, 0, info[0], info[1], info[2], info[3]);
        // info[1] is EBX, bits 24-31 are APIC ID
        if ((info[3] & (1 << 9)) == 0) {
            return -1;  // no APIC on chip
        } else {
            return (unsigned)info[1] >> 24;
        }
#elif defined(_WIN32)
        return GetCurrentProcessorNumber();
#else
        return 0;
#endif
    }
};

class VlThreadPool;

class VlWorkerThread final {
private:
    // TYPES
    struct ExecRec {
        VlExecFnp m_fnp;  // Function to execute
        VlSelfP m_selfp;  // 'self' pointer to pass to m_fnp
        bool m_evenCycle;  // Even/odd for flag alternation
        ExecRec()
            : m_fnp{nullptr}
            , m_selfp{nullptr}
            , m_evenCycle{false} {}
        ExecRec(VlExecFnp fnp, VlSelfP selfp, bool evenCycle)
            : m_fnp{fnp}
            , m_selfp{selfp}
            , m_evenCycle{evenCycle} {}
    };

    // MEMBERS
    VerilatedMutex m_mutex;
    std::condition_variable_any m_cv;
    // Only notify the condition_variable if the worker is waiting
    bool m_waiting VL_GUARDED_BY(m_mutex) = false;
    // Why a vector? We expect the pending list to be very short, typically
    // 0 or 1 or 2, so popping from the front shouldn't be
    // expensive. Revisit if we ever have longer queues...
    std::vector<ExecRec> m_ready VL_GUARDED_BY(m_mutex);
    // Store the size atomically, so we can spin wait
    std::atomic<size_t> m_ready_size;

    VlThreadPool* m_poolp;  // Our associated thread pool
    bool m_profiling;  // Is profiling enabled?
    std::atomic<bool> m_exiting;  // Worker thread should exit
    std::thread m_cthread;  // Underlying C++ thread record
    VerilatedContext* m_contextp;  // Context for spawned thread

    VL_UNCOPYABLE(VlWorkerThread);

public:
    // CONSTRUCTORS
    explicit VlWorkerThread(VlThreadPool* poolp, VerilatedContext* contextp, bool profiling);
    ~VlWorkerThread();

    // METHODS
    inline void dequeWork(ExecRec* workp) VL_MT_SAFE_EXCLUDES(m_mutex) {
        // Spin for a while, waiting for new data
        for (int i = 0; i < VL_LOCK_SPINS; ++i) {
            if (VL_LIKELY(m_ready_size.load(std::memory_order_relaxed))) {  //
                break;
            }
            VL_CPU_RELAX();
        }
        VerilatedLockGuard lk(m_mutex);
        while (m_ready.empty()) {
            m_waiting = true;
            m_cv.wait(lk);
        }
        m_waiting = false;
        // As noted above this is inefficient if our ready list is ever
        // long (but it shouldn't be)
        *workp = m_ready.front();
        m_ready.erase(m_ready.begin());
        m_ready_size.fetch_sub(1, std::memory_order_relaxed);
    }

    inline void wakeUp() { addTask(nullptr, nullptr, false); }
    inline void addTask(VlExecFnp fnp, VlSelfP selfp, bool evenCycle)
        VL_MT_SAFE_EXCLUDES(m_mutex) {
        bool notify;
        {
            const VerilatedLockGuard lk(m_mutex);
            m_ready.emplace_back(fnp, selfp, evenCycle);
            m_ready_size.fetch_add(1, std::memory_order_relaxed);
            notify = m_waiting;
        }
        if (notify) m_cv.notify_one();
    }
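
    // Illustrative usage (sketch; the function name and 'modelp' are
    // placeholders): the dispatching thread queues an mtask body as
    //   workerp->addTask(&Vtop__mtask_body_1, static_cast<VlSelfP>(modelp), evenCycle);
    // wakeUp() queues a null function pointer, which serves only to
    // unblock a worker waiting in dequeWork() (e.g. so it can re-check
    // state such as m_exiting).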

    void workerLoop();
    static void startWorker(VlWorkerThread* workerp);
};

class VlThreadPool final {
    // TYPES
    using ProfileTrace = std::vector<VlProfileRec>;

    // MEMBERS
    std::vector<VlWorkerThread*> m_workers;  // our workers
    bool m_profiling;  // is profiling enabled?
    // Support profiling -- we can append records of profiling events
    // to this vector with very low overhead, and then dump them out
    // later. This prevents the overhead of printf/malloc/IO from
    // corrupting the profiling data. It's super cheap to append
    // a VlProfileRec struct on the end of a pre-allocated vector;
    // this is the only cost we pay in real-time during a profiling cycle.
    // Internal note: Globals may multi-construct, see verilated.cpp top.
    static VL_THREAD_LOCAL ProfileTrace* t_profilep;
    std::set<ProfileTrace*> m_allProfiles VL_GUARDED_BY(m_mutex);
    VerilatedMutex m_mutex;

public:
    // CONSTRUCTORS
    // Construct a thread pool with 'nThreads' dedicated threads. The thread
    // pool will create these threads and make them available to execute tasks
    // via this->workerp(index)->addTask(...)
    VlThreadPool(VerilatedContext* contextp, int nThreads, bool profiling);
    ~VlThreadPool();
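
    // Illustrative usage (sketch; names and the thread count are placeholders,
    // not prescribed by this header): a model might create and drive the pool
    // roughly as
    //   VlThreadPool* const poolp = new VlThreadPool{contextp, nThreads, profiling};
    //   poolp->workerp(0)->addTask(&Vtop__mtask_body_1, selfp, evenCycle);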

    // METHODS
    inline int numThreads() const { return m_workers.size(); }
    inline VlWorkerThread* workerp(int index) {
        assert(index >= 0);
        assert(index < m_workers.size());
        return m_workers[index];
    }
    inline VlProfileRec* profileAppend() {
        t_profilep->emplace_back();
        return &(t_profilep->back());
    }
    void profileAppendAll(const VlProfileRec& rec) VL_MT_SAFE_EXCLUDES(m_mutex);
    void profileDump(const char* filenamep, vluint64_t ticksElapsed) VL_MT_SAFE_EXCLUDES(m_mutex);

    // In profiling mode, each executing thread must call
    // this once to setup profiling state:
    void setupProfilingClientThread() VL_MT_SAFE_EXCLUDES(m_mutex);
    void tearDownProfilingClientThread();
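
    // Illustrative usage (sketch): a thread that will execute mtasks under
    // profiling calls, once at startup and once before exiting respectively:
    //   poolp->setupProfilingClientThread();
    //   // ... execute work ...
    //   poolp->tearDownProfilingClientThread();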

private:
    VL_UNCOPYABLE(VlThreadPool);
};

#endif