From d6923c8571184b8edb1c55fdca63e79650fa7b74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Chmiel?= Date: Fri, 6 Sep 2024 14:04:26 +0200 Subject: [PATCH] Improve performance of V3VariableOrder with parallelization (#5406) --- src/Makefile_obj.in | 2 +- src/V3AstNodeOther.h | 8 +-- src/V3EmitCBase.h | 2 +- src/V3Error.h | 2 +- src/V3ExecGraph.cpp | 2 +- src/V3ExecGraph.h | 6 +- src/V3Options.cpp | 2 +- src/V3Options.h | 12 ++-- src/V3TSP.cpp | 8 +-- src/V3TSP.h | 4 +- src/V3VariableOrder.cpp | 127 +++++++++++++++++++++++----------------- src/V3VariableOrder.h | 2 +- 12 files changed, 99 insertions(+), 78 deletions(-) diff --git a/src/Makefile_obj.in b/src/Makefile_obj.in index d89a952cd..3603d5a6e 100644 --- a/src/Makefile_obj.in +++ b/src/Makefile_obj.in @@ -210,6 +210,7 @@ RAW_OBJS_PCH_ASTMT = \ V3Options.o \ V3Stats.o \ V3StatsReport.o \ + V3VariableOrder.o \ RAW_OBJS_PCH_ASTNOMT = \ V3Active.o \ @@ -309,7 +310,6 @@ RAW_OBJS_PCH_ASTNOMT = \ V3Undriven.o \ V3Unknown.o \ V3Unroll.o \ - V3VariableOrder.o \ V3Width.o \ V3WidthCommit.o \ V3WidthSel.o \ diff --git a/src/V3AstNodeOther.h b/src/V3AstNodeOther.h index 03d08bcb5..0707ef238 100644 --- a/src/V3AstNodeOther.h +++ b/src/V3AstNodeOther.h @@ -2027,7 +2027,7 @@ public: bool isRef() const VL_MT_SAFE { return m_direction.isRef(); } bool isWritable() const VL_MT_SAFE { return m_direction.isWritable(); } bool isTristate() const { return m_tristate; } - bool isPrimaryIO() const { return m_primaryIO; } + bool isPrimaryIO() const VL_MT_SAFE { return m_primaryIO; } bool isPrimaryInish() const { return isPrimaryIO() && isNonOutput(); } bool isIfaceRef() const { return (varType() == VVarType::IFACEREF); } bool isIfaceParent() const { return m_isIfaceParent; } @@ -2051,15 +2051,15 @@ public: AstBasicDType* bdtypep = basicp(); return bdtypep && bdtypep->isBitLogic(); } - bool isUsedClock() const { return m_usedClock; } + bool isUsedClock() const VL_MT_SAFE { return m_usedClock; } bool isUsedParam() const { return m_usedParam; } bool isUsedLoopIdx() const { return m_usedLoopIdx; } bool isSc() const VL_MT_SAFE { return m_sc; } bool isScQuad() const; - bool isScBv() const; + bool isScBv() const VL_MT_STABLE; bool isScUint() const; bool isScUintBool() const; - bool isScBigUint() const; + bool isScBigUint() const VL_MT_STABLE; bool isScSensitive() const { return m_scSensitive; } bool isSigPublic() const; bool isSigModPublic() const { return m_sigModPublic; } diff --git a/src/V3EmitCBase.h b/src/V3EmitCBase.h index 01c06796c..722c628e7 100644 --- a/src/V3EmitCBase.h +++ b/src/V3EmitCBase.h @@ -67,7 +67,7 @@ public: } // Return C++ class name for a module/class object static string prefixNameProtect(const AstNode* nodep) VL_MT_STABLE; - static bool isAnonOk(const AstVar* varp) { + static bool isAnonOk(const AstVar* varp) VL_MT_STABLE { AstNodeDType* const dtp = varp->dtypep()->skipRefp(); return v3Global.opt.compLimitMembers() != 0 // Enabled && !varp->isStatic() // Not a static variable diff --git a/src/V3Error.h b/src/V3Error.h index a5dcf0b03..e2b669be3 100644 --- a/src/V3Error.h +++ b/src/V3Error.h @@ -657,7 +657,7 @@ void v3errorEndFatal(std::ostringstream& sstr) // Takes an optional "name" (as __VA_ARGS__) #define VL_DEFINE_DUMP(func, tag) \ - VL_ATTR_UNUSED static int dump##func() { \ + VL_ATTR_UNUSED static int dump##func() VL_MT_SAFE { \ static int level = -1; \ if (VL_UNLIKELY(level < 0)) { \ const unsigned dumpTag = v3Global.opt.dumpLevel(tag); \ diff --git a/src/V3ExecGraph.cpp b/src/V3ExecGraph.cpp index f42a4e87d..63649f9c8 100644 --- a/src/V3ExecGraph.cpp +++ b/src/V3ExecGraph.cpp @@ -48,7 +48,7 @@ void ExecMTask::dump(std::ostream& str) const { if (priority() || cost()) str << " [pr=" << priority() << " c=" << cvtToStr(cost()) << "]"; } -uint32_t ExecMTask::s_nextId = 0; +std::atomic ExecMTask::s_nextId{0}; namespace V3ExecGraph { diff --git a/src/V3ExecGraph.h b/src/V3ExecGraph.h index 48600a5c8..9b672a780 100644 --- a/src/V3ExecGraph.h +++ b/src/V3ExecGraph.h @@ -22,6 +22,8 @@ #include "V3Graph.h" +#include + class AstNetlist; class AstMTaskBody; @@ -33,7 +35,7 @@ class ExecMTask final : public V3GraphVertex { private: AstMTaskBody* const m_bodyp; // Task body const uint32_t m_id; // Unique ID of this ExecMTask. - static uint32_t s_nextId; // Next ID to use + static std::atomic s_nextId; // Next ID to use const std::string m_hashName; // Hashed name based on body for profile-driven optimization // Predicted critical path from the start of this mtask to the ends of the graph that are // reachable from this mtask. In abstract time units. @@ -57,7 +59,7 @@ public: string hashName() const { return m_hashName; } void dump(std::ostream& str) const; - static uint32_t numUsedIds() { return s_nextId; } + static uint32_t numUsedIds() VL_MT_SAFE { return s_nextId; } }; inline std::ostream& operator<<(std::ostream& os, const ExecMTask& rhs) { diff --git a/src/V3Options.cpp b/src/V3Options.cpp index 4326a4171..0c033487b 100644 --- a/src/V3Options.cpp +++ b/src/V3Options.cpp @@ -2032,7 +2032,7 @@ unsigned V3Options::dumpLevel(const string& tag) const VL_MT_SAFE { return iter != m_dumpLevel.end() ? iter->second : 0; } -unsigned V3Options::dumpSrcLevel(const string& srcfile_path) const { +unsigned V3Options::dumpSrcLevel(const string& srcfile_path) const VL_MT_SAFE { // For simplicity, calling functions can just use __FILE__ for srcfile. // That means we need to strip the filenames: ../Foo.cpp -> Foo return dumpLevel(V3Os::filenameNonDirExt(srcfile_path)); diff --git a/src/V3Options.h b/src/V3Options.h index 5c95e4d51..c15440de4 100644 --- a/src/V3Options.h +++ b/src/V3Options.h @@ -432,7 +432,7 @@ public: unsigned debugLevel(const string& tag) const VL_MT_SAFE; unsigned debugSrcLevel(const string& srcfile_path) const VL_MT_SAFE; unsigned dumpLevel(const string& tag) const VL_MT_SAFE; - unsigned dumpSrcLevel(const string& srcfile_path) const; + unsigned dumpSrcLevel(const string& srcfile_path) const VL_MT_SAFE; // METHODS void addCppFile(const string& filename); @@ -520,7 +520,7 @@ public: bool pinsInoutEnables() const { return m_pinsInoutEnables; } bool pinsScUint() const { return m_pinsScUint; } bool pinsScUintBool() const { return m_pinsScUintBool; } - bool pinsScBigUint() const { return m_pinsScBigUint; } + bool pinsScBigUint() const VL_MT_SAFE { return m_pinsScBigUint; } bool pinsUint8() const { return m_pinsUint8; } bool ppComments() const { return m_ppComments; } bool profC() const { return m_profC; } @@ -563,14 +563,14 @@ public: int outputSplit() const { return m_outputSplit; } int outputSplitCFuncs() const { return m_outputSplitCFuncs; } int outputSplitCTrace() const { return m_outputSplitCTrace; } - int pinsBv() const { return m_pinsBv; } + int pinsBv() const VL_MT_SAFE { return m_pinsBv; } int publicDepth() const { return m_publicDepth; } int reloopLimit() const { return m_reloopLimit; } VOptionBool skipIdentical() const { return m_skipIdentical; } bool stopFail() const { return m_stopFail; } int threads() const VL_MT_SAFE { return m_threads; } int threadsMaxMTasks() const { return m_threadsMaxMTasks; } - bool mtasks() const { return (m_threads > 1); } + bool mtasks() const VL_MT_SAFE { return (m_threads > 1); } VTimescale timeDefaultPrec() const { return m_timeDefaultPrec; } VTimescale timeDefaultUnit() const { return m_timeDefaultUnit; } VTimescale timeOverridePrec() const { return m_timeOverridePrec; } @@ -596,7 +596,7 @@ public: int verilateJobs() const { return m_verilateJobs; } int compLimitBlocks() const { return m_compLimitBlocks; } - int compLimitMembers() const { return m_compLimitMembers; } + int compLimitMembers() const VL_MT_SAFE { return m_compLimitMembers; } int compLimitParens() const { return m_compLimitParens; } string exeName() const { return m_exeName != "" ? m_exeName : prefix(); } @@ -693,7 +693,7 @@ public: } bool hierarchical() const { return m_hierarchical; } - int hierChild() const { return m_hierChild; } + int hierChild() const VL_MT_SAFE { return m_hierChild; } bool hierTop() const VL_MT_SAFE { return !m_hierChild && !m_hierBlocks.empty(); } const V3HierBlockOptSet& hierBlocks() const { return m_hierBlocks; } // Directory to save .tree, .dot, .dat, .vpp for hierarchical block top diff --git a/src/V3TSP.cpp b/src/V3TSP.cpp index f47ae2e28..e027db84d 100644 --- a/src/V3TSP.cpp +++ b/src/V3TSP.cpp @@ -433,7 +433,7 @@ private: //###################################################################### // Main algorithm -void V3TSP::tspSort(const V3TSP::StateVec& states, V3TSP::StateVec* resultp) { +void V3TSP::tspSort(const V3TSP::StateVec& states, V3TSP::StateVec* resultp) VL_MT_SAFE { UASSERT(resultp->empty(), "Output graph must start empty"); // Make this TSP implementation work for graphs of size 0 or 1 @@ -536,14 +536,14 @@ public: , m_ypos{ypos} , m_serial{++s_serialNext} {} ~TspTestState() override = default; - int cost(const TspStateBase* otherp) const override { + int cost(const TspStateBase* otherp) const override VL_MT_SAFE { return cost(dynamic_cast(otherp)); } - static unsigned diff(unsigned a, unsigned b) { + static unsigned diff(unsigned a, unsigned b) VL_PURE { if (a > b) return a - b; return b - a; } - virtual int cost(const TspTestState* otherp) const { + int cost(const TspTestState* otherp) const VL_PURE { // For test purposes, each TspTestState is merely a point // on the Cartesian plane; cost is the linear distance // between two points. diff --git a/src/V3TSP.h b/src/V3TSP.h index f0c775ddf..37bfb8891 100644 --- a/src/V3TSP.h +++ b/src/V3TSP.h @@ -34,7 +34,7 @@ public: // This is the cost function that the TSP sort will minimize. // All costs in V3TSP are int, chosen to match the type of // V3GraphEdge::weight() which will reflect each edge's cost. - virtual int cost(const TspStateBase* otherp) const = 0; + virtual int cost(const TspStateBase* otherp) const VL_MT_SAFE = 0; // This operator< must place a meaningless, arbitrary, but // stable order on all TspStateBase's. It's used only to @@ -49,7 +49,7 @@ using StateVec = std::vector; // Given an unsorted set of TspState's, sort them to minimize // the transition cost for walking the sorted list. -void tspSort(const StateVec& states, StateVec* resultp) VL_MT_DISABLED; +void tspSort(const StateVec& states, StateVec* resultp) VL_MT_SAFE; void selfTest() VL_MT_DISABLED; } // namespace V3TSP diff --git a/src/V3VariableOrder.cpp b/src/V3VariableOrder.cpp index 292ce3dc2..0aad4a3d6 100644 --- a/src/V3VariableOrder.cpp +++ b/src/V3VariableOrder.cpp @@ -20,7 +20,7 @@ // //************************************************************************* -#include "V3PchAstNoMT.h" // VL_MT_DISABLED_CODE_UNIT +#include "V3PchAstMT.h" #include "V3VariableOrder.h" @@ -28,6 +28,7 @@ #include "V3EmitCBase.h" #include "V3ExecGraph.h" #include "V3TSP.h" +#include "V3ThreadPool.h" #include @@ -113,10 +114,10 @@ public: } bool operator<(const VarTspSorter& other) const { return m_serial < other.m_serial; } const MTaskIdVec& mTaskIds() const { return m_mTaskIds; } - int cost(const TspStateBase* otherp) const override { + int cost(const TspStateBase* otherp) const override VL_MT_SAFE { return cost(static_cast(otherp)); } - int cost(const VarTspSorter* otherp) const { + int cost(const VarTspSorter* otherp) const VL_MT_SAFE { // Compute the number of MTasks not shared (Hamming distance) int cost = 0; const size_t size = ExecMTask::numUsedIds(); @@ -127,22 +128,20 @@ public: uint32_t VarTspSorter::s_serialNext = 0; +struct VarAttributes final { + uint8_t stratum; // Roughly equivalent to alignment requirement, to avoid padding + bool anonOk; // Can be emitted as part of anonymous structure +}; class VariableOrder final { - // NODE STATE - // AstVar::user1() -> attributes, via m_attributes - const VNUser1InUse m_user1InUse; // AstVar - - struct VarAttributes final { - uint32_t stratum; // Roughly equivalent to alignment requirement, to avoid padding - bool anonOk; // Can be emitted as part of anonymous structure - }; - - AstUser1Allocator m_attributes; // Attributes used for sorting + std::unordered_map m_attributes; const MTaskAffinityMap& m_mTaskAffinity; + std::vector& m_varps; - VariableOrder(AstNodeModule* modp, const MTaskAffinityMap& mTaskAffinity) - : m_mTaskAffinity{mTaskAffinity} { + VariableOrder(AstNodeModule* modp, const MTaskAffinityMap& mTaskAffinity, + std::vector& varps) + : m_mTaskAffinity{mTaskAffinity} + , m_varps{varps} { orderModuleVars(modp); } ~VariableOrder() = default; @@ -157,8 +156,11 @@ class VariableOrder final { if (ap->isStatic() != bp->isStatic()) { // Non-statics before statics return bp->isStatic(); } - const auto& attrA = m_attributes(ap); - const auto& attrB = m_attributes(bp); + UASSERT(m_attributes.find(ap) != m_attributes.end() + && m_attributes.find(bp) != m_attributes.end(), + "m_attributes should be populated for each AstVar"); + const auto& attrA = m_attributes.at(ap); + const auto& attrB = m_attributes.at(bp); if (attrA.anonOk != attrB.anonOk) { // Anons before non-anons return attrA.anonOk; } @@ -209,58 +211,42 @@ class VariableOrder final { } void orderModuleVars(AstNodeModule* modp) { - std::vector varps; - // Unlink all module variables from the module, compute attributes for (AstNode *nodep = modp->stmtsp(), *nextp; nodep; nodep = nextp) { nextp = nodep->nextp(); if (AstVar* const varp = VN_CAST(nodep, Var)) { - // Unlink, add to vector - varp->unlinkFrBack(); - varps.push_back(varp); + m_varps.push_back(varp); + // Compute attributes up front - auto& attributes = m_attributes(varp); // Stratum const int sigbytes = varp->dtypeSkipRefp()->widthAlignBytes(); - attributes.stratum = (v3Global.opt.hierChild() && varp->isPrimaryIO()) ? 0 - : (varp->isUsedClock() && varp->widthMin() == 1) ? 1 - : VN_IS(varp->dtypeSkipRefp(), UnpackArrayDType) ? 9 - : (varp->basicp() && varp->basicp()->isOpaque()) ? 8 - : (varp->isScBv() || varp->isScBigUint()) ? 7 - : (sigbytes == 8) ? 6 - : (sigbytes == 4) ? 5 - : (sigbytes == 2) ? 3 - : (sigbytes == 1) ? 2 - : 10; - // Anonymous structure ok - attributes.anonOk = EmitCBase::isAnonOk(varp); + const uint8_t stratum = (v3Global.opt.hierChild() && varp->isPrimaryIO()) ? 0 + : (varp->isUsedClock() && varp->widthMin() == 1) ? 1 + : VN_IS(varp->dtypeSkipRefp(), UnpackArrayDType) ? 9 + : (varp->basicp() && varp->basicp()->isOpaque()) ? 8 + : (varp->isScBv() || varp->isScBigUint()) ? 7 + : (sigbytes == 8) ? 6 + : (sigbytes == 4) ? 5 + : (sigbytes == 2) ? 3 + : (sigbytes == 1) ? 2 + : 10; + m_attributes.emplace(varp, VarAttributes{stratum, EmitCBase::isAnonOk(varp)}); } } - if (!varps.empty()) { - // Sort variables + if (!m_varps.empty()) { if (!v3Global.opt.mtasks()) { - simpleSortVars(varps); + simpleSortVars(m_varps); } else { - tspSortVars(varps); + tspSortVars(m_varps); } - - // Insert them back under the module, in the new order, but at - // the front of the list so they come out first in dumps/XML. - auto it = varps.cbegin(); - AstVar* const firstp = *it++; - for (; it != varps.cend(); ++it) firstp->addNext(*it); - if (AstNode* const stmtsp = modp->stmtsp()) { - stmtsp->unlinkFrBackWithNext(); - AstNode::addNext(firstp, stmtsp); - } - modp->addStmtsp(firstp); } } public: - static void processModule(AstNodeModule* modp, const MTaskAffinityMap& mTaskAffinity) { - VariableOrder{modp, mTaskAffinity}; + static void processModule(AstNodeModule* modp, const MTaskAffinityMap& mTaskAffinity, + std::vector& varps) VL_MT_STABLE { + VariableOrder{modp, mTaskAffinity, varps}; } }; @@ -280,11 +266,44 @@ void V3VariableOrder::orderAll(AstNetlist* netlistp) { } }); } + if (v3Global.opt.stats()) V3Stats::statsStage("variableorder-gather"); - // Order variables in each module + // Sort variables for each module + std::unordered_map> sortedVars; + { + V3ThreadScope threadScope; + + for (AstNodeModule* modp = v3Global.rootp()->modulesp(); modp; + modp = VN_AS(modp->nextp(), NodeModule)) { + std::vector& varps = sortedVars[modp]; + threadScope.enqueue([modp, mTaskAffinity, &varps]() { + VariableOrder::processModule(modp, mTaskAffinity, varps); + }); + } + } + if (v3Global.opt.stats()) V3Stats::statsStage("variableorder-sort"); + + // Insert them back under the module, in the new order, but at + // the front of the list so they come out first in dumps/XML. for (AstNodeModule* modp = v3Global.rootp()->modulesp(); modp; modp = VN_AS(modp->nextp(), NodeModule)) { - VariableOrder::processModule(modp, mTaskAffinity); + const std::vector& varps = sortedVars[modp]; + + if (!varps.empty()) { + auto it = varps.cbegin(); + AstVar* const firstp = *it++; + firstp->unlinkFrBack(); + for (; it != varps.cend(); ++it) { + AstVar* const varp = *it; + varp->unlinkFrBack(); + firstp->addNext(varp); + } + if (AstNode* const stmtsp = modp->stmtsp()) { + stmtsp->unlinkFrBackWithNext(); + AstNode::addNext(firstp, stmtsp); + } + modp->addStmtsp(firstp); + } } // Done diff --git a/src/V3VariableOrder.h b/src/V3VariableOrder.h index 9f27ea414..3816ca3a0 100644 --- a/src/V3VariableOrder.h +++ b/src/V3VariableOrder.h @@ -26,7 +26,7 @@ class AstNetlist; class V3VariableOrder final { public: - static void orderAll(AstNetlist*) VL_MT_DISABLED; + static void orderAll(AstNetlist*); }; #endif // Guard