diff --git a/Changes b/Changes index 1fa981e3c..65ee0fd39 100644 --- a/Changes +++ b/Changes @@ -20,7 +20,6 @@ Verilator 4.225 devel * Fix incorrect tristate logic (#3399) [shareefj, Vighnesh Iyer] * Fix segfault exporting non-existant package (#3535). * Fix case statement comparing string literal (#3544). [Gustav Svensk] -* Improve Verilation speed with --threads on large designs. [Geza Lore] Verilator 4.224 2022-06-19 diff --git a/src/V3Graph.h b/src/V3Graph.h index a18fb5dfc..da096ab2f 100644 --- a/src/V3Graph.h +++ b/src/V3Graph.h @@ -67,7 +67,7 @@ public: return names[m_e]; } // METHODS unique to this class - constexpr GraphWay invert() const { return GraphWay{m_e ^ 1}; } + constexpr GraphWay invert() const { return m_e == FORWARD ? REVERSE : FORWARD; } constexpr bool forward() const { return m_e == FORWARD; } constexpr bool reverse() const { return m_e != FORWARD; } }; diff --git a/src/V3PairingHeap.h b/src/V3PairingHeap.h deleted file mode 100644 index c1f5f5342..000000000 --- a/src/V3PairingHeap.h +++ /dev/null @@ -1,293 +0,0 @@ -// -*- mode: C++; c-file-style: "cc-mode" -*- -//************************************************************************* -// DESCRIPTION: Verilator: Pairing Heap data structure -// -// Code available from: https://verilator.org -// -//************************************************************************* -// -// Copyright 2003-2022 by Wilson Snyder. This program is free software; you -// can redistribute it and/or modify it under the terms of either the GNU -// Lesser General Public License Version 3 or the Perl Artistic License -// Version 2.0. 
-// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 -// -//************************************************************************* - -#ifndef VERILATOR_V3PAIRINGHEAP_H_ -#define VERILATOR_V3PAIRINGHEAP_H_ - -#include "config_build.h" -#include "verilatedos.h" - -#include "V3Error.h" - -//============================================================================= -// Pairing heap (max-heap) with increase key and delete. -// -// While this is written as a generic data structure, it's interface and -// implementation is finely tuned for it's use by V3Parm_tition, and is critical -// to verilaton performance, so be very careful changing anything or adding any -// new operations that would impact either memory usage, or performance of the -// existing operations. This data structure is fully deterministic, meaning -// the order in which elements with equal keys are retrieved only depends on -// the order of operations performed on the heap. -//============================================================================= - -template -class PairingHeap final { -public: - struct Node; - - // Just a pointer to a heap Node, but with special accessors to help keep back pointers - // consistent. 
- struct Link { - Node* m_ptr = nullptr; // The managed pointer - - Link() = default; - VL_UNCOPYABLE(Link); - - // Make the pointer point to the target, and the target's owner pointer to this pointer - VL_ATTR_ALWINLINE void link(Node* targetp) { - m_ptr = targetp; - if (!targetp) return; -#if VL_DEBUG - UASSERT(!targetp->m_ownerpp, "Already linked"); -#endif - targetp->m_ownerpp = &m_ptr; - } - - // Make the pointer point to the target, and the target's owner pointer to this pointer - VL_ATTR_ALWINLINE void linkNonNull(Node* targetp) { - m_ptr = targetp; -#if VL_DEBUG - UASSERT(!targetp->m_ownerpp, "Already linked"); -#endif - targetp->m_ownerpp = &m_ptr; - } - - // Clear the pointer and return it's previous value - VL_ATTR_ALWINLINE Node* unlink() { - Node* const result = m_ptr; -#if VL_DEBUG - if (result) { - UASSERT(m_ptr->m_ownerpp == &m_ptr, "Bad back link"); - // Not strictly necessary to clear this, but helps debugging - m_ptr->m_ownerpp = nullptr; - } -#endif - m_ptr = nullptr; - return result; - } - - // Minimal convenience acessors and operators - VL_ATTR_ALWINLINE Node* ptr() const { return m_ptr; } - VL_ATTR_ALWINLINE operator bool() const { return m_ptr; } - VL_ATTR_ALWINLINE bool operator!() const { return !m_ptr; } - VL_ATTR_ALWINLINE Node* operator->() const { return m_ptr; } - VL_ATTR_ALWINLINE Node& operator*() const { return *m_ptr; } - }; - - // A single node in the pairing heap tree - struct Node { - Link m_next; // Next in list of sibling heaps - Link m_kids; // Head of list of child heaps - Node** m_ownerpp = nullptr; // Pointer to the Link pointer pointing to this heap - T_Key m_key; // The key in the heap - - // CONSTRUCTOR - explicit Node() = default; - VL_UNCOPYABLE(Node); - - // METHODS - VL_ATTR_ALWINLINE const T_Key& key() const { return m_key; } - VL_ATTR_ALWINLINE bool operator<(const Node& that) const { return m_key < that.m_key; } - VL_ATTR_ALWINLINE bool operator>(const Node& that) const { return that.m_key < m_key; } - - // 
Make newp take the place of this in the tree - VL_ATTR_ALWINLINE void replaceWith(Node* newp) { - *m_ownerpp = newp; // The owner pointer needs to point to the new node - if (newp) newp->m_ownerpp = m_ownerpp; // The new node needs to point to its owner - m_ownerpp = nullptr; // This node has no owner anymore - } - - // Make newp take the place of this in the tree - VL_ATTR_ALWINLINE void replaceWithNonNull(Node* newp) { - *m_ownerpp = newp; // The owner pointer needs to point to the new node - newp->m_ownerpp = m_ownerpp; // The new node needs to point to its owner - m_ownerpp = nullptr; // This node has no owner anymore - } - }; - -private: - // MEMBERS - - // The root of the heap. Note: We do not reduce lists during insertion/removal etc, unless we - // absolutely have to. This means the root can become a list. This is ok, we will reduce - // lazily when requesting the minimum element. - mutable Link m_root; - - // CONSTRUCTORS - VL_UNCOPYABLE(PairingHeap); - -public: - explicit PairingHeap() = default; - - // METHODS - bool empty() const { return !m_root; } - - // Insert given node into this heap with given key. - void insert(Node* nodep, T_Key key) { - // Update key of node - nodep->m_key = key; - insert(nodep); - } - - // Insert given node into this heap with key already set in the node - void insert(Node* nodep) { -#if VL_DEBUG - UASSERT(!nodep->m_ownerpp && !nodep->m_next && !nodep->m_kids, "Already linked"); -#endif - // Just stick it at the front of the root list - nodep->m_next.link(m_root.unlink()); - m_root.linkNonNull(nodep); - } - - // Remove given node only from the heap it is contained in - void remove(Node* nodep) { - if (!nodep->m_next) { - // If the node does not have siblings, replace it with its children (might be empty). - nodep->replaceWith(nodep->m_kids.unlink()); - } else if (!nodep->m_kids) { - // If it has siblings but no children, replace it with the siblings. 
- nodep->replaceWithNonNull(nodep->m_next.unlink()); - } else { - // If it has both siblings and children, reduce the children and splice that - // reduced heap in place of this node - Node* const reducedKidsp = reduce(nodep->m_kids.unlink()); - reducedKidsp->m_next.linkNonNull(nodep->m_next.unlink()); - nodep->replaceWithNonNull(reducedKidsp); - } - } - - // Returns the largest element in the heap - Node* max() const { - // Heap might be empty - if (!m_root) return nullptr; - // If the root have siblings reduce them - if (m_root->m_next) m_root.linkNonNull(reduce(m_root.unlink())); - // The root element is the largest - return m_root.ptr(); - } - - // Returns the second-largest element in the heap. - // This is only valid to call if 'max' returned a valid element. - Node* secondMax() const { -#if VL_DEBUG - UASSERT(m_root, "'max' would have returned nullptr"); - UASSERT(!m_root->m_next, "'max' would have reduced"); -#endif - // If there are no children, there is no second element - if (!m_root->m_kids) return nullptr; - // If there are multiple children, reduce them - if (m_root->m_kids->m_next) m_root->m_kids.linkNonNull(reduce(m_root->m_kids.unlink())); - // Return the now singular child, which is the second-largest element - return m_root->m_kids.ptr(); - } - - // Increase the key of the given node to the given new value - template - void increaseKey(Node* nodep, T_Update value) { - // Update the key - nodep->m_key.increase(value); - // Increasing the key of the root is easy - if (nodep == m_root.ptr()) return; - // Otherwise we do have a little work to do - if (!nodep->m_kids) { - // If the node has no children, replace it with its siblings (migtht be null) - nodep->replaceWith(nodep->m_next.unlink()); - } else if (!nodep->m_next) { - // If the node has no siblings, replace it with its children - nodep->replaceWithNonNull(nodep->m_kids.unlink()); - } else { - // The node has both children and siblings. 
Splice the first child in the place of the - // node, and extract the rest of the children with the node - Node* const kidsp = nodep->m_kids.unlink(); - nodep->m_kids.link(kidsp->m_next.unlink()); - kidsp->m_next.linkNonNull(nodep->m_next.unlink()); - nodep->replaceWithNonNull(kidsp); - } - // Just stick the increased node a the front of the root list - nodep->m_next.linkNonNull(m_root.unlink()); - m_root.linkNonNull(nodep); - } - -private: - // Meld (merge) two heaps rooted at the given nodes, return the root of the new heap - VL_ATTR_ALWINLINE static Node* merge(Node* ap, Node* bp) { -#if VL_DEBUG - UASSERT(!ap->m_ownerpp && !ap->m_next, "Not root a"); - UASSERT(!bp->m_ownerpp && !bp->m_next, "Not root b"); -#endif - if (*ap > *bp) { // bp goes under ap - bp->m_next.link(ap->m_kids.unlink()); - ap->m_kids.linkNonNull(bp); - return ap; - } else { // ap goes under bp - ap->m_next.link(bp->m_kids.unlink()); - bp->m_kids.linkNonNull(ap); - return bp; - } - } - - // Reduces the list of nodes starting at the given node into a single node that is returned - VL_ATTR_NOINLINE static Node* reduce(Node* nodep) { -#if VL_DEBUG - UASSERT(!nodep->m_ownerpp, "Node is linked"); -#endif - // If there is only one node in the list, then there is nothing to do - if (!nodep->m_next) return nodep; - // The result node - Node* resultp = nullptr; - // Pairwise merge the child nodes - while (nodep) { - // Pop off the first nodes - Node* const ap = nodep; - // If we have an odd number of nodes, prepend the unpaired one onto the result list - if (!nodep->m_next) { - ap->m_next.link(resultp); - resultp = ap; - break; - } - // Pop off the second nodes - Node* const bp = nodep->m_next.unlink(); - // Keep hold of the rest of the list - nodep = bp->m_next.unlink(); - // Merge the current pair - Node* const mergedp = merge(ap, bp); - // Prepend the merged pair to the result list - mergedp->m_next.link(resultp); - resultp = mergedp; - } - // Now merge-reduce the merged pairs - while 
(resultp->m_next) { - // Pop first two results - Node* const ap = resultp; - Node* const bp = resultp->m_next.unlink(); - // Keep hold of the rest of the list - resultp = bp->m_next.unlink(); - // Merge the current pair - Node* const mergedp = merge(ap, bp); - // Prepend the merged pair to the result list - mergedp->m_next.link(resultp); - resultp = mergedp; - } - // Done - return resultp; - } -}; - -// The PairingHeap itself should be a simple pointer and nothing more -static_assert(sizeof(PairingHeap) == sizeof(PairingHeap::Node*), "Should be a pointer"); - -#endif // Guard diff --git a/src/V3Partition.cpp b/src/V3Partition.cpp index bf537a65c..5b1474e91 100644 --- a/src/V3Partition.cpp +++ b/src/V3Partition.cpp @@ -22,29 +22,23 @@ #include "V3Config.h" #include "V3EmitCBase.h" #include "V3File.h" +#include "V3GraphAlg.h" #include "V3GraphStream.h" #include "V3InstrCount.h" #include "V3Os.h" -#include "V3PairingHeap.h" #include "V3PartitionGraph.h" #include "V3Scoreboard.h" #include "V3Stats.h" #include "V3UniqueNames.h" #include -#include #include #include -#include #include -#include -class LogicMTask; -class MTaskEdge; class MergeCandidate; -class SiblingMC; -// ###################################################################### +//###################################################################### // Partitioner tunable settings: // // Before describing these settings, a bit of background: @@ -76,14 +70,14 @@ class SiblingMC; // skipping the enumeration of some siblings on a few vertices does not // have a large impact on the result of the partitioner. // -// If your vertices are small, the limit (at 26) approaches a no-op. Hence +// If your vertices are small, the limit (at 25) approaches a no-op. Hence // there's basically no cost to applying this limit even when we don't // expect huge vertices. // // If you don't care about partitioner runtime and you want the most // aggressive partition, set the limit very high. 
If you have huge // vertices, leave this as is. -constexpr unsigned PART_SIBLING_EDGE_LIMIT = 26; +constexpr unsigned PART_SIBLING_EDGE_LIMIT = 25; // PART_STEPPED_COST (defined/undef) // @@ -149,34 +143,10 @@ static void partCheckCachedScoreVsActual(uint32_t cached, uint32_t actual) { #endif } -//============================================================================= -// We keep MTaskEdge graph edges in a PairingHeap, sorted by score and id - -struct EdgeKey { - // Node: Structure layout chosen to minimize padding in PairingHeao<*>::Node - uint64_t m_id; // Unique ID part of edge score - uint32_t m_score; // Score part of ID - void increase(uint32_t score) { -#if VL_DEBUG - UASSERT(score >= m_score, "Must increase"); -#endif - m_score = score; - } - bool operator<(const EdgeKey& other) const { - // First by Score then by ID - return m_score < other.m_score || (m_score == other.m_score && m_id < other.m_id); - } -}; - -using EdgeHeap = PairingHeap; - -//============================================================================= +//###################################################################### // LogicMTask class LogicMTask final : public AbstractLogicMTask { - template - friend class PartPropagateCp; - public: // TYPES using VxList = std::list; @@ -187,6 +157,55 @@ public: } }; + // This adaptor class allows the PartPropagateCp class to be somewhat + // independent of the LogicMTask class + // - PartPropagateCp can thus be declared before LogicMTask + // - PartPropagateCp could be reused with graphs of other node types + // in the future, using another Accessor adaptor. 
+ class CpCostAccessor final { + public: + CpCostAccessor() = default; + ~CpCostAccessor() = default; + // Return cost of this node + uint32_t cost(const V3GraphVertex* vxp) const { + const LogicMTask* const mtaskp = static_cast(vxp); + return mtaskp->stepCost(); + } + // Return stored CP to this node + uint32_t critPathCost(const V3GraphVertex* vxp, GraphWay way) const { + const LogicMTask* const mtaskp = static_cast(vxp); + return mtaskp->critPathCost(way); + } + // Store a new CP to this node + void setCritPathCost(V3GraphVertex* vxp, GraphWay way, uint32_t cost) const { + LogicMTask* const mtaskp = static_cast(vxp); + mtaskp->setCritPathCost(way, cost); + } + // Notify vxp that the wayward CP at the throughp-->vxp edge + // has increased to 'cp'. (vxp is wayward from throughp.) + // This is our cue to update vxp's m_edges[!way][throughp]. + void notifyEdgeCp(V3GraphVertex* vxp, GraphWay way, V3GraphVertex* throuvhVxp, + uint32_t cp) const { + LogicMTask* const updateVxp = static_cast(vxp); + LogicMTask* const lthrouvhVxp = static_cast(throuvhVxp); + EdgeSet& edges = updateVxp->m_edges[way.invert()]; + const auto it = edges.find(lthrouvhVxp); + if (cp > it->second) edges.update(it, cp); + } + // Check that CP matches that of the longest edge wayward of vxp. + void checkNewCpVersusEdges(V3GraphVertex* vxp, GraphWay way, uint32_t cp) const { + LogicMTask* const mtaskp = static_cast(vxp); + const EdgeSet& edges = mtaskp->m_edges[way.invert()]; + // This is mtaskp's relative with longest !wayward inclusive CP: + const auto edgeIt = edges.rbegin(); + const uint32_t edgeCp = edgeIt->second; + UASSERT_OBJ(edgeCp == cp, vxp, "CP doesn't match longest wayward edge"); + } + + private: + VL_UNCOPYABLE(CpCostAccessor); + }; + private: // MEMBERS @@ -212,21 +231,21 @@ private: // while searching for a path. 
uint64_t m_generation = 0; - // Store a set of forward relatives so we can quickly check if we have a given child - std::unordered_set m_edgeSet; - // Store the outgoing and incoming edges in a heap sorted by the critical path length - std::array m_edgeHeap; - - // SiblingMC for which storage is owned by this MTask - std::set m_ownSibs; - // SiblingMC for which storage is owned by the opposite MTask - std::set m_farSibps; + // Redundant with the V3GraphEdge's, store a map of relatives so we can + // quickly check if we have a given parent or child. + // + // 'm_edges[way]' maps a wayward relative to the !way critical path at + // our edge with them. The SortByValueMap supports iterating over + // relatives in longest-to-shortest CP order. We rely on this ordering + // in more than one place. + using EdgeSet = SortByValueMap; + std::array m_edges; public: // CONSTRUCTORS LogicMTask(V3Graph* graphp, MTaskMoveVertex* mtmvVxp) : AbstractLogicMTask{graphp} { - for (uint32_t& item : m_critPathCost) item = 0; + for (unsigned int& i : m_critPathCost) i = 0; if (mtmvVxp) { // Else null for test m_vertices.push_back(mtmvVxp); if (const OrderLogicVertex* const olvp = mtmvVxp->logicp()) { @@ -240,9 +259,6 @@ public: } // METHODS - std::set& ownSibs() { return m_ownSibs; }; - std::set& farSibs() { return m_farSibps; }; - void moveAllVerticesFrom(LogicMTask* otherp) { // splice() is constant time m_vertices.splice(m_vertices.end(), otherp->m_vertices); @@ -280,37 +296,32 @@ public: logcost = logcost / 20.0; const uint32_t stepCost = static_cast(exp(logcost)); -#if VL_DEBUG UASSERT_STATIC(stepCost >= cost, "stepped cost error exceeded"); UASSERT_STATIC(stepCost <= ((cost * 11 / 10)), "stepped cost error exceeded"); -#endif return stepCost; #else return cost; #endif } - template - void addRelativeEdge(MTaskEdge* edgep); - template - void removeRelativeEdge(MTaskEdge* edgep); - - void addRelativeMTask(LogicMTask* relativep) { - // Add the relative to connecting edge map - 
VL_ATTR_UNUSED const bool exits = !m_edgeSet.emplace(relativep).second; + void addRelative(GraphWay way, LogicMTask* relativep) { + // value is !way cp to this edge + const uint32_t cp = relativep->stepCost() + relativep->critPathCost(way.invert()); + VL_ATTR_UNUSED const bool exits = !m_edges[way].emplace(relativep, cp).second; #if VL_DEBUG - UASSERT(!exits, "Adding existing relative"); + UASSERT(!exits, "Adding existing edge"); #endif } - void removeRelativeMTask(LogicMTask* relativep) { - VL_ATTR_UNUSED const size_t removed = m_edgeSet.erase(relativep); -#if VL_DEBUG - UASSERT(removed, "Relative should have been in set"); -#endif + void removeRelative(GraphWay way, LogicMTask* relativep) { m_edges[way].erase(relativep); } + bool hasRelative(GraphWay way, LogicMTask* relativep) { return m_edges[way].has(relativep); } + void checkRelativesCp(GraphWay way) const { + for (const auto& edge : vlstd::reverse_view(m_edges[way])) { + const LogicMTask* const relativep = edge.first; + const uint32_t cachedCp = edge.second; + const uint32_t cp = relativep->critPathCost(way.invert()) + relativep->stepCost(); + partCheckCachedScoreVsActual(cachedCp, cp); + } } - bool hasRelativeMTask(LogicMTask* relativep) const { return m_edgeSet.count(relativep); } - - void checkRelativesCp(GraphWay way) const; virtual string name() const override { // Display forward and reverse critical path costs. 
This gives a quick @@ -323,7 +334,27 @@ public: void setCritPathCost(GraphWay way, uint32_t cost) { m_critPathCost[way] = cost; } uint32_t critPathCost(GraphWay way) const { return m_critPathCost[way]; } - uint32_t critPathCostWithout(GraphWay way, const V3GraphEdge* withoutp) const; + uint32_t critPathCostWithout(GraphWay way, const V3GraphEdge* withoutp) const { + // Compute the critical path cost wayward to this node, without + // considering edge 'withoutp' + UASSERT(this == withoutp->furtherp(way), "In critPathCostWithout(), edge 'withoutp' must " + "further to 'this'"); + + // Iterate through edges until we get a relative other than + // wayEdgeEndp(way, withoutp). This should take 2 iterations max. + const EdgeSet& edges = m_edges[way.invert()]; + uint32_t result = 0; + for (const auto& edge : vlstd::reverse_view(edges)) { + if (edge.first != withoutp->furtherp(way.invert())) { + // Use the cached cost. It could be a small overestimate + // due to stepping. This is consistent with critPathCost() + // which also returns the cached cost. 
+ result = edge.second; + break; + } + } + return result; + } private: static bool pathExistsFromInternal(LogicMTask* fromp, LogicMTask* top, @@ -380,7 +411,65 @@ public: return pathExistsFromInternal(fromp, top, excludedEdgep, incGeneration()); } - static void dumpCpFilePrefixed(const V3Graph* graphp, const string& nameComment); + static void dumpCpFilePrefixed(const V3Graph* graphp, const string& nameComment) { + const string filename = v3Global.debugFilename(nameComment) + ".txt"; + UINFO(1, "Writing " << filename << endl); + const std::unique_ptr ofp{V3File::new_ofstream(filename)}; + std::ostream* const osp = &(*ofp); // &* needed to deref unique_ptr + if (osp->fail()) v3fatalStatic("Can't write " << filename); + + // Find start vertex with longest CP + const LogicMTask* startp = nullptr; + for (const V3GraphVertex* vxp = graphp->verticesBeginp(); vxp; + vxp = vxp->verticesNextp()) { + const LogicMTask* const mtaskp = static_cast(vxp); + if (!startp) { + startp = mtaskp; + continue; + } + if (mtaskp->cost() + mtaskp->critPathCost(GraphWay::REVERSE) + > startp->cost() + startp->critPathCost(GraphWay::REVERSE)) { + startp = mtaskp; + } + } + + // Follow the entire critical path + std::vector path; + uint32_t totalCost = 0; + for (const LogicMTask* nextp = startp; nextp;) { + path.push_back(nextp); + totalCost += nextp->cost(); + + const EdgeSet& children = nextp->m_edges[GraphWay::FORWARD]; + const EdgeSet::const_reverse_iterator it = children.rbegin(); + if (it == children.rend()) { + nextp = nullptr; + } else { + nextp = it->first; + } + } + + *osp << "totalCost = " << totalCost + << " (should match the computed critical path cost (CP) for the graph)\n"; + + // Dump + for (const LogicMTask* mtaskp : path) { + *osp << "begin mtask with cost " << mtaskp->cost() << '\n'; + for (VxList::const_iterator lit = mtaskp->vertexListp()->begin(); + lit != mtaskp->vertexListp()->end(); ++lit) { + const OrderLogicVertex* const logicp = (*lit)->logicp(); + if (!logicp) 
continue; + if (false) { + // Show nodes only + *osp << "> "; + logicp->nodep()->dumpTree(*osp); + } else { + // Show nodes with hierarchical costs + V3InstrCount::count(logicp->nodep(), false, osp); + } + } + } + } private: VL_DEBUG_FUNC; // Declare debug() @@ -401,20 +490,11 @@ public: } }; -struct MergeCandidateKey { - // Note: Structure layout chosen to minimize padding in PairingHeao<*>::Node - uint64_t m_id; // Unique ID part of edge score - uint32_t m_score; // Score part of ID - bool operator<(const MergeCandidateKey& other) const { - // First by Score then by ID, but notice that we want minimums using a max-heap, so reverse - return m_score > other.m_score || (m_score == other.m_score && m_id > other.m_id); - } -}; +class SiblingMC; +class MTaskEdge; -using MergeCandidateScoreboard = V3Scoreboard; - -// Information associated with scoreboarding a merge candidate -class MergeCandidate VL_NOT_FINAL : public MergeCandidateScoreboard::Node { +// Information associated with scoreboarding an MTask +class MergeCandidate VL_NOT_FINAL { private: // Only the known subclasses can create or delete one of these friend class SiblingMC; @@ -427,17 +507,18 @@ private: // using another bit of the id to denote the actual subtype. // By using the bottom bits for flags, we can still use < to compare IDs without masking. 
- // <63:1> Serial number for ordering, <0> subtype (SiblingMC) - static constexpr uint64_t IS_SIBLING_MASK = 1ULL << 0; - static constexpr uint64_t ID_INCREMENT = 1ULL << 1; + uint64_t m_id; // <63:2> Serial number for ordering, <1> subtype (SiblingMC), <0> removed + static constexpr uint64_t REMOVED_MASK = 1ULL << 0; + static constexpr uint64_t IS_SIBLING_MASK = 1ULL << 1; + static constexpr uint64_t ID_INCREMENT = 1ULL << 2; - bool isSiblingMC() const { return m_key.m_id & IS_SIBLING_MASK; } + bool isSiblingMC() const { return m_id & IS_SIBLING_MASK; } // CONSTRUCTORS explicit MergeCandidate(bool isSiblingMC) { static uint64_t serial = 0; serial += ID_INCREMENT; // +ID_INCREMENT so doesn't set the special bottom bits - m_key.m_id = serial | (isSiblingMC * IS_SIBLING_MASK); + m_id = serial | (isSiblingMC * IS_SIBLING_MASK); } ~MergeCandidate() = default; @@ -449,33 +530,35 @@ public: const MTaskEdge* toMTaskEdge() const; // Instead of dynamic_cast bool mergeWouldCreateCycle() const; // Instead of virtual method - inline void rescore(); - uint32_t score() const { return m_key.m_score; } - - static MergeCandidate* heapNodeToElem(MergeCandidateScoreboard::Node* nodep) { - return static_cast(nodep); - } + bool removedFromSb() const { return (m_id & REMOVED_MASK) != 0; } + void removedFromSb(bool /*removed*/) { m_id |= REMOVED_MASK; } + void clearRemovedFromSb() { m_id &= ~REMOVED_MASK; } + bool operator<(const MergeCandidate& other) const { return m_id < other.m_id; } }; -static_assert(sizeof(MergeCandidate) == sizeof(MergeCandidateScoreboard::Node), - "Should not have a vtable"); +static_assert(sizeof(MergeCandidate) == sizeof(uint64_t), "Should not have a vtable"); // A pair of associated LogicMTask's that are merge candidates for sibling // contraction class SiblingMC final : public MergeCandidate { private: - LogicMTask* const m_ap; - LogicMTask* const m_bp; + LogicMTask* m_ap; + LogicMTask* m_bp; public: // CONSTRUCTORS SiblingMC() = delete; 
SiblingMC(LogicMTask* ap, LogicMTask* bp) - : MergeCandidate{/* isSiblingMC: */ true} - , m_ap{ap} - , m_bp{bp} { - // operator< and storage management depends on this - UASSERT(ap->id() > bp->id(), "Should be ordered"); + : MergeCandidate{/* isSiblingMC: */ true} { + // Assign 'ap' and 'bp' in a canonical order, so we can more easily + // compare pairs of SiblingMCs + if (ap->id() > bp->id()) { + m_ap = ap; + m_bp = bp; + } else { + m_ap = bp; + m_bp = ap; + } } ~SiblingMC() = default; // METHODS @@ -497,23 +580,17 @@ static_assert(sizeof(SiblingMC) == sizeof(MergeCandidate) + 2 * sizeof(LogicMTas // GraphEdge for the MTask graph class MTaskEdge final : public V3GraphEdge, public MergeCandidate { - friend class LogicMTask; - template - friend class PartPropagateCp; - - // MEMBERS - // This edge can be in 2 EdgeHeaps, one forward and one reverse. We allocate the heap nodes - // directly within the edge as they are always required and this makes association cheap. - EdgeHeap::Node m_edgeHeapNode[GraphWay::NUM_WAYS]; - public: // CONSTRUCTORS MTaskEdge(V3Graph* graphp, LogicMTask* fromp, LogicMTask* top, int weight) : V3GraphEdge{graphp, fromp, top, weight} , MergeCandidate{/* isSiblingMC: */ false} { - fromp->addRelativeMTask(top); - fromp->addRelativeEdge(this); - top->addRelativeEdge(this); + fromp->addRelative(GraphWay::FORWARD, top); + top->addRelative(GraphWay::REVERSE, fromp); + } + virtual ~MTaskEdge() override { + fromMTaskp()->removeRelative(GraphWay::FORWARD, toMTaskp()); + toMTaskp()->removeRelative(GraphWay::REVERSE, fromMTaskp()); } // METHODS LogicMTask* furtherMTaskp(GraphWay way) const { @@ -524,135 +601,28 @@ public: bool mergeWouldCreateCycle() const { return LogicMTask::pathExistsFrom(fromMTaskp(), toMTaskp(), this); } + static MTaskEdge* cast(V3GraphEdge* edgep) { + if (!edgep) return nullptr; + MTaskEdge* const resultp = dynamic_cast(edgep); + UASSERT(resultp, "Failed to cast in MTaskEdge::cast"); + return resultp; + } // Following initial 
assignment of critical paths, clear this MTaskEdge // out of the edge-map for each node and reinsert at a new location // with updated critical path. void resetCriticalPaths() { LogicMTask* const fromp = fromMTaskp(); LogicMTask* const top = toMTaskp(); - fromp->removeRelativeEdge(this); - top->removeRelativeEdge(this); - fromp->addRelativeEdge(this); - top->addRelativeEdge(this); - } - - uint32_t cachedCp(GraphWay way) const { return m_edgeHeapNode[way].key().m_score; } - - // Convert from the address of the m_edgeHeapNode[way] in an MTaskEdge back to the MTaskEdge - static const MTaskEdge* toEdge(GraphWay way, const EdgeHeap::Node* nodep) { - // Offset of the node within the MTaskEdge - const size_t offset - = reinterpret_cast(&(reinterpret_cast(0)->m_edgeHeapNode[way])); - return reinterpret_cast(reinterpret_cast(nodep) - offset); + fromp->removeRelative(GraphWay::FORWARD, top); + top->removeRelative(GraphWay::REVERSE, fromp); + fromp->addRelative(GraphWay::FORWARD, top); + top->addRelative(GraphWay::REVERSE, fromp); } private: VL_UNCOPYABLE(MTaskEdge); }; -template -void LogicMTask::addRelativeEdge(MTaskEdge* edgep) { - constexpr GraphWay way{T_Way}; - constexpr GraphWay inv = way.invert(); - // Add to the edge heap - LogicMTask* const relativep = edgep->furtherMTaskp(way); - // Value is !way cp to this edge - const uint32_t cp = relativep->stepCost() + relativep->critPathCost(inv); - // - m_edgeHeap[way].insert(&edgep->m_edgeHeapNode[way], {relativep->id(), cp}); -} - -template -void LogicMTask::removeRelativeEdge(MTaskEdge* edgep) { - constexpr GraphWay way{T_Way}; - // Remove from the edge heap - m_edgeHeap[way].remove(&edgep->m_edgeHeapNode[way]); -} - -void LogicMTask::checkRelativesCp(GraphWay way) const { - for (V3GraphEdge* edgep = beginp(way); edgep; edgep = edgep->nextp(way)) { - const LogicMTask* const relativep = static_cast(edgep->furtherp(way)); - const uint32_t cachedCp = static_cast(edgep)->cachedCp(way); - const uint32_t cp = 
relativep->critPathCost(way.invert()) + relativep->stepCost(); - partCheckCachedScoreVsActual(cachedCp, cp); - } -} - -uint32_t LogicMTask::critPathCostWithout(GraphWay way, const V3GraphEdge* withoutp) const { - // Compute the critical path cost wayward to this node, without considering edge 'withoutp'. - // We need to look at two edges at most, the critical path if that is not via 'withoutp', - // or the second-worst path, if the critical path is via 'withoutp'. -#if VL_DEBUG - UASSERT(withoutp->furtherp(way) == this, - "In critPathCostWithout(), edge 'withoutp' must further to 'this'"); -#endif - const GraphWay inv = way.invert(); - const EdgeHeap& edgeHeap = m_edgeHeap[inv]; - const EdgeHeap::Node* const maxp = edgeHeap.max(); - if (!maxp) return 0; - if (MTaskEdge::toEdge(inv, maxp) != withoutp) return maxp->key().m_score; - const EdgeHeap::Node* const secp = edgeHeap.secondMax(); - if (!secp) return 0; - return secp->key().m_score; -} - -void LogicMTask::dumpCpFilePrefixed(const V3Graph* graphp, const string& nameComment) { - const string filename = v3Global.debugFilename(nameComment) + ".txt"; - UINFO(1, "Writing " << filename << endl); - const std::unique_ptr ofp{V3File::new_ofstream(filename)}; - std::ostream* const osp = &(*ofp); // &* needed to deref unique_ptr - if (osp->fail()) v3fatalStatic("Can't write " << filename); - - // Find start vertex with longest CP - LogicMTask* startp = nullptr; - for (V3GraphVertex* vxp = graphp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { - LogicMTask* const mtaskp = static_cast(vxp); - if (!startp) { - startp = mtaskp; - continue; - } - if (mtaskp->cost() + mtaskp->critPathCost(GraphWay::REVERSE) - > startp->cost() + startp->critPathCost(GraphWay::REVERSE)) { - startp = mtaskp; - } - } - - // Follow the entire critical path - std::vector path; - uint32_t totalCost = 0; - for (LogicMTask* nextp = startp; nextp;) { - path.push_back(nextp); - totalCost += nextp->cost(); - - if (EdgeHeap::Node* const maxp = 
nextp->m_edgeHeap[GraphWay::FORWARD].max()) { - nextp = MTaskEdge::toEdge(GraphWay::FORWARD, maxp)->toMTaskp(); - } else { - nextp = nullptr; - } - } - - *osp << "totalCost = " << totalCost - << " (should match the computed critical path cost (CP) for the graph)\n"; - - // Dump - for (const LogicMTask* mtaskp : path) { - *osp << "begin mtask with cost " << mtaskp->cost() << '\n'; - for (VxList::const_iterator lit = mtaskp->vertexListp()->begin(); - lit != mtaskp->vertexListp()->end(); ++lit) { - const OrderLogicVertex* const logicp = (*lit)->logicp(); - if (!logicp) continue; - if (false) { - // Show nodes only - *osp << "> "; - logicp->nodep()->dumpTree(*osp); - } else { - // Show nodes with hierarchical costs - V3InstrCount::count(logicp->nodep(), false, osp); - } - } - } -} - // Instead of dynamic cast SiblingMC* MergeCandidate::toSiblingMC() { return isSiblingMC() ? static_cast(this) : nullptr; @@ -677,40 +647,6 @@ bool MergeCandidate::mergeWouldCreateCycle() const { : static_cast(this)->mergeWouldCreateCycle(); } -static uint32_t siblingScore(const SiblingMC* sibsp) { - const LogicMTask* const ap = sibsp->ap(); - const LogicMTask* const bp = sibsp->bp(); - const uint32_t mergedCpCostFwd - = std::max(ap->critPathCost(GraphWay::FORWARD), bp->critPathCost(GraphWay::FORWARD)); - const uint32_t mergedCpCostRev - = std::max(ap->critPathCost(GraphWay::REVERSE), bp->critPathCost(GraphWay::REVERSE)); - return mergedCpCostRev + mergedCpCostFwd + LogicMTask::stepCost(ap->cost() + bp->cost()); -} - -static uint32_t edgeScore(const MTaskEdge* edgep) { - // Score this edge. Lower is better. The score is the new local CP - // length if we merge these mtasks. ("Local" means the longest - // critical path running through the merged node.) 
- const LogicMTask* const top = static_cast(edgep->top()); - const LogicMTask* const fromp = static_cast(edgep->fromp()); - const uint32_t mergedCpCostFwd = std::max(fromp->critPathCost(GraphWay::FORWARD), - top->critPathCostWithout(GraphWay::FORWARD, edgep)); - const uint32_t mergedCpCostRev = std::max(fromp->critPathCostWithout(GraphWay::REVERSE, edgep), - top->critPathCost(GraphWay::REVERSE)); - return mergedCpCostRev + mergedCpCostFwd + LogicMTask::stepCost(fromp->cost() + top->cost()); -} - -void MergeCandidate::rescore() { - if (const SiblingMC* const sibp = toSiblingMC()) { - m_key.m_score = siblingScore(sibp); - } else { - // The '1 +' favors merging a SiblingMC over an otherwise- - // equal-scoring MTaskEdge. The comment on selfTest() talks - // about why. - m_key.m_score = 1 + edgeScore(static_cast(this)); - } -} - // ###################################################################### // Vertex utility classes @@ -877,6 +813,7 @@ static void partCheckCriticalPaths(V3Graph* mtasksp) { // Usage: // * Client increases the cost and/or CP at a node or small set of nodes // (often a pair in practice, eg. edge contraction.) +// * Client instances a PartPropagateCp object // * Client calls PartPropagateCp::cpHasIncreased() one or more times. // Each call indicates that the inclusive CP of some "seed" vertex // has increased to a given value. @@ -886,120 +823,53 @@ static void partCheckCriticalPaths(V3Graph* mtasksp) { // * Client calls PartPropagateCp::go(). Internally, this iteratively // propagates the new CPs wayward through the graph. 
// -template -class PartPropagateCp final { - // TYPES - - // We keep pending vertices in a heap during critical path propagation - struct PendingKey { - LogicMTask* m_mtaskp; // The vertex in the heap - uint32_t m_score; // The score of this entry - void increase(uint32_t score) { -#if VL_DEBUG - UASSERT(score >= m_score, "Must increase"); -#endif - m_score = score; - } - bool operator<(const PendingKey& other) const { - if (m_score != other.m_score) return m_score < other.m_score; - return LogicMTask::CmpLogicMTask{}(m_mtaskp, other.m_mtaskp); - } - }; - - using PendingHeap = PairingHeap; - using PendingHeapNode = typename PendingHeap::Node; +class PartPropagateCp final : GraphAlg<> { +private: // MEMBERS - PendingHeap m_pendingHeap; // Heap of pending rescores - - // We allocate this many heap nodes at once - static constexpr size_t ALLOC_CHUNK_SIZE = 128; - PendingHeapNode* m_freep = nullptr; // List of free heap nodes - std::vector> m_allocated; // Allocated heap nodes - + const GraphWay m_way; // CPs oriented in this direction: either FORWARD + // // from graph-start to current node, or REVERSE + // // from graph-end to current node. + LogicMTask::CpCostAccessor m_access; // Access cost and CPs on V3GraphVertex's. + // // confirm we only process each vertex once. 
const bool m_slowAsserts; // Enable nontrivial asserts + // Pending rescores + SortByValueMap m_pending; + std::set m_seen; // Used only with slow asserts to check mtasks visited only once public: // CONSTRUCTORS - PartPropagateCp(bool slowAsserts) - : m_slowAsserts{slowAsserts} {} + PartPropagateCp(V3Graph* graphp, GraphWay way, bool slowAsserts, + V3EdgeFuncP edgeFuncp = &V3GraphEdge::followAlwaysTrue) + : GraphAlg<>{graphp, edgeFuncp} + , m_way{way} + , m_slowAsserts{slowAsserts} {} // METHODS -private: - // Allocate a HeapNode for the given element - PendingHeapNode* allocNode() { - // If no free nodes available, then make some - if (!m_freep) { - // Allocate in chunks for efficiency - m_allocated.emplace_back(new PendingHeapNode[ALLOC_CHUNK_SIZE]); - // Set up free list pointer - m_freep = m_allocated.back().get(); - // Set up free list chain - for (size_t i = 1; i < ALLOC_CHUNK_SIZE; ++i) { - m_freep[i - 1].m_next.m_ptr = &m_freep[i]; - } - // Clear the next pointer of the last entry - m_freep[ALLOC_CHUNK_SIZE - 1].m_next.m_ptr = nullptr; - } - // Free nodes are available, pick up the first one - PendingHeapNode* const resultp = m_freep; - m_freep = resultp->m_next.m_ptr; - resultp->m_next.m_ptr = nullptr; - return resultp; - } - - // Release a heap node (make it available for future allocation) - void freeNode(PendingHeapNode* nodep) { - // Re-use the existing link pointers and simply prepend it to the free list - nodep->m_next.m_ptr = m_freep; - m_freep = nodep; - } - -public: void cpHasIncreased(V3GraphVertex* vxp, uint32_t newInclusiveCp) { - constexpr GraphWay way{T_Way}; - constexpr GraphWay inv{way.invert()}; - // For *vxp, whose CP-inclusive has just increased to // newInclusiveCp, iterate to all wayward nodes, update the edges // of each, and add each to m_pending if its overall CP has grown. 
- for (MTaskEdge *edgep = static_cast(vxp->beginp(way)), *nextp; edgep; - edgep = nextp) { - // Fetch early as likely cache miss - nextp = static_cast(edgep->nextp(way)); + for (V3GraphEdge* edgep = vxp->beginp(m_way); edgep; edgep = edgep->nextp(m_way)) { + if (!m_edgeFuncp(edgep)) continue; + LogicMTask* const relativep = static_cast(edgep->furtherp(m_way)); + m_access.notifyEdgeCp(relativep, m_way, vxp, newInclusiveCp); - LogicMTask* const relativep = edgep->furtherMTaskp(way); - EdgeHeap::Node& edgeHeapNode = edgep->m_edgeHeapNode[inv]; - if (newInclusiveCp > edgeHeapNode.key().m_score) { - relativep->m_edgeHeap[inv].increaseKey(&edgeHeapNode, newInclusiveCp); + if (m_access.critPathCost(relativep, m_way) < newInclusiveCp) { + // relativep's critPathCost() is out of step with its + // longest !wayward edge. Schedule that to be resolved. + const uint32_t newPendingVal + = newInclusiveCp - m_access.critPathCost(relativep, m_way); + const auto pair = m_pending.emplace(relativep, newPendingVal); + if (!pair.second && (newPendingVal > pair.first->second)) { + m_pending.update(pair.first, newPendingVal); + } } - - const uint32_t critPathCost = relativep->critPathCost(way); - - if (critPathCost >= newInclusiveCp) continue; - - // relativep's critPathCost() is out of step with its longest !wayward edge. - // Schedule that to be resolved. - const uint32_t newVal = newInclusiveCp - critPathCost; - - if (PendingHeapNode* const nodep = static_cast(relativep->userp())) { - // Already in heap. Increase score if needed. - if (newVal > nodep->key().m_score) m_pendingHeap.increaseKey(nodep, newVal); - continue; - } - - // Add to heap - PendingHeapNode* const nodep = allocNode(); - relativep->userp(nodep); - m_pendingHeap.insert(nodep, {relativep, newVal}); } } void go() { - constexpr GraphWay way{T_Way}; - constexpr GraphWay inv{way.invert()}; - // m_pending maps each pending vertex to the amount that it wayward // CP will grow. // @@ -1016,34 +886,27 @@ public: // once. 
And so on. // // This generalizes to multiple seed nodes also. - while (!m_pendingHeap.empty()) { - // Pop max element from heap - PendingHeapNode* const maxp = m_pendingHeap.max(); - m_pendingHeap.remove(maxp); - // Pick up values - LogicMTask* const mtaskp = maxp->key().m_mtaskp; - const uint32_t cpGrowBy = maxp->key().m_score; - // Free the heap node, we are done with it - freeNode(maxp); - mtaskp->userp(nullptr); - // Update the critPathCost of mtaskp, that was out-of-date with respect to its edges - const uint32_t startCp = mtaskp->critPathCost(way); + while (!m_pending.empty()) { + const auto it = m_pending.rbegin(); + LogicMTask* const updateMep = it->first; + const uint32_t cpGrowBy = it->second; + m_pending.erase(it); + + // For *updateMep, whose critPathCost was out-of-date with respect + // to its edges, update the critPathCost. + const uint32_t startCp = m_access.critPathCost(updateMep, m_way); const uint32_t newCp = startCp + cpGrowBy; if (VL_UNLIKELY(m_slowAsserts)) { - // Check that CP matches that of the longest edge wayward of vxp. - const uint32_t edgeCp = mtaskp->m_edgeHeap[inv].max()->key().m_score; - UASSERT_OBJ(edgeCp == newCp, mtaskp, "CP doesn't match longest wayward edge"); + m_access.checkNewCpVersusEdges(updateMep, m_way, newCp); // Confirm that we only set each node's CP once. That's an // important property of PartPropagateCp which allows it to be far // faster than a recursive algorithm on some graphs. 
- const bool first = m_seen.insert(mtaskp).second; - UASSERT_OBJ(first, mtaskp, "Set CP on node twice"); + const bool first = m_seen.insert(updateMep).second; + UASSERT_OBJ(first, updateMep, "Set CP on node twice"); } - mtaskp->setCritPathCost(way, newCp); - cpHasIncreased(mtaskp, newCp + mtaskp->stepCost()); + m_access.setCritPathCost(updateMep, m_way, newCp); + cpHasIncreased(updateMep, newCp + m_access.cost(updateMep)); } - - if (VL_UNLIKELY(m_slowAsserts)) m_seen.clear(); } private: @@ -1076,11 +939,11 @@ private: const unsigned idx1 = V3Os::rand64(rngState) % 50; const unsigned idx2 = V3Os::rand64(rngState) % 50; if (idx1 > idx2) { - if (!m_vx[idx2]->hasRelativeMTask(m_vx[idx1])) { + if (!m_vx[idx2]->hasRelative(GraphWay::FORWARD, m_vx[idx1])) { new MTaskEdge{&m_graph, m_vx[idx2], m_vx[idx1], 1}; } } else if (idx2 > idx1) { - if (!m_vx[idx1]->hasRelativeMTask(m_vx[idx2])) { + if (!m_vx[idx1]->hasRelative(GraphWay::FORWARD, m_vx[idx2])) { new MTaskEdge{&m_graph, m_vx[idx1], m_vx[idx2], 1}; } } @@ -1089,7 +952,7 @@ private: partInitCriticalPaths(&m_graph); // This SelfTest class is also the T_CostAccessor - PartPropagateCp prop(true); + PartPropagateCp prop(&m_graph, GraphWay::FORWARD, true); // Seed the propagator with every input node; // This should result in the complete graph getting all CP's assigned. @@ -1098,6 +961,9 @@ private: } // Run the propagator. + // * The setCritPathCost() routine checks that each node's CP changes + // at most once. + // * The notifyEdgeCp routine is also self checking. prop.go(); // Finally, confirm that the entire graph appears to have correct CPs. @@ -1110,7 +976,7 @@ public: // Merge edges from a LogicMtask. // -// This code removes adjacent edges. When this occurs, mark it in need +// This code removes 'hasRelative' edges. When this occurs, mark it in need // of a rescore, in case its score has fallen and we need to move it up // toward the front of the scoreboard. 
// @@ -1141,93 +1007,51 @@ public: // // Another way of stating this: this code ensures that scores of // non-transitive edges only ever increase. -static void partRedirectEdgesFrom(V3Graph* graphp, LogicMTask* recipientp, LogicMTask* donorp, - MergeCandidateScoreboard* sbp) { - - // Process outgoing edges - MTaskEdge* outNextp = static_cast(donorp->outBeginp()); - while (outNextp) { - MTaskEdge* const edgep = outNextp; - LogicMTask* const relativep = outNextp->toMTaskp(); - outNextp = static_cast(outNextp->outNextp()); - - relativep->removeRelativeEdge(edgep); - - if (recipientp->hasRelativeMTask(relativep)) { - // An edge already exists between recipient and relative of donor. - // Mark it in need of a rescore - if (sbp) { - if (sbp->contains(edgep)) sbp->remove(edgep); - MTaskEdge* const existMTaskEdgep = static_cast( - recipientp->findConnectingEdgep(GraphWay::FORWARD, relativep)); -#if VL_DEBUG - UASSERT(existMTaskEdgep, "findConnectingEdge didn't find edge"); -#endif - if (sbp->contains(existMTaskEdgep)) sbp->hintScoreChanged(existMTaskEdgep); - } - // Can nuke the edge now - VL_DO_DANGLING(edgep->unlinkDelete(), edgep); - } else { - // No existing edge between recipient and relative of donor. - // Redirect the edge from donor<->relative to recipient<->relative. 
- donorp->removeRelativeEdge(edgep); - edgep->relinkFromp(recipientp); - recipientp->addRelativeMTask(relativep); - recipientp->addRelativeEdge(edgep); - relativep->addRelativeEdge(edgep); - if (sbp) { - if (!sbp->contains(edgep)) { - sbp->add(edgep); +static void partRedirectEdgesFrom(LogicMTask* recipientp, LogicMTask* donorp, + V3Scoreboard* sbp) { + for (const auto& way : {GraphWay::FORWARD, GraphWay::REVERSE}) { + for (V3GraphEdge *edgep = donorp->beginp(way), *nextp; edgep; edgep = nextp) { + nextp = edgep->nextp(way); + MTaskEdge* const tedgep = MTaskEdge::cast(edgep); + LogicMTask* const relativep = tedgep->furtherMTaskp(way); + if (recipientp->hasRelative(way, relativep)) { + // An edge already exists between recipient and relative of donor. + // Mark it in need of a rescore + if (sbp) { + if (!tedgep->removedFromSb()) sbp->removeElem(tedgep); + const MTaskEdge* const existMTaskEdgep + = MTaskEdge::cast(recipientp->findConnectingEdgep(way, relativep)); + UASSERT(existMTaskEdgep, "findConnectingEdge didn't find edge"); + if (!existMTaskEdgep->removedFromSb()) { + sbp->hintScoreChanged(existMTaskEdgep); + } + } + VL_DO_DANGLING(edgep->unlinkDelete(), edgep); + } else { + // No existing edge between recipient and relative of donor. + // Redirect the edge from donor<->relative to recipient<->relative. 
+ if (way == GraphWay::REVERSE) { + tedgep->relinkTop(recipientp); + relativep->removeRelative(GraphWay::FORWARD, donorp); + relativep->addRelative(GraphWay::FORWARD, recipientp); + recipientp->addRelative(GraphWay::REVERSE, relativep); } else { - sbp->hintScoreChanged(edgep); + tedgep->relinkFromp(recipientp); + relativep->removeRelative(GraphWay::REVERSE, donorp); + relativep->addRelative(GraphWay::REVERSE, recipientp); + recipientp->addRelative(GraphWay::FORWARD, relativep); + } + if (sbp) { + if (tedgep->removedFromSb()) { + tedgep->clearRemovedFromSb(); + sbp->addElem(tedgep); + } else { + sbp->hintScoreChanged(tedgep); + } } } } } - - // Process incoming edges - MTaskEdge* inNextp = static_cast(donorp->inBeginp()); - while (inNextp) { - MTaskEdge* const edgep = inNextp; - LogicMTask* const relativep = inNextp->fromMTaskp(); - inNextp = static_cast(inNextp->inNextp()); - - relativep->removeRelativeMTask(donorp); - relativep->removeRelativeEdge(edgep); - - if (relativep->hasRelativeMTask(recipientp)) { - // An edge already exists between recipient and relative of donor. - // Mark it in need of a rescore - if (sbp) { - if (sbp->contains(edgep)) sbp->remove(edgep); - MTaskEdge* const existMTaskEdgep = static_cast( - recipientp->findConnectingEdgep(GraphWay::REVERSE, relativep)); -#if VL_DEBUG - UASSERT(existMTaskEdgep, "findConnectingEdge didn't find edge"); -#endif - if (sbp->contains(existMTaskEdgep)) sbp->hintScoreChanged(existMTaskEdgep); - } - VL_DO_DANGLING(edgep->unlinkDelete(), edgep); - } else { - // No existing edge between recipient and relative of donor. - // Redirect the edge from donor<->relative to recipient<->relative. 
- donorp->removeRelativeEdge(edgep); - edgep->relinkTop(recipientp); - relativep->addRelativeMTask(recipientp); - relativep->addRelativeEdge(edgep); - recipientp->addRelativeEdge(edgep); - if (sbp) { - if (!sbp->contains(edgep)) { - sbp->add(edgep); - } else { - sbp->hintScoreChanged(edgep); - } - } - } - } - - // Remove donorp from the graph - VL_DO_DANGLING(donorp->unlinkDelete(graphp), donorp); } //###################################################################### @@ -1237,6 +1061,14 @@ static void partRedirectEdgesFrom(V3Graph* graphp, LogicMTask* recipientp, Logic class PartContraction final { private: // TYPES + + // TODO: might get a little more speed by making this a + // std::unordered_set and defining hash and equal_to functors for the + // SiblingMC: + using SibSet = std::set; + using SibpSet = std::unordered_set; + using MTask2Sibs = std::unordered_map; + // New CP information for mtaskp reflecting an upcoming merge struct NewCp { uint32_t cp; @@ -1250,17 +1082,17 @@ private: uint32_t m_scoreLimitBeforeRescore = 0xffffffff; // Next score rescore at unsigned m_mergesSinceRescore = 0; // Merges since last rescore const bool m_slowAsserts; // Take extra time to validate algorithm - MergeCandidateScoreboard m_sb; // Scoreboard - - PartPropagateCp m_forwardPropagator{m_slowAsserts}; // Forward propagator - PartPropagateCp m_reversePropagator{m_slowAsserts}; // Reverse propagator + V3Scoreboard m_sb; // Scoreboard + SibSet m_pairs; // Storage for each SiblingMC + MTask2Sibs m_mtask2sibs; // SiblingMC set for each mtask public: // CONSTRUCTORS PartContraction(V3Graph* mtasksp, uint32_t scoreLimit, bool slowAsserts) : m_mtasksp{mtasksp} , m_scoreLimit{scoreLimit} - , m_slowAsserts{slowAsserts} {} + , m_slowAsserts{slowAsserts} + , m_sb{&mergeCandidateScore, slowAsserts} {} // METHODS void go() { @@ -1284,18 +1116,17 @@ public: // - Incrementally recompute critical paths near the merged mtask. 
for (V3GraphVertex* itp = m_mtasksp->verticesBeginp(); itp; itp = itp->verticesNextp()) { - itp->userp(nullptr); // Reset user value. Used by PartPropagateCp. std::unordered_set neighbors; for (V3GraphEdge* edgep = itp->outBeginp(); edgep; edgep = edgep->outNextp()) { - m_sb.add(static_cast(edgep)); + m_sb.addElem(MTaskEdge::cast(edgep)); if (m_slowAsserts) { UASSERT_OBJ(neighbors.find(edgep->top()) == neighbors.end(), itp, "Redundant edge found in input to PartContraction()"); } neighbors.insert(edgep->top()); } - siblingPairFromRelatives(itp); - siblingPairFromRelatives(itp); + siblingPairFromRelatives(GraphWay::REVERSE, itp, true); + siblingPairFromRelatives(GraphWay::FORWARD, itp, true); } doRescore(); // Set initial scores in scoreboard @@ -1303,7 +1134,7 @@ public: while (true) { // This is the best edge to merge, with the lowest // score (shortest local critical path) - MergeCandidate* const mergeCanp = m_sb.best(); + MergeCandidate* const mergeCanp = const_cast(m_sb.bestp()); if (!mergeCanp) { // Scoreboard found no eligible merges. Maybe a rescore // will produce some merge-able pairs? @@ -1318,9 +1149,8 @@ public: UASSERT(!m_sb.needsRescore(mergeCanp), "Need-rescore items should not be returned by bestp"); } - const uint32_t cachedScore = mergeCanp->score(); - mergeCanp->rescore(); - const uint32_t actualScore = mergeCanp->score(); + const uint32_t cachedScore = m_sb.cachedScore(mergeCanp); + const uint32_t actualScore = mergeCandidateScore(mergeCanp); if (actualScore > cachedScore) { // Cached score is out-of-date. @@ -1381,11 +1211,8 @@ public: if (mergeCanp->mergeWouldCreateCycle()) { // Remove this edge from scoreboard so we don't keep // reconsidering it on every loop. 
- m_sb.remove(mergeCanp); - if (SiblingMC* const smcp = mergeCanp->toSiblingMC()) { - smcp->bp()->farSibs().erase(smcp); - smcp->ap()->ownSibs().erase(*smcp); // Kills *smcp, so do last - } + m_sb.removeElem(mergeCanp); + mergeCanp->removedFromSb(true); continue; } @@ -1447,29 +1274,31 @@ private: } void removeSiblingMCsWith(LogicMTask* mtaskp) { - for (const SiblingMC& pair : mtaskp->ownSibs()) { - m_sb.remove(const_cast(&pair)); - // Owner is always ap(), remove from the opposite side - pair.bp()->farSibs().erase(&pair); + for (SibpSet::iterator it = m_mtask2sibs[mtaskp].begin(); it != m_mtask2sibs[mtaskp].end(); + ++it) { + const SiblingMC* const pairp = *it; + if (!pairp->removedFromSb()) m_sb.removeElem(pairp); + const LogicMTask* const otherp = (pairp->bp() == mtaskp) ? pairp->ap() : pairp->bp(); + size_t erased = m_mtask2sibs[otherp].erase(pairp); + UASSERT_OBJ(erased > 0, otherp, "Expected existing mtask"); + erased = m_pairs.erase(*pairp); + UASSERT_OBJ(erased > 0, mtaskp, "Expected existing mtask"); } - for (const SiblingMC* const pairp : mtaskp->farSibs()) { - m_sb.remove(const_cast(pairp)); - // Owner is always ap(), remove from the opposite side - pairp->ap()->ownSibs().erase(*pairp); - } - mtaskp->ownSibs().clear(); - mtaskp->farSibs().clear(); + const size_t erased = m_mtask2sibs.erase(mtaskp); + UASSERT_OBJ(erased > 0, mtaskp, "Expected existing mtask"); } void contract(MergeCandidate* mergeCanp) { LogicMTask* top = nullptr; LogicMTask* fromp = nullptr; MTaskEdge* mergeEdgep = mergeCanp->toMTaskEdge(); + const SiblingMC* mergeSibsp = nullptr; if (mergeEdgep) { top = static_cast(mergeEdgep->top()); fromp = static_cast(mergeEdgep->fromp()); } else { - const SiblingMC* mergeSibsp = static_cast(mergeCanp); + mergeSibsp = mergeCanp->toSiblingMC(); + UASSERT(mergeSibsp, "Failed to cast mergeCanp to either MTaskEdge or SiblingMC"); top = mergeSibsp->ap(); fromp = mergeSibsp->bp(); } @@ -1508,10 +1337,7 @@ private: if (mergeEdgep) { // Remove and free the 
connecting edge. Must do this before // propagating CP's below. - m_sb.remove(mergeCanp); - mergeEdgep->fromMTaskp()->removeRelativeMTask(mergeEdgep->toMTaskp()); - mergeEdgep->fromMTaskp()->removeRelativeEdge(mergeEdgep); - mergeEdgep->toMTaskp()->removeRelativeEdge(mergeEdgep); + m_sb.removeElem(mergeCanp); VL_DO_CLEAR(mergeEdgep->unlinkDelete(), mergeEdgep = nullptr); } @@ -1527,22 +1353,25 @@ private: << (donorNewCpFwd.propagate ? " true " : " false ") << donorNewCpFwd.propagateCp << endl); + PartPropagateCp forwardPropagator(m_mtasksp, GraphWay::FORWARD, m_slowAsserts); + PartPropagateCp reversePropagator(m_mtasksp, GraphWay::REVERSE, m_slowAsserts); + recipientp->setCritPathCost(GraphWay::FORWARD, recipientNewCpFwd.cp); if (recipientNewCpFwd.propagate) { - m_forwardPropagator.cpHasIncreased(recipientp, recipientNewCpFwd.propagateCp); + forwardPropagator.cpHasIncreased(recipientp, recipientNewCpFwd.propagateCp); } recipientp->setCritPathCost(GraphWay::REVERSE, recipientNewCpRev.cp); if (recipientNewCpRev.propagate) { - m_reversePropagator.cpHasIncreased(recipientp, recipientNewCpRev.propagateCp); + reversePropagator.cpHasIncreased(recipientp, recipientNewCpRev.propagateCp); } if (donorNewCpFwd.propagate) { - m_forwardPropagator.cpHasIncreased(donorp, donorNewCpFwd.propagateCp); + forwardPropagator.cpHasIncreased(donorp, donorNewCpFwd.propagateCp); } if (donorNewCpRev.propagate) { - m_reversePropagator.cpHasIncreased(donorp, donorNewCpRev.propagateCp); + reversePropagator.cpHasIncreased(donorp, donorNewCpRev.propagateCp); } - m_forwardPropagator.go(); - m_reversePropagator.go(); + forwardPropagator.go(); + reversePropagator.go(); // Remove all SiblingMCs that include donorp. This Includes the one // we're merging, if we're merging a SiblingMC. @@ -1552,8 +1381,11 @@ private: // to a bounded number. 
removeSiblingMCsWith(recipientp); - // Redirect all edges, delete donorp - partRedirectEdgesFrom(m_mtasksp, recipientp, donorp, &m_sb); + // Redirect all edges + partRedirectEdgesFrom(recipientp, donorp, &m_sb); + + // Delete the donorp mtask from the graph + VL_DO_CLEAR(donorp->unlinkDelete(m_mtasksp), donorp = nullptr); ++m_mergesSinceRescore; @@ -1566,21 +1398,21 @@ private: // - prereqs of recipientp's postreqs // - postreqs of recipientp's prereqs // Note that this depends on the updated critical paths (above). - siblingPairFromRelatives(recipientp); - siblingPairFromRelatives(recipientp); + siblingPairFromRelatives(GraphWay::REVERSE, recipientp, true); + siblingPairFromRelatives(GraphWay::FORWARD, recipientp, true); unsigned edges = 0; for (V3GraphEdge* edgep = recipientp->outBeginp(); edgep; edgep = edgep->outNextp()) { LogicMTask* const postreqp = static_cast(edgep->top()); - siblingPairFromRelatives(postreqp); + siblingPairFromRelatives(GraphWay::REVERSE, postreqp, false); ++edges; - if (edges >= PART_SIBLING_EDGE_LIMIT) break; + if (edges > PART_SIBLING_EDGE_LIMIT) break; } edges = 0; for (V3GraphEdge* edgep = recipientp->inBeginp(); edgep; edgep = edgep->inNextp()) { LogicMTask* const prereqp = static_cast(edgep->fromp()); - siblingPairFromRelatives(prereqp); + siblingPairFromRelatives(GraphWay::FORWARD, prereqp, false); ++edges; - if (edges >= PART_SIBLING_EDGE_LIMIT) break; + if (edges > PART_SIBLING_EDGE_LIMIT) break; } } @@ -1597,86 +1429,111 @@ private: m_scoreLimitBeforeRescore = 0xffffffff; } + static uint32_t mergeCandidateScore(const MergeCandidate* pairp) { + if (const MTaskEdge* const edgep = pairp->toMTaskEdge()) { + // The '1 +' favors merging a SiblingMC over an otherwise- + // equal-scoring MTaskEdge. The comment on selfTest() talks + // about why. 
+ return 1 + edgeScore(edgep); + } else { + return siblingScore(pairp->toSiblingMC()); + } + v3fatalSrc("Failed to cast pairp to either MTaskEdge or SiblingMC in mergeCandidateScore"); + return 0; + } + + VL_ATTR_NOINLINE + static uint32_t siblingScore(const SiblingMC* sibsp) { + const LogicMTask* const ap = sibsp->ap(); + const LogicMTask* const bp = sibsp->bp(); + const uint32_t mergedCpCostFwd + = std::max(ap->critPathCost(GraphWay::FORWARD), bp->critPathCost(GraphWay::FORWARD)); + const uint32_t mergedCpCostRev + = std::max(ap->critPathCost(GraphWay::REVERSE), bp->critPathCost(GraphWay::REVERSE)); + return mergedCpCostRev + mergedCpCostFwd + LogicMTask::stepCost(ap->cost() + bp->cost()); + } + + VL_ATTR_NOINLINE + static uint32_t edgeScore(const V3GraphEdge* edgep) { + // Score this edge. Lower is better. The score is the new local CP + // length if we merge these mtasks. ("Local" means the longest + // critical path running through the merged node.) + const LogicMTask* const top = static_cast(edgep->top()); + const LogicMTask* const fromp = static_cast(edgep->fromp()); + const uint32_t mergedCpCostFwd + = std::max(fromp->critPathCost(GraphWay::FORWARD), + top->critPathCostWithout(GraphWay::FORWARD, edgep)); + const uint32_t mergedCpCostRev + = std::max(fromp->critPathCostWithout(GraphWay::REVERSE, edgep), + top->critPathCost(GraphWay::REVERSE)); + return mergedCpCostRev + mergedCpCostFwd + + LogicMTask::stepCost(fromp->cost() + top->cost()); + } + void makeSiblingMC(LogicMTask* ap, LogicMTask* bp) { - if (ap->id() < bp->id()) std::swap(ap, bp); - // The higher id vertex owns the storage - const auto emplaceResult = ap->ownSibs().emplace(ap, bp); - if (emplaceResult.second) { - SiblingMC* const newSibsp = const_cast(&(*emplaceResult.first)); - bp->farSibs().insert(newSibsp); - m_sb.add(newSibsp); + const SiblingMC newSibs(ap, bp); + const std::pair insertResult = m_pairs.insert(newSibs); + if (insertResult.second) { + const SiblingMC* const newSibsp = 
&(*insertResult.first); + m_mtask2sibs[ap].insert(newSibsp); + m_mtask2sibs[bp].insert(newSibsp); + m_sb.addElem(newSibsp); } else if (m_slowAsserts) { // It's fine if we already have this SiblingMC, we may have // created it earlier. Just confirm that we have associated data. + UASSERT_OBJ(m_mtask2sibs.find(ap) != m_mtask2sibs.end(), ap, "Sibling not found"); + UASSERT_OBJ(m_mtask2sibs.find(bp) != m_mtask2sibs.end(), bp, "Sibling not found"); bool found = false; - for (const SiblingMC& sibs : ap->ownSibs()) { - UASSERT_OBJ(sibs.ap() == ap, ap, "Inconsistent SiblingMC"); - UASSERT_OBJ(m_sb.contains(&sibs), ap, "Must be on the scoreboard"); - if (sibs.bp() == bp) found = true; + for (SibpSet::iterator it = m_mtask2sibs[ap].begin(); it != m_mtask2sibs[ap].end(); + ++it) { + const SiblingMC* const sibsp = *it; + UASSERT_OBJ(!(!sibsp->removedFromSb() && !m_sb.contains(sibsp)), ap, + "One sibling must be the one we collided with"); + if ((sibsp->ap() == ap && sibsp->bp() == bp) + || (sibsp->bp() == ap && sibsp->ap() == bp)) + found = true; } UASSERT_OBJ(found, ap, "Sibling not found"); } } - template - VL_ATTR_NOINLINE void siblingPairFromRelatives(V3GraphVertex* mtaskp) { - constexpr GraphWay way{Way}; - // Need at least 2 edges - if (!mtaskp->beginp(way) || !mtaskp->beginp(way)->nextp(way)) return; + void siblingPairFromRelatives(GraphWay way, V3GraphVertex* mtaskp, bool exhaustive) { + std::vector shortestPrereqs; - std::array neighbours; - - // This is a hot method, so we want so sort as efficiently as possible. We pre-load - // all data (critical path cost and id) required for determining ordering into an aligned - // structure. There is not enough space next to these to keep a whole pointer within 16 - // bytes, so we store an index into the neighbours buffer instead. We can then compare - // and swap these sorting records very efficiently. 
With this the standard library sorting - // functions are efficient enough and using more optimized methods (e.g.: sorting networks) - // has no measurable benefit. - struct alignas(16) SortingRecord { - uint64_t m_id; - uint32_t m_cp; - uint8_t m_idx; - static_assert(PART_SIBLING_EDGE_LIMIT <= std::numeric_limits::max(), - "m_idx must fit all indices into 'neighbours'"); - bool operator<(const SortingRecord& that) const { - return m_cp < that.m_cp || (m_cp == that.m_cp && m_id < that.m_id); - } - }; - static_assert(sizeof(SortingRecord) <= 16, "How could this be padded to more than 16?"); - - std::array sortRecs; - size_t n = 0; - - // Populate the buffers - for (V3GraphEdge *edgep = mtaskp->beginp(way), *nextp; edgep; edgep = nextp) { - nextp = edgep->nextp(way); // Fetch next first as likely cache miss - LogicMTask* const otherp = static_cast(edgep->furtherp(way)); - neighbours[n] = otherp; - sortRecs[n].m_id = otherp->id(); - sortRecs[n].m_cp = otherp->critPathCost(way) + otherp->cost(); - sortRecs[n].m_idx = n; - ++n; - // Prevent nodes with huge numbers of edges from massively slowing down us down - if (n >= PART_SIBLING_EDGE_LIMIT) break; + for (V3GraphEdge* edgep = mtaskp->beginp(way); edgep; edgep = edgep->nextp(way)) { + LogicMTask* const prereqp = static_cast(edgep->furtherp(way)); + shortestPrereqs.push_back(prereqp); + // Prevent nodes with huge numbers of edges from massively + // slowing down the partitioner: + if (shortestPrereqs.size() > PART_SIBLING_EDGE_LIMIT) break; } - // Don't make all possible pairs of siblings when not requested (non-exhaustive). + if (shortestPrereqs.size() <= 1) return; + + const auto cmp = [way](const LogicMTask* ap, const LogicMTask* bp) { + const uint32_t aCp = ap->critPathCost(way) + ap->cost(); + const uint32_t bCp = bp->critPathCost(way) + bp->cost(); + if (aCp != bCp) return aCp < bCp; + return ap->id() < bp->id(); + }; + + // Don't make all possible pairs of prereqs when not requested (non-exhaustive). 
// Just make a few pairs. constexpr size_t MAX_NONEXHAUSTIVE_PAIRS = 3; - if (Exhaustive || n <= 2 * MAX_NONEXHAUSTIVE_PAIRS) { - const size_t end = n & ~static_cast(1); // Round down to even, (we want pairs) - std::sort(sortRecs.begin(), sortRecs.begin() + n); - for (size_t i = 0; i < end; i += 2) { - makeSiblingMC(neighbours[sortRecs[i].m_idx], neighbours[sortRecs[i + 1].m_idx]); - } + size_t end; // End index of pairs to add to candidates (exclusive) + + if (exhaustive || (shortestPrereqs.size() <= 2 * MAX_NONEXHAUSTIVE_PAIRS)) { + end = shortestPrereqs.size() & ~static_cast(1); // Round down to even + std::sort(shortestPrereqs.begin(), shortestPrereqs.end(), cmp); } else { - constexpr size_t end = 2 * MAX_NONEXHAUSTIVE_PAIRS; - std::partial_sort(sortRecs.begin(), sortRecs.begin() + end, sortRecs.begin() + n); - for (size_t i = 0; i < end; i += 2) { - makeSiblingMC(neighbours[sortRecs[i].m_idx], neighbours[sortRecs[i + 1].m_idx]); - } + end = 2 * MAX_NONEXHAUSTIVE_PAIRS; + std::partial_sort(shortestPrereqs.begin(), shortestPrereqs.begin() + end, + shortestPrereqs.end(), cmp); + } + + for (size_t i = 0; i < end; i += 2) { + makeSiblingMC(shortestPrereqs[i], shortestPrereqs[i + 1]); } } @@ -1993,15 +1850,17 @@ private: } // Move all vertices from donorp to mergedp mergedp->moveAllVerticesFrom(donorp); - // Redirect edges from donorp to recipientp, delete donorp - partRedirectEdgesFrom(m_mtasksp, mergedp, donorp, nullptr); + // Redirect edges from donorp to recipientp + partRedirectEdgesFrom(mergedp, donorp, nullptr); + // Remove donorp from the graph + VL_DO_DANGLING(donorp->unlinkDelete(m_mtasksp), donorp); ++m_mergesDone; } if (lastMergedp) { UASSERT_OBJ(lastMergedp->rank() < mergedp->rank(), mergedp, "Merging must be on lower rank"); - if (!lastMergedp->hasRelativeMTask(mergedp)) { + if (!lastMergedp->hasRelative(GraphWay::FORWARD, mergedp)) { new MTaskEdge(m_mtasksp, lastMergedp, mergedp, 1); } } @@ -2647,8 +2506,9 @@ void V3Partition::setupMTaskDeps(V3Graph* 
mtasksp, const Vx2MTaskMap* vx2mtaskp) UASSERT_OBJ(otherMTaskp != mtaskp, mtaskp, "Would create a cycle edge"); // Don't create redundant edges. - if (mtaskp->hasRelativeMTask(otherMTaskp)) continue; - + if (mtaskp->hasRelative(GraphWay::FORWARD, otherMTaskp)) { // + continue; + } new MTaskEdge(mtasksp, mtaskp, otherMTaskp, 1); } } diff --git a/src/V3Scoreboard.cpp b/src/V3Scoreboard.cpp index d21422a81..78d466596 100644 --- a/src/V3Scoreboard.cpp +++ b/src/V3Scoreboard.cpp @@ -19,42 +19,26 @@ #include "V3Scoreboard.h" -class ScoreboardTestElem; - -struct Key { - // Node: Structure layout chosen to minimize padding in PairingHeao<*>::Node - uint64_t m_id; // Unique ID part of edge score - uint32_t m_score; // Score part of ID - bool operator<(const Key& other) const { - // First by Score then by ID, but notice that we want minimums using a max-heap, so reverse - return m_score > other.m_score || (m_score == other.m_score && m_id > other.m_id); - } -}; - -using Scoreboard = V3Scoreboard; - -class ScoreboardTestElem final : public Scoreboard::Node { +class ScoreboardTestElem final { public: - uint32_t m_newScore; + // MEMBERS + uint32_t m_score; + uint32_t m_id; // CONSTRUCTORS explicit ScoreboardTestElem(uint32_t score) - : m_newScore{score} { - m_key.m_score = m_newScore; + : m_score{score} { static uint32_t s_serial = 0; - m_key.m_id = ++s_serial; + m_id = ++s_serial; } ScoreboardTestElem() = default; + // METHODS + static uint32_t scoreFn(const ScoreboardTestElem* elp) { return elp->m_score; } - uint64_t id() const { return m_key.m_id; } - void rescore() { m_key.m_score = m_newScore; } - uint32_t score() const { return m_key.m_score; } - static ScoreboardTestElem* heapNodeToElem(Scoreboard::Node* nodep) { - return static_cast(nodep); - } + bool operator<(const ScoreboardTestElem& other) const { return m_id < other.m_id; } }; void V3ScoreboardBase::selfTest() { - Scoreboard sb; + V3Scoreboard sb(ScoreboardTestElem::scoreFn, true); UASSERT(!sb.needsRescore(), 
"SelfTest: Empty sb should not need rescore."); @@ -62,13 +46,13 @@ void V3ScoreboardBase::selfTest() { ScoreboardTestElem e2(20); ScoreboardTestElem e3(30); - sb.add(&e1); - sb.add(&e2); - sb.add(&e3); + sb.addElem(&e1); + sb.addElem(&e2); + sb.addElem(&e3); UASSERT(sb.needsRescore(), "SelfTest: Newly filled sb should need a rescore."); UASSERT(sb.needsRescore(&e1), "SelfTest: Individual newly-added element should need rescore"); - UASSERT(nullptr == sb.best(), + UASSERT(nullptr == sb.bestp(), "SelfTest: Newly filled sb should have nothing eligible for Bestp()"); sb.rescore(); @@ -76,22 +60,24 @@ void V3ScoreboardBase::selfTest() { UASSERT(!sb.needsRescore(), "SelfTest: Newly rescored sb should not need rescore"); UASSERT(!sb.needsRescore(&e1), "SelfTest: Newly rescored sb should not need an element rescored"); - UASSERT(&e1 == sb.best(), "SelfTest: Should return element with lowest (best) score"); + UASSERT(e2.m_score == sb.cachedScore(&e2), + "SelfTest: Cached score should match current score"); + UASSERT(&e1 == sb.bestp(), "SelfTest: Should return element with lowest (best) score"); // Change one element's score sb.hintScoreChanged(&e2); - e2.m_newScore = 21; + e2.m_score = 21; UASSERT(sb.needsRescore(&e2), "SelfTest: Should need rescore on elem after hintScoreChanged"); // Remove an element UASSERT(sb.contains(&e1), "SelfTest: e1 should be there"); - sb.remove(&e1); + sb.removeElem(&e1); UASSERT(!sb.contains(&e1), "SelfTest: e1 should be gone"); UASSERT(sb.contains(&e2), "SelfTest: e2 should be there, despite needing rescore"); // Now e3 should be our best-scoring element, even though // e2 has a better score, since e2 is pending rescore. 
- UASSERT(&e3 == sb.best(), "SelfTest: Expect e3 as best element with known score."); + UASSERT(&e3 == sb.bestp(), "SelfTest: Expect e3 as best element with known score."); sb.rescore(); - UASSERT(&e2 == sb.best(), "SelfTest: Expect e2 as best element again after Rescore"); + UASSERT(&e2 == sb.bestp(), "SelfTest: Expect e2 as best element again after Rescore"); } diff --git a/src/V3Scoreboard.h b/src/V3Scoreboard.h index 4bf915431..dc5fce0b0 100644 --- a/src/V3Scoreboard.h +++ b/src/V3Scoreboard.h @@ -1,6 +1,13 @@ // -*- mode: C++; c-file-style: "cc-mode" -*- //************************************************************************* -// DESCRIPTION: Verilator: Scoreboard for mtask coarsening +// DESCRIPTION: Verilator: Scoreboards for thread partitioner +// +// Provides scoreboard classes: +// +// * SortByValueMap +// * V3Scoreboard +// +// See details below // // Code available from: https://verilator.org // @@ -21,122 +28,248 @@ #include "verilatedos.h" #include "V3Error.h" -#include "V3PairingHeap.h" -//=============================================================================================== -// V3Scoreboard is essentially a heap that can be hinted that some elements have changed keys, at -// which points those elements will be deferred as 'unknown' until the next 'rescore' call. We -// largely reuse the implementation of the slightly more generic PairingHeap, but we do rely on the -// internal structure of the PairingHeap so changing that class requires changing this. -// -// For efficiency, the elements themselves must be the heap nodes, by deriving them from -// V3Scoreboard::Node. This also means a single element can only be associated with -// a single scoreboard. +#include +#include +#include +#include + +// ###################################################################### +// SortByValueMap + +// A generic key-value map, except iteration is in *value* sorted order. Values need not be unique. 
+// Uses T_KeyCompare to break ties in the sort when values collide. Note: Only const iteration is +// possible, as updating mapped values via iterators is not safe. + +template > +class SortByValueMap final { + // Current implementation is a std::set of key/value pairs, plus a std::unordered_map from keys + // to iterators into the set. This keeps most operations fairly cheap and also has the benefit + // of being able to re-use the std::set iterators. -template -class V3Scoreboard final { // TYPES - using Heap = PairingHeap; + + using Pair = std::pair; + + struct PairCmp final { + bool operator()(const Pair& a, const Pair& b) const { + // First compare values + if (a.second != b.second) return a.second < b.second; + // Then compare keys + return T_KeyCompare{}(a.first, b.first); + } + }; + + using PairSet = std::set; public: - using Node = typename Heap::Node; + using const_iterator = typename PairSet::const_iterator; + using const_reverse_iterator = typename PairSet::const_reverse_iterator; private: - using Link = typename Heap::Link; - - // Note: T_Elem is incomplete here, so we cannot assert 'std::is_base_of::value' - // MEMBERS - Heap m_known; // The heap of entries with known scores - Link m_unknown; // List of entries with unknown scores + PairSet m_pairs; // The contents of the map, stored directly as key-value pairs + std::unordered_map m_kiMap; // Key to iterator map + + VL_UNCOPYABLE(SortByValueMap); public: // CONSTRUCTORS - explicit V3Scoreboard() = default; - ~V3Scoreboard() = default; + SortByValueMap() = default; -private: - VL_UNCOPYABLE(V3Scoreboard); + // Only const iteration is possible + const_iterator begin() const { return m_pairs.begin(); } + const_iterator end() const { return m_pairs.end(); } + const_iterator cbegin() const { return m_pairs.cbegin(); } + const_iterator cend() const { return m_pairs.cend(); } + const_reverse_iterator rbegin() const { return m_pairs.rbegin(); } + const_reverse_iterator rend() const { return m_pairs.rend(); } +
const_reverse_iterator crbegin() const { return m_pairs.crbegin(); } + const_reverse_iterator crend() const { return m_pairs.crend(); } - // METHODSs - void addUnknown(T_Elem* nodep) { - // Just prepend it to the list of unknown entries - nodep->m_next.link(m_unknown.unlink()); - m_unknown.linkNonNull(nodep); - // We mark nodes on the unknown list by making their child pointer point to themselves - nodep->m_kids.m_ptr = nodep; + const_iterator find(const T_Key& key) const { + const auto kiIt = m_kiMap.find(key); + if (kiIt == m_kiMap.end()) return cend(); + return kiIt->second; } - -public: - // Returns true if the element is present in the scoreboard, false otherwise. Every other - // method that takes a T_Elem* (except for 'add') has undefined behavior if the element is not - // in this scoreboard. Furthermore, this method is only valid if the element can only possibly - // be in this scoreboard. That is: if the element might be in another scoreboard, the behaviour - // of this method is undefined. - static bool contains(const T_Elem* nodep) { return nodep->m_ownerpp; } - - // Add an element to the scoreboard. This will not be returned before the next 'rescore' call. - void add(T_Elem* nodep) { + size_t erase(const T_Key& key) { + const auto kiIt = m_kiMap.find(key); + if (kiIt == m_kiMap.end()) return 0; + m_pairs.erase(kiIt->second); + m_kiMap.erase(kiIt); + return 1; + } + void erase(const_iterator it) { + m_kiMap.erase(it->first); + m_pairs.erase(it); + } + void erase(const_reverse_iterator rit) { + m_kiMap.erase(rit->first); + m_pairs.erase(std::next(rit).base()); + } + bool has(const T_Key& key) const { return m_kiMap.count(key); } + bool empty() const { return m_pairs.empty(); } + // Returns const reference. + const T_Value& at(const T_Key& key) const { return m_kiMap.at(key)->second; } + // Note this returns const_iterator + template + std::pair emplace(const T_Key& key, Args&&... 
args) { + const auto kiEmp = m_kiMap.emplace(key, end()); + if (kiEmp.second) { + const auto result = m_pairs.emplace(key, std::forward(args)...); #if VL_DEBUG - UASSERT(!contains(nodep), "Adding element to scoreboard that was already in a scoreboard"); + UASSERT(result.second, "Should not be in set yet"); #endif - addUnknown(nodep); - } - - // Remove element from scoreboard. - void remove(T_Elem* nodep) { - if (nodep->m_kids.m_ptr == nodep) { - // Node is on the unknown list, replace with next - nodep->replaceWith(nodep->m_next.unlink()); - return; + kiEmp.first->second = result.first; + return result; } - // Node is in the known heap, remove it - m_known.remove(nodep); + return {kiEmp.first->second, false}; } - - // Get the known element with the highest score (as we are using a max-heap), or nullptr if - // there are no elements with known entries. This does not automatically 'rescore'. The client - // must call 'rescore' appropriately to ensure all elements in the scoreboard are reflected in - // the result of this method. - T_Elem* best() const { return T_Elem::heapNodeToElem(m_known.max()); } - - // Tell the scoreboard that this element's score may have changed. At the time of this call, - // the element's score becomes 'unknown' to the scoreboard. Unknown elements will not be - // returned by 'best until the next call to 'rescore'. - void hintScoreChanged(T_Elem* nodep) { - // If it's already in the unknown list, then nothing to do - if (nodep->m_kids.m_ptr == nodep) return; - // Otherwise it was in the heap, remove it - m_known.remove(nodep); - // Prepend it to the unknown list - addUnknown(nodep); - } - - // True if we have elements with unknown score - bool needsRescore() const { return m_unknown; } - - // True if the element's score is unknown, false otherwise. 
- static bool needsRescore(const T_Elem* nodep) { return nodep->m_kids.m_ptr == nodep; } - - // For each element whose score is unknown, recompute the score and add to the known heap - void rescore() { - // Rescore and insert all unknown elements - for (Node *nodep = m_unknown.unlink(), *nextp; nodep; nodep = nextp) { - // Pick up next - nextp = nodep->m_next.ptr(); - // Reset pointers - nodep->m_next.m_ptr = nullptr; - nodep->m_kids.m_ptr = nullptr; - nodep->m_ownerpp = nullptr; - // Re-compute the score of the element - T_Elem::heapNodeToElem(nodep)->rescore(); - // re-insert into the heap - m_known.insert(nodep); - } + // Invalidates iterators + void update(const_iterator it, T_Value value) { + const auto kiIt = m_kiMap.find(it->first); + m_pairs.erase(it); + kiIt->second = m_pairs.emplace(kiIt->first, value).first; } }; -// ###################################################################### +//###################################################################### + +/// V3Scoreboard takes a set of Elem*'s, each having some score. +/// Scores are assigned by a user-supplied scoring function. +/// +/// At any time, the V3Scoreboard can return the elem with the "best" score +/// among those elements whose scores are known. +/// +/// The best score is the _lowest_ score. This makes sense in contexts +/// where scores represent costs. +/// +/// The Scoreboard supports mutating element scores efficiently. The client +/// must hint to the V3Scoreboard when an element's score may have +/// changed. When it receives this hint, the V3Scoreboard will move the +/// element into the set of elements whose scores are unknown. Later the +/// client can tell V3Scoreboard to re-sort the list, which it does +/// incrementally, by re-scoring all elements whose scores are unknown, and +/// then moving these back into the score-sorted map. This is efficient +/// when the subset of elements whose scores change is much smaller than +/// the full set size.
+ +template > +class V3Scoreboard final { +private: + // TYPES + class CmpElems final { + public: + bool operator()(const T_Elem* const& ap, const T_Elem* const& bp) const { + const T_ElemCompare cmp; + return cmp.operator()(*ap, *bp); + } + }; + using SortedMap = SortByValueMap; + using UserScoreFnp = T_Score (*)(const T_Elem*); + + // MEMBERS + // Below uses set<> not an unordered_set<>. unordered_set::clear() and + // construction results in a 491KB clear operation to zero all the + // buckets. Since the set size is generally small, and we iterate the + // set members, set is better performant. + std::set m_unknown; // Elements with unknown scores + SortedMap m_sorted; // Set of elements with known scores + const UserScoreFnp m_scoreFnp; // Scoring function + const bool m_slowAsserts; // Do some asserts that require extra lookups + +public: + // CONSTRUCTORS + explicit V3Scoreboard(UserScoreFnp scoreFnp, bool slowAsserts) + : m_scoreFnp{scoreFnp} + , m_slowAsserts{slowAsserts} {} + ~V3Scoreboard() = default; + + // METHODS + + // Add an element to the scoreboard. + // Element begins in needs-rescore state; it won't be returned by + // bestp() until after the next rescore(). + void addElem(const T_Elem* elp) { + if (m_slowAsserts) { + UASSERT(!contains(elp), "Adding element to scoreboard that was already in scoreboard"); + } + m_unknown.insert(elp); + } + + // Remove elp from scoreboard. + void removeElem(const T_Elem* elp) { + if (0 == m_sorted.erase(elp)) { + UASSERT(m_unknown.erase(elp), + "Could not find requested elem to remove from scoreboard"); + } + } + + // Returns true if elp is present in the scoreboard, false otherwise. + // + // Note: every other V3Scoreboard routine that takes an T_Elem* has + // undefined behavior if the element is not in the scoreboard. 
+ bool contains(const T_Elem* elp) const { + if (m_unknown.find(elp) != m_unknown.end()) return true; + return (m_sorted.find(elp) != m_sorted.end()); + } + + // Get the best element, with the lowest score (lower is better), among + // elements whose scores are known. Returns nullptr if no elements with + // known scores exist. + // + // Note: This does not automatically rescore. Client must call + // rescore() periodically to ensure all elems in the scoreboard are + // reflected in the result of bestp(). Otherwise, bestp() only + // considers elements that aren't pending rescore. + const T_Elem* bestp() { + const auto it = m_sorted.begin(); + if (VL_UNLIKELY(it == m_sorted.end())) return nullptr; + return it->first; + } + + // Tell the scoreboard that this element's score may have changed. + // + // At the time of this call, the element's score becomes "unknown" + // to the V3Scoreboard. Unknown elements won't be returned by bestp(). + // The element's score will remain unknown until the next rescore(). + // + // The client MUST call this for each element whose score has changed. + // + // The client MAY call this for elements whose score has not changed. + // Doing so incurs some compute cost (to re-sort the element back to + // its original location) and still makes it ineligible to be returned + // by bestp() until the next rescore(). + void hintScoreChanged(const T_Elem* elp) { + m_unknown.insert(elp); + m_sorted.erase(elp); + } + + // True if any element's score is unknown to V3Scoreboard. + bool needsRescore() { return !m_unknown.empty(); } + // False if elp's score is known to V3Scoreboard, + // else true if elp's score is unknown until the next rescore(). + bool needsRescore(const T_Elem* elp) { return m_unknown.count(elp); } + // Retrieve the last known score for an element. 
+ T_Score cachedScore(const T_Elem* elp) { return m_sorted.at(elp); } + // For each element whose score is unknown to V3Scoreboard, + // call the client's scoring function to get a new score, + // and sort all elements by their current score. + void rescore() { + for (const T_Elem* elp : m_unknown) { + VL_ATTR_UNUSED const bool exists = !m_sorted.emplace(elp, m_scoreFnp(elp)).second; +#if VL_DEBUG + UASSERT(!exists, "Should not be in both m_unknown and m_sorted"); +#endif + } + m_unknown.clear(); + } + +private: + VL_UNCOPYABLE(V3Scoreboard); +}; + +//###################################################################### namespace V3ScoreboardBase { void selfTest();