diff --git a/Changes b/Changes index 752f4c15c..ff4509bcc 100644 --- a/Changes +++ b/Changes @@ -21,6 +21,7 @@ Verilator 4.225 devel * Fix incorrect tristate logic (#3399) [shareefj, Vighnesh Iyer] * Fix segfault exporting non-existant package (#3535). * Fix case statement comparing string literal (#3544). [Gustav Svensk] +* Improve Verilation speed with --threads on large designs. [Geza Lore] Verilator 4.224 2022-06-19 diff --git a/include/verilatedos.h b/include/verilatedos.h index c89b4c6dc..12763f815 100644 --- a/include/verilatedos.h +++ b/include/verilatedos.h @@ -530,6 +530,13 @@ using ssize_t = uint32_t; ///< signed size_t; returned from read() #define VL_STRINGIFY(x) VL_STRINGIFY2(x) #define VL_STRINGIFY2(x) #x +//========================================================================= +// Offset of field in type + +// Address zero can cause compiler problems +#define VL_OFFSETOF(type, field) \ + (reinterpret_cast(&(reinterpret_cast(0x10000000)->field)) - 0x10000000) + //========================================================================= // Conversions diff --git a/src/V3Graph.h b/src/V3Graph.h index da096ab2f..a18fb5dfc 100644 --- a/src/V3Graph.h +++ b/src/V3Graph.h @@ -67,7 +67,7 @@ public: return names[m_e]; } // METHODS unique to this class - constexpr GraphWay invert() const { return m_e == FORWARD ? 
REVERSE : FORWARD; } + constexpr GraphWay invert() const { return GraphWay{m_e ^ 1}; } constexpr bool forward() const { return m_e == FORWARD; } constexpr bool reverse() const { return m_e != FORWARD; } }; diff --git a/src/V3PairingHeap.h b/src/V3PairingHeap.h new file mode 100644 index 000000000..9904225f3 --- /dev/null +++ b/src/V3PairingHeap.h @@ -0,0 +1,303 @@ +// -*- mode: C++; c-file-style: "cc-mode" -*- +//************************************************************************* +// DESCRIPTION: Verilator: Pairing Heap data structure +// +// Code available from: https://verilator.org +// +//************************************************************************* +// +// Copyright 2003-2022 by Wilson Snyder. This program is free software; you +// can redistribute it and/or modify it under the terms of either the GNU +// Lesser General Public License Version 3 or the Perl Artistic License +// Version 2.0. +// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 +// +//************************************************************************* + +#ifndef VERILATOR_V3PAIRINGHEAP_H_ +#define VERILATOR_V3PAIRINGHEAP_H_ + +#include "config_build.h" +#include "verilatedos.h" + +#include "V3Error.h" + +//============================================================================= +// Pairing heap (max-heap) with increase key and delete. +// +// While this is written as a generic data structure, its interface and +// implementation are finely tuned for its use by V3Partition, and are critical +// to Verilation performance, so be very careful changing anything or adding any +// new operations that would impact either memory usage, or performance of the +// existing operations. This data structure is fully deterministic, meaning +// the order in which elements with equal keys are retrieved only depends on +// the order of operations performed on the heap.
+//============================================================================= + +template +class PairingHeap final { +public: + struct Node; + + // Just a pointer to a heap Node, but with special accessors to help keep back pointers + // consistent. + struct Link { + Node* m_ptr = nullptr; // The managed pointer + + Link() = default; + VL_UNCOPYABLE(Link); + + // Make the pointer point to the target, and the target's owner pointer to this pointer + VL_ATTR_ALWINLINE void link(Node* targetp) { + m_ptr = targetp; + if (!targetp) return; +#if VL_DEBUG + UASSERT(!targetp->m_ownerpp, "Already linked"); +#endif + targetp->m_ownerpp = &m_ptr; + } + + // Make the pointer point to the target, and the target's owner pointer to this pointer + VL_ATTR_ALWINLINE void linkNonNull(Node* targetp) { + m_ptr = targetp; +#if VL_DEBUG + UASSERT(!targetp->m_ownerpp, "Already linked"); +#endif + targetp->m_ownerpp = &m_ptr; + } + + // Clear the pointer and return its previous value + VL_ATTR_ALWINLINE Node* unlink() { + Node* const result = m_ptr; +#if VL_DEBUG + if (result) { + UASSERT(m_ptr->m_ownerpp == &m_ptr, "Bad back link"); + // Not strictly necessary to clear this, but helps debugging + m_ptr->m_ownerpp = nullptr; + } +#endif + m_ptr = nullptr; + return result; + } + + // Minimal convenience accessors and operators + VL_ATTR_ALWINLINE Node* ptr() const { return m_ptr; } + VL_ATTR_ALWINLINE operator bool() const { return m_ptr; } + VL_ATTR_ALWINLINE bool operator!() const { return !m_ptr; } + VL_ATTR_ALWINLINE Node* operator->() const { return m_ptr; } + VL_ATTR_ALWINLINE Node& operator*() const { return *m_ptr; } + }; + + // A single node in the pairing heap tree + struct Node { + Link m_next; // Next in list of sibling heaps + Link m_kids; // Head of list of child heaps + Node** m_ownerpp = nullptr; // Pointer to the Link pointer pointing to this heap + T_Key m_key; // The key in the heap + + // CONSTRUCTOR + explicit Node() = default; + VL_UNCOPYABLE(Node); + + // METHODS
+ VL_ATTR_ALWINLINE const T_Key& key() const { return m_key; } + VL_ATTR_ALWINLINE bool operator<(const Node& that) const { return m_key < that.m_key; } + VL_ATTR_ALWINLINE bool operator>(const Node& that) const { return that.m_key < m_key; } + + // Make newp take the place of this in the tree + VL_ATTR_ALWINLINE void replaceWith(Node* newp) { + *m_ownerpp = newp; // The owner pointer needs to point to the new node + if (newp) newp->m_ownerpp = m_ownerpp; // The new node needs to point to its owner + m_ownerpp = nullptr; // This node has no owner anymore + } + + // Make newp take the place of this in the tree + VL_ATTR_ALWINLINE void replaceWithNonNull(Node* newp) { + *m_ownerpp = newp; // The owner pointer needs to point to the new node + newp->m_ownerpp = m_ownerpp; // The new node needs to point to its owner + m_ownerpp = nullptr; // This node has no owner anymore + } + + // Yank this node out of the heap it currently is in. This node can then be safely inserted + // into another heap. Note that this leaves the heap the node is currently under in an + // inconsistent state, so you cannot access it anymore. Still this can save a remove if we + // don't care about the state of the source heap. + VL_ATTR_ALWINLINE void yank() { + m_next.link(nullptr); + m_kids.link(nullptr); + m_ownerpp = nullptr; + } + }; + +private: + // MEMBERS + + // The root of the heap. Note: We do not reduce lists during insertion/removal etc, unless we + // absolutely have to. This means the root can become a list. This is ok, we will reduce + // lazily when requesting the maximum element. + mutable Link m_root; + + // CONSTRUCTORS + VL_UNCOPYABLE(PairingHeap); + +public: + explicit PairingHeap() = default; + + // METHODS + bool empty() const { return !m_root; } + + // Insert given node into this heap with given key.
+ void insert(Node* nodep, T_Key key) { + // Update key of node + nodep->m_key = key; + insert(nodep); + } + + // Insert given node into this heap with key already set in the node + void insert(Node* nodep) { +#if VL_DEBUG + UASSERT(!nodep->m_ownerpp && !nodep->m_next && !nodep->m_kids, "Already linked"); +#endif + // Just stick it at the front of the root list + nodep->m_next.link(m_root.unlink()); + m_root.linkNonNull(nodep); + } + + // Remove given node only from the heap it is contained in + void remove(Node* nodep) { + if (!nodep->m_next) { + // If the node does not have siblings, replace it with its children (might be empty). + nodep->replaceWith(nodep->m_kids.unlink()); + } else if (!nodep->m_kids) { + // If it has siblings but no children, replace it with the siblings. + nodep->replaceWithNonNull(nodep->m_next.unlink()); + } else { + // If it has both siblings and children, reduce the children and splice that + // reduced heap in place of this node + Node* const reducedKidsp = reduce(nodep->m_kids.unlink()); + reducedKidsp->m_next.linkNonNull(nodep->m_next.unlink()); + nodep->replaceWithNonNull(reducedKidsp); + } + } + + // Returns the largest element in the heap + Node* max() const { + // Heap might be empty + if (!m_root) return nullptr; + // If the root has siblings, reduce them + if (m_root->m_next) m_root.linkNonNull(reduce(m_root.unlink())); + // The root element is the largest + return m_root.ptr(); + } + + // Returns the second-largest element in the heap. + // This is only valid to call if 'max' returned a valid element.
+ Node* secondMax() const { +#if VL_DEBUG + UASSERT(m_root, "'max' would have returned nullptr"); + UASSERT(!m_root->m_next, "'max' would have reduced"); +#endif + // If there are no children, there is no second element + if (!m_root->m_kids) return nullptr; + // If there are multiple children, reduce them + if (m_root->m_kids->m_next) m_root->m_kids.linkNonNull(reduce(m_root->m_kids.unlink())); + // Return the now singular child, which is the second-largest element + return m_root->m_kids.ptr(); + } + + // Increase the key of the given node to the given new value + template + void increaseKey(Node* nodep, T_Update value) { + // Update the key + nodep->m_key.increase(value); + // Increasing the key of the root is easy + if (nodep == m_root.ptr()) return; + // Otherwise we do have a little work to do + if (!nodep->m_kids) { + // If the node has no children, replace it with its siblings (might be null) + nodep->replaceWith(nodep->m_next.unlink()); + } else if (!nodep->m_next) { + // If the node has no siblings, replace it with its children + nodep->replaceWithNonNull(nodep->m_kids.unlink()); + } else { + // The node has both children and siblings.
Splice the first child in the place of the + // node, and extract the rest of the children with the node + Node* const kidsp = nodep->m_kids.unlink(); + nodep->m_kids.link(kidsp->m_next.unlink()); + kidsp->m_next.linkNonNull(nodep->m_next.unlink()); + nodep->replaceWithNonNull(kidsp); + } + // Just stick the increased node at the front of the root list + nodep->m_next.linkNonNull(m_root.unlink()); + m_root.linkNonNull(nodep); + } + +private: + // Meld (merge) two heaps rooted at the given nodes, return the root of the new heap + VL_ATTR_ALWINLINE static Node* merge(Node* ap, Node* bp) { +#if VL_DEBUG + UASSERT(!ap->m_ownerpp && !ap->m_next, "Not root a"); + UASSERT(!bp->m_ownerpp && !bp->m_next, "Not root b"); +#endif + if (*ap > *bp) { // bp goes under ap + bp->m_next.link(ap->m_kids.unlink()); + ap->m_kids.linkNonNull(bp); + return ap; + } else { // ap goes under bp + ap->m_next.link(bp->m_kids.unlink()); + bp->m_kids.linkNonNull(ap); + return bp; + } + } + + // Reduces the list of nodes starting at the given node into a single node that is returned + VL_ATTR_NOINLINE static Node* reduce(Node* nodep) { +#if VL_DEBUG + UASSERT(!nodep->m_ownerpp, "Node is linked"); +#endif + // If there is only one node in the list, then there is nothing to do + if (!nodep->m_next) return nodep; + // The result node + Node* resultp = nullptr; + // Pairwise merge the child nodes + while (nodep) { + // Pop off the first nodes + Node* const ap = nodep; + // If we have an odd number of nodes, prepend the unpaired one onto the result list + if (!nodep->m_next) { + ap->m_next.link(resultp); + resultp = ap; + break; + } + // Pop off the second nodes + Node* const bp = nodep->m_next.unlink(); + // Keep hold of the rest of the list + nodep = bp->m_next.unlink(); + // Merge the current pair + Node* const mergedp = merge(ap, bp); + // Prepend the merged pair to the result list + mergedp->m_next.link(resultp); + resultp = mergedp; + } + // Now merge-reduce the merged pairs + while 
(resultp->m_next) { + // Pop first two results + Node* const ap = resultp; + Node* const bp = resultp->m_next.unlink(); + // Keep hold of the rest of the list + resultp = bp->m_next.unlink(); + // Merge the current pair + Node* const mergedp = merge(ap, bp); + // Prepend the merged pair to the result list + mergedp->m_next.link(resultp); + resultp = mergedp; + } + // Done + return resultp; + } +}; + +// The PairingHeap itself should be a simple pointer and nothing more +static_assert(sizeof(PairingHeap) == sizeof(PairingHeap::Node*), "Should be a pointer"); + +#endif // Guard diff --git a/src/V3Partition.cpp b/src/V3Partition.cpp index 5b1474e91..1b11a00b5 100644 --- a/src/V3Partition.cpp +++ b/src/V3Partition.cpp @@ -22,23 +22,29 @@ #include "V3Config.h" #include "V3EmitCBase.h" #include "V3File.h" -#include "V3GraphAlg.h" #include "V3GraphStream.h" #include "V3InstrCount.h" #include "V3Os.h" +#include "V3PairingHeap.h" #include "V3PartitionGraph.h" #include "V3Scoreboard.h" #include "V3Stats.h" #include "V3UniqueNames.h" #include +#include #include #include +#include #include +#include +class LogicMTask; +class MTaskEdge; class MergeCandidate; +class SiblingMC; -//###################################################################### +// ###################################################################### // Partitioner tunable settings: // // Before describing these settings, a bit of background: @@ -70,14 +76,14 @@ class MergeCandidate; // skipping the enumeration of some siblings on a few vertices does not // have a large impact on the result of the partitioner. // -// If your vertices are small, the limit (at 25) approaches a no-op. Hence +// If your vertices are small, the limit (at 26) approaches a no-op. Hence // there's basically no cost to applying this limit even when we don't // expect huge vertices. // // If you don't care about partitioner runtime and you want the most // aggressive partition, set the limit very high. 
If you have huge // vertices, leave this as is. -constexpr unsigned PART_SIBLING_EDGE_LIMIT = 25; +constexpr unsigned PART_SIBLING_EDGE_LIMIT = 26; // PART_STEPPED_COST (defined/undef) // @@ -143,10 +149,34 @@ static void partCheckCachedScoreVsActual(uint32_t cached, uint32_t actual) { #endif } -//###################################################################### +//============================================================================= +// We keep MTaskEdge graph edges in a PairingHeap, sorted by score and id + +struct EdgeKey { + // Note: Structure layout chosen to minimize padding in PairingHeap<*>::Node + uint64_t m_id; // Unique ID part of edge key + uint32_t m_score; // Score part of edge key + void increase(uint32_t score) { +#if VL_DEBUG + UASSERT(score >= m_score, "Must increase"); +#endif + m_score = score; + } + bool operator<(const EdgeKey& other) const { + // First by Score then by ID + return m_score < other.m_score || (m_score == other.m_score && m_id < other.m_id); + } +}; + +using EdgeHeap = PairingHeap; + +//============================================================================= // LogicMTask class LogicMTask final : public AbstractLogicMTask { + template + friend class PartPropagateCp; + public: // TYPES using VxList = std::list; @@ -157,55 +187,6 @@ public: } }; - // This adaptor class allows the PartPropagateCp class to be somewhat - // independent of the LogicMTask class - // - PartPropagateCp can thus be declared before LogicMTask - // - PartPropagateCp could be reused with graphs of other node types - // in the future, using another Accessor adaptor.
- class CpCostAccessor final { - public: - CpCostAccessor() = default; - ~CpCostAccessor() = default; - // Return cost of this node - uint32_t cost(const V3GraphVertex* vxp) const { - const LogicMTask* const mtaskp = static_cast(vxp); - return mtaskp->stepCost(); - } - // Return stored CP to this node - uint32_t critPathCost(const V3GraphVertex* vxp, GraphWay way) const { - const LogicMTask* const mtaskp = static_cast(vxp); - return mtaskp->critPathCost(way); - } - // Store a new CP to this node - void setCritPathCost(V3GraphVertex* vxp, GraphWay way, uint32_t cost) const { - LogicMTask* const mtaskp = static_cast(vxp); - mtaskp->setCritPathCost(way, cost); - } - // Notify vxp that the wayward CP at the throughp-->vxp edge - // has increased to 'cp'. (vxp is wayward from throughp.) - // This is our cue to update vxp's m_edges[!way][throughp]. - void notifyEdgeCp(V3GraphVertex* vxp, GraphWay way, V3GraphVertex* throuvhVxp, - uint32_t cp) const { - LogicMTask* const updateVxp = static_cast(vxp); - LogicMTask* const lthrouvhVxp = static_cast(throuvhVxp); - EdgeSet& edges = updateVxp->m_edges[way.invert()]; - const auto it = edges.find(lthrouvhVxp); - if (cp > it->second) edges.update(it, cp); - } - // Check that CP matches that of the longest edge wayward of vxp. - void checkNewCpVersusEdges(V3GraphVertex* vxp, GraphWay way, uint32_t cp) const { - LogicMTask* const mtaskp = static_cast(vxp); - const EdgeSet& edges = mtaskp->m_edges[way.invert()]; - // This is mtaskp's relative with longest !wayward inclusive CP: - const auto edgeIt = edges.rbegin(); - const uint32_t edgeCp = edgeIt->second; - UASSERT_OBJ(edgeCp == cp, vxp, "CP doesn't match longest wayward edge"); - } - - private: - VL_UNCOPYABLE(CpCostAccessor); - }; - private: // MEMBERS @@ -231,21 +212,21 @@ private: // while searching for a path. uint64_t m_generation = 0; - // Redundant with the V3GraphEdge's, store a map of relatives so we can - // quickly check if we have a given parent or child. 
- // - // 'm_edges[way]' maps a wayward relative to the !way critical path at - // our edge with them. The SortByValueMap supports iterating over - // relatives in longest-to-shortest CP order. We rely on this ordering - // in more than one place. - using EdgeSet = SortByValueMap; - std::array m_edges; + // Store a set of forward relatives so we can quickly check if we have a given child + std::unordered_set m_edgeSet; + // Store the outgoing and incoming edges in a heap sorted by the critical path length + std::array m_edgeHeap; + + // SiblingMC for which storage is owned by this MTask + std::set m_ownSibs; + // SiblingMC for which storage is owned by the opposite MTask + std::set m_farSibps; public: // CONSTRUCTORS LogicMTask(V3Graph* graphp, MTaskMoveVertex* mtmvVxp) : AbstractLogicMTask{graphp} { - for (unsigned int& i : m_critPathCost) i = 0; + for (uint32_t& item : m_critPathCost) item = 0; if (mtmvVxp) { // Else null for test m_vertices.push_back(mtmvVxp); if (const OrderLogicVertex* const olvp = mtmvVxp->logicp()) { @@ -259,6 +240,9 @@ public: } // METHODS + std::set& ownSibs() { return m_ownSibs; }; + std::set& farSibs() { return m_farSibps; }; + void moveAllVerticesFrom(LogicMTask* otherp) { // splice() is constant time m_vertices.splice(m_vertices.end(), otherp->m_vertices); @@ -296,32 +280,39 @@ public: logcost = logcost / 20.0; const uint32_t stepCost = static_cast(exp(logcost)); +#if VL_DEBUG UASSERT_STATIC(stepCost >= cost, "stepped cost error exceeded"); UASSERT_STATIC(stepCost <= ((cost * 11 / 10)), "stepped cost error exceeded"); +#endif return stepCost; #else return cost; #endif } - void addRelative(GraphWay way, LogicMTask* relativep) { - // value is !way cp to this edge - const uint32_t cp = relativep->stepCost() + relativep->critPathCost(way.invert()); - VL_ATTR_UNUSED const bool exits = !m_edges[way].emplace(relativep, cp).second; + template + void addRelativeEdge(MTaskEdge* edgep); + template + void stealRelativeEdge(MTaskEdge* edgep); + 
template + void removeRelativeEdge(MTaskEdge* edgep); + + void addRelativeMTask(LogicMTask* relativep) { + // Add the relative to connecting edge map + VL_ATTR_UNUSED const bool exits = !m_edgeSet.emplace(relativep).second; #if VL_DEBUG - UASSERT(!exits, "Adding existing edge"); + UASSERT(!exits, "Adding existing relative"); #endif } - void removeRelative(GraphWay way, LogicMTask* relativep) { m_edges[way].erase(relativep); } - bool hasRelative(GraphWay way, LogicMTask* relativep) { return m_edges[way].has(relativep); } - void checkRelativesCp(GraphWay way) const { - for (const auto& edge : vlstd::reverse_view(m_edges[way])) { - const LogicMTask* const relativep = edge.first; - const uint32_t cachedCp = edge.second; - const uint32_t cp = relativep->critPathCost(way.invert()) + relativep->stepCost(); - partCheckCachedScoreVsActual(cachedCp, cp); - } + void removeRelativeMTask(LogicMTask* relativep) { + VL_ATTR_UNUSED const size_t removed = m_edgeSet.erase(relativep); +#if VL_DEBUG + UASSERT(removed, "Relative should have been in set"); +#endif } + bool hasRelativeMTask(LogicMTask* relativep) const { return m_edgeSet.count(relativep); } + + void checkRelativesCp(GraphWay way) const; virtual string name() const override { // Display forward and reverse critical path costs. This gives a quick @@ -334,27 +325,7 @@ public: void setCritPathCost(GraphWay way, uint32_t cost) { m_critPathCost[way] = cost; } uint32_t critPathCost(GraphWay way) const { return m_critPathCost[way]; } - uint32_t critPathCostWithout(GraphWay way, const V3GraphEdge* withoutp) const { - // Compute the critical path cost wayward to this node, without - // considering edge 'withoutp' - UASSERT(this == withoutp->furtherp(way), "In critPathCostWithout(), edge 'withoutp' must " - "further to 'this'"); - - // Iterate through edges until we get a relative other than - // wayEdgeEndp(way, withoutp). This should take 2 iterations max. 
- const EdgeSet& edges = m_edges[way.invert()]; - uint32_t result = 0; - for (const auto& edge : vlstd::reverse_view(edges)) { - if (edge.first != withoutp->furtherp(way.invert())) { - // Use the cached cost. It could be a small overestimate - // due to stepping. This is consistent with critPathCost() - // which also returns the cached cost. - result = edge.second; - break; - } - } - return result; - } + uint32_t critPathCostWithout(GraphWay way, const V3GraphEdge* withoutp) const; private: static bool pathExistsFromInternal(LogicMTask* fromp, LogicMTask* top, @@ -411,65 +382,7 @@ public: return pathExistsFromInternal(fromp, top, excludedEdgep, incGeneration()); } - static void dumpCpFilePrefixed(const V3Graph* graphp, const string& nameComment) { - const string filename = v3Global.debugFilename(nameComment) + ".txt"; - UINFO(1, "Writing " << filename << endl); - const std::unique_ptr ofp{V3File::new_ofstream(filename)}; - std::ostream* const osp = &(*ofp); // &* needed to deref unique_ptr - if (osp->fail()) v3fatalStatic("Can't write " << filename); - - // Find start vertex with longest CP - const LogicMTask* startp = nullptr; - for (const V3GraphVertex* vxp = graphp->verticesBeginp(); vxp; - vxp = vxp->verticesNextp()) { - const LogicMTask* const mtaskp = static_cast(vxp); - if (!startp) { - startp = mtaskp; - continue; - } - if (mtaskp->cost() + mtaskp->critPathCost(GraphWay::REVERSE) - > startp->cost() + startp->critPathCost(GraphWay::REVERSE)) { - startp = mtaskp; - } - } - - // Follow the entire critical path - std::vector path; - uint32_t totalCost = 0; - for (const LogicMTask* nextp = startp; nextp;) { - path.push_back(nextp); - totalCost += nextp->cost(); - - const EdgeSet& children = nextp->m_edges[GraphWay::FORWARD]; - const EdgeSet::const_reverse_iterator it = children.rbegin(); - if (it == children.rend()) { - nextp = nullptr; - } else { - nextp = it->first; - } - } - - *osp << "totalCost = " << totalCost - << " (should match the computed critical path 
cost (CP) for the graph)\n"; - - // Dump - for (const LogicMTask* mtaskp : path) { - *osp << "begin mtask with cost " << mtaskp->cost() << '\n'; - for (VxList::const_iterator lit = mtaskp->vertexListp()->begin(); - lit != mtaskp->vertexListp()->end(); ++lit) { - const OrderLogicVertex* const logicp = (*lit)->logicp(); - if (!logicp) continue; - if (false) { - // Show nodes only - *osp << "> "; - logicp->nodep()->dumpTree(*osp); - } else { - // Show nodes with hierarchical costs - V3InstrCount::count(logicp->nodep(), false, osp); - } - } - } - } + static void dumpCpFilePrefixed(const V3Graph* graphp, const string& nameComment); private: VL_DEBUG_FUNC; // Declare debug() @@ -490,11 +403,20 @@ public: } }; -class SiblingMC; -class MTaskEdge; +struct MergeCandidateKey { + // Note: Structure layout chosen to minimize padding in PairingHeap<*>::Node + uint64_t m_id; // Unique ID part of merge candidate key + uint32_t m_score; // Score part of merge candidate key + bool operator<(const MergeCandidateKey& other) const { + // First by Score then by ID, but notice that we want minimums using a max-heap, so reverse + return m_score > other.m_score || (m_score == other.m_score && m_id > other.m_id); + } +}; + +using MergeCandidateScoreboard = V3Scoreboard; + +// Information associated with scoreboarding a merge candidate +class MergeCandidate VL_NOT_FINAL : public MergeCandidateScoreboard::Node { private: // Only the known subclasses can create or delete one of these friend class SiblingMC; @@ -507,18 +429,17 @@ private: // using another bit of the id to denote the actual subtype. // By using the bottom bits for flags, we can still use < to compare IDs without masking.
- uint64_t m_id; // <63:2> Serial number for ordering, <1> subtype (SiblingMC), <0> removed - static constexpr uint64_t REMOVED_MASK = 1ULL << 0; - static constexpr uint64_t IS_SIBLING_MASK = 1ULL << 1; - static constexpr uint64_t ID_INCREMENT = 1ULL << 2; + // <63:1> Serial number for ordering, <0> subtype (SiblingMC) + static constexpr uint64_t IS_SIBLING_MASK = 1ULL << 0; + static constexpr uint64_t ID_INCREMENT = 1ULL << 1; - bool isSiblingMC() const { return m_id & IS_SIBLING_MASK; } + bool isSiblingMC() const { return m_key.m_id & IS_SIBLING_MASK; } // CONSTRUCTORS explicit MergeCandidate(bool isSiblingMC) { static uint64_t serial = 0; serial += ID_INCREMENT; // +ID_INCREMENT so doesn't set the special bottom bits - m_id = serial | (isSiblingMC * IS_SIBLING_MASK); + m_key.m_id = serial | (isSiblingMC * IS_SIBLING_MASK); } ~MergeCandidate() = default; @@ -530,35 +451,33 @@ public: const MTaskEdge* toMTaskEdge() const; // Instead of dynamic_cast bool mergeWouldCreateCycle() const; // Instead of virtual method - bool removedFromSb() const { return (m_id & REMOVED_MASK) != 0; } - void removedFromSb(bool /*removed*/) { m_id |= REMOVED_MASK; } - void clearRemovedFromSb() { m_id &= ~REMOVED_MASK; } - bool operator<(const MergeCandidate& other) const { return m_id < other.m_id; } + inline void rescore(); + uint32_t score() const { return m_key.m_score; } + + static MergeCandidate* heapNodeToElem(MergeCandidateScoreboard::Node* nodep) { + return static_cast(nodep); + } }; -static_assert(sizeof(MergeCandidate) == sizeof(uint64_t), "Should not have a vtable"); +static_assert(sizeof(MergeCandidate) == sizeof(MergeCandidateScoreboard::Node), + "Should not have a vtable"); // A pair of associated LogicMTask's that are merge candidates for sibling // contraction class SiblingMC final : public MergeCandidate { private: - LogicMTask* m_ap; - LogicMTask* m_bp; + LogicMTask* const m_ap; + LogicMTask* const m_bp; public: // CONSTRUCTORS SiblingMC() = delete; 
SiblingMC(LogicMTask* ap, LogicMTask* bp) - : MergeCandidate{/* isSiblingMC: */ true} { - // Assign 'ap' and 'bp' in a canonical order, so we can more easily - // compare pairs of SiblingMCs - if (ap->id() > bp->id()) { - m_ap = ap; - m_bp = bp; - } else { - m_ap = bp; - m_bp = ap; - } + : MergeCandidate{/* isSiblingMC: */ true} + , m_ap{ap} + , m_bp{bp} { + // operator< and storage management depends on this + UASSERT(ap->id() > bp->id(), "Should be ordered"); } ~SiblingMC() = default; // METHODS @@ -580,17 +499,23 @@ static_assert(sizeof(SiblingMC) == sizeof(MergeCandidate) + 2 * sizeof(LogicMTas // GraphEdge for the MTask graph class MTaskEdge final : public V3GraphEdge, public MergeCandidate { + friend class LogicMTask; + template + friend class PartPropagateCp; + + // MEMBERS + // This edge can be in 2 EdgeHeaps, one forward and one reverse. We allocate the heap nodes + // directly within the edge as they are always required and this makes association cheap. + EdgeHeap::Node m_edgeHeapNode[GraphWay::NUM_WAYS]; + public: // CONSTRUCTORS MTaskEdge(V3Graph* graphp, LogicMTask* fromp, LogicMTask* top, int weight) : V3GraphEdge{graphp, fromp, top, weight} , MergeCandidate{/* isSiblingMC: */ false} { - fromp->addRelative(GraphWay::FORWARD, top); - top->addRelative(GraphWay::REVERSE, fromp); - } - virtual ~MTaskEdge() override { - fromMTaskp()->removeRelative(GraphWay::FORWARD, toMTaskp()); - toMTaskp()->removeRelative(GraphWay::REVERSE, fromMTaskp()); + fromp->addRelativeMTask(top); + fromp->addRelativeEdge(this); + top->addRelativeEdge(this); } // METHODS LogicMTask* furtherMTaskp(GraphWay way) const { @@ -601,28 +526,142 @@ public: bool mergeWouldCreateCycle() const { return LogicMTask::pathExistsFrom(fromMTaskp(), toMTaskp(), this); } - static MTaskEdge* cast(V3GraphEdge* edgep) { - if (!edgep) return nullptr; - MTaskEdge* const resultp = dynamic_cast(edgep); - UASSERT(resultp, "Failed to cast in MTaskEdge::cast"); - return resultp; - } // Following initial 
assignment of critical paths, clear this MTaskEdge // out of the edge-map for each node and reinsert at a new location // with updated critical path. void resetCriticalPaths() { LogicMTask* const fromp = fromMTaskp(); LogicMTask* const top = toMTaskp(); - fromp->removeRelative(GraphWay::FORWARD, top); - top->removeRelative(GraphWay::REVERSE, fromp); - fromp->addRelative(GraphWay::FORWARD, top); - top->addRelative(GraphWay::REVERSE, fromp); + fromp->removeRelativeEdge(this); + top->removeRelativeEdge(this); + fromp->addRelativeEdge(this); + top->addRelativeEdge(this); + } + + uint32_t cachedCp(GraphWay way) const { return m_edgeHeapNode[way].key().m_score; } + + // Convert from the address of the m_edgeHeapNode[way] in an MTaskEdge back to the MTaskEdge + static const MTaskEdge* toMTaskEdge(GraphWay way, const EdgeHeap::Node* nodep) { + const size_t offset = VL_OFFSETOF(MTaskEdge, m_edgeHeapNode[way]); + return reinterpret_cast(reinterpret_cast(nodep) - offset); } private: VL_UNCOPYABLE(MTaskEdge); }; +template +void LogicMTask::addRelativeEdge(MTaskEdge* edgep) { + constexpr GraphWay way{T_Way}; + constexpr GraphWay inv = way.invert(); + // Add to the edge heap + LogicMTask* const relativep = edgep->furtherMTaskp(way); + // Value is !way cp to this edge + const uint32_t cp = relativep->stepCost() + relativep->critPathCost(inv); + // + m_edgeHeap[way].insert(&edgep->m_edgeHeapNode[way], {relativep->id(), cp}); +} + +template +void LogicMTask::stealRelativeEdge(MTaskEdge* edgep) { + constexpr GraphWay way{T_Way}; + // Make heap node insertable, ruining the heap it is currently in. 
+ edgep->m_edgeHeapNode[way].yank(); + // Add the edge as new + addRelativeEdge(edgep); +} + +template +void LogicMTask::removeRelativeEdge(MTaskEdge* edgep) { + constexpr GraphWay way{T_Way}; + // Remove from the edge heap + m_edgeHeap[way].remove(&edgep->m_edgeHeapNode[way]); +} + +void LogicMTask::checkRelativesCp(GraphWay way) const { + for (V3GraphEdge* edgep = beginp(way); edgep; edgep = edgep->nextp(way)) { + const LogicMTask* const relativep = static_cast(edgep->furtherp(way)); + const uint32_t cachedCp = static_cast(edgep)->cachedCp(way); + const uint32_t cp = relativep->critPathCost(way.invert()) + relativep->stepCost(); + partCheckCachedScoreVsActual(cachedCp, cp); + } +} + +uint32_t LogicMTask::critPathCostWithout(GraphWay way, const V3GraphEdge* withoutp) const { + // Compute the critical path cost wayward to this node, without considering edge 'withoutp'. + // We need to look at two edges at most, the critical path if that is not via 'withoutp', + // or the second-worst path, if the critical path is via 'withoutp'. 
+#if VL_DEBUG + UASSERT(withoutp->furtherp(way) == this, + "In critPathCostWithout(), edge 'withoutp' must further to 'this'"); +#endif + const GraphWay inv = way.invert(); + const EdgeHeap& edgeHeap = m_edgeHeap[inv]; + const EdgeHeap::Node* const maxp = edgeHeap.max(); + if (!maxp) return 0; + if (MTaskEdge::toMTaskEdge(inv, maxp) != withoutp) return maxp->key().m_score; + const EdgeHeap::Node* const secp = edgeHeap.secondMax(); + if (!secp) return 0; + return secp->key().m_score; +} + +void LogicMTask::dumpCpFilePrefixed(const V3Graph* graphp, const string& nameComment) { + const string filename = v3Global.debugFilename(nameComment) + ".txt"; + UINFO(1, "Writing " << filename << endl); + const std::unique_ptr ofp{V3File::new_ofstream(filename)}; + std::ostream* const osp = &(*ofp); // &* needed to deref unique_ptr + if (osp->fail()) v3fatalStatic("Can't write " << filename); + + // Find start vertex with longest CP + LogicMTask* startp = nullptr; + for (V3GraphVertex* vxp = graphp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { + LogicMTask* const mtaskp = static_cast(vxp); + if (!startp) { + startp = mtaskp; + continue; + } + if (mtaskp->cost() + mtaskp->critPathCost(GraphWay::REVERSE) + > startp->cost() + startp->critPathCost(GraphWay::REVERSE)) { + startp = mtaskp; + } + } + + // Follow the entire critical path + std::vector path; + uint32_t totalCost = 0; + for (LogicMTask* nextp = startp; nextp;) { + path.push_back(nextp); + totalCost += nextp->cost(); + + if (EdgeHeap::Node* const maxp = nextp->m_edgeHeap[GraphWay::FORWARD].max()) { + nextp = MTaskEdge::toMTaskEdge(GraphWay::FORWARD, maxp)->toMTaskp(); + } else { + nextp = nullptr; + } + } + + *osp << "totalCost = " << totalCost + << " (should match the computed critical path cost (CP) for the graph)\n"; + + // Dump + for (const LogicMTask* mtaskp : path) { + *osp << "begin mtask with cost " << mtaskp->cost() << '\n'; + for (VxList::const_iterator lit = mtaskp->vertexListp()->begin(); + lit != 
mtaskp->vertexListp()->end(); ++lit) { + const OrderLogicVertex* const logicp = (*lit)->logicp(); + if (!logicp) continue; + if (false) { + // Show nodes only + *osp << "> "; + logicp->nodep()->dumpTree(*osp); + } else { + // Show nodes with hierarchical costs + V3InstrCount::count(logicp->nodep(), false, osp); + } + } + } +} + // Instead of dynamic cast SiblingMC* MergeCandidate::toSiblingMC() { return isSiblingMC() ? static_cast(this) : nullptr; @@ -647,6 +686,40 @@ bool MergeCandidate::mergeWouldCreateCycle() const { : static_cast(this)->mergeWouldCreateCycle(); } +static uint32_t siblingScore(const SiblingMC* sibsp) { + const LogicMTask* const ap = sibsp->ap(); + const LogicMTask* const bp = sibsp->bp(); + const uint32_t mergedCpCostFwd + = std::max(ap->critPathCost(GraphWay::FORWARD), bp->critPathCost(GraphWay::FORWARD)); + const uint32_t mergedCpCostRev + = std::max(ap->critPathCost(GraphWay::REVERSE), bp->critPathCost(GraphWay::REVERSE)); + return mergedCpCostRev + mergedCpCostFwd + LogicMTask::stepCost(ap->cost() + bp->cost()); +} + +static uint32_t edgeScore(const MTaskEdge* edgep) { + // Score this edge. Lower is better. The score is the new local CP + // length if we merge these mtasks. ("Local" means the longest + // critical path running through the merged node.) 
+ const LogicMTask* const top = static_cast(edgep->top()); + const LogicMTask* const fromp = static_cast(edgep->fromp()); + const uint32_t mergedCpCostFwd = std::max(fromp->critPathCost(GraphWay::FORWARD), + top->critPathCostWithout(GraphWay::FORWARD, edgep)); + const uint32_t mergedCpCostRev = std::max(fromp->critPathCostWithout(GraphWay::REVERSE, edgep), + top->critPathCost(GraphWay::REVERSE)); + return mergedCpCostRev + mergedCpCostFwd + LogicMTask::stepCost(fromp->cost() + top->cost()); +} + +void MergeCandidate::rescore() { + if (const SiblingMC* const sibp = toSiblingMC()) { + m_key.m_score = siblingScore(sibp); + } else { + // The '1 +' favors merging a SiblingMC over an otherwise- + // equal-scoring MTaskEdge. The comment on selfTest() talks + // about why. + m_key.m_score = 1 + edgeScore(static_cast(this)); + } +} + // ###################################################################### // Vertex utility classes @@ -813,7 +886,6 @@ static void partCheckCriticalPaths(V3Graph* mtasksp) { // Usage: // * Client increases the cost and/or CP at a node or small set of nodes // (often a pair in practice, eg. edge contraction.) -// * Client instances a PartPropagateCp object // * Client calls PartPropagateCp::cpHasIncreased() one or more times. // Each call indicates that the inclusive CP of some "seed" vertex // has increased to a given value. @@ -823,53 +895,120 @@ static void partCheckCriticalPaths(V3Graph* mtasksp) { // * Client calls PartPropagateCp::go(). Internally, this iteratively // propagates the new CPs wayward through the graph. 
// +template +class PartPropagateCp final { + // TYPES + + // We keep pending vertices in a heap during critical path propagation + struct PendingKey { + LogicMTask* m_mtaskp; // The vertex in the heap + uint32_t m_score; // The score of this entry + void increase(uint32_t score) { +#if VL_DEBUG + UASSERT(score >= m_score, "Must increase"); +#endif + m_score = score; + } + bool operator<(const PendingKey& other) const { + if (m_score != other.m_score) return m_score < other.m_score; + return LogicMTask::CmpLogicMTask{}(m_mtaskp, other.m_mtaskp); + } + }; + + using PendingHeap = PairingHeap; + using PendingHeapNode = typename PendingHeap::Node; -class PartPropagateCp final : GraphAlg<> { -private: // MEMBERS - const GraphWay m_way; // CPs oriented in this direction: either FORWARD - // // from graph-start to current node, or REVERSE - // // from graph-end to current node. - LogicMTask::CpCostAccessor m_access; // Access cost and CPs on V3GraphVertex's. - // // confirm we only process each vertex once. 
- const bool m_slowAsserts; // Enable nontrivial asserts - // Pending rescores - SortByValueMap m_pending; + PendingHeap m_pendingHeap; // Heap of pending rescores + // We allocate this many heap nodes at once + static constexpr size_t ALLOC_CHUNK_SIZE = 128; + PendingHeapNode* m_freep = nullptr; // List of free heap nodes + std::vector> m_allocated; // Allocated heap nodes + + const bool m_slowAsserts; // Enable nontrivial asserts std::set m_seen; // Used only with slow asserts to check mtasks visited only once public: // CONSTRUCTORS - PartPropagateCp(V3Graph* graphp, GraphWay way, bool slowAsserts, - V3EdgeFuncP edgeFuncp = &V3GraphEdge::followAlwaysTrue) - : GraphAlg<>{graphp, edgeFuncp} - , m_way{way} - , m_slowAsserts{slowAsserts} {} + PartPropagateCp(bool slowAsserts) + : m_slowAsserts{slowAsserts} {} // METHODS +private: + // Allocate a HeapNode for the given element + PendingHeapNode* allocNode() { + // If no free nodes available, then make some + if (!m_freep) { + // Allocate in chunks for efficiency + m_allocated.emplace_back(new PendingHeapNode[ALLOC_CHUNK_SIZE]); + // Set up free list pointer + m_freep = m_allocated.back().get(); + // Set up free list chain + for (size_t i = 1; i < ALLOC_CHUNK_SIZE; ++i) { + m_freep[i - 1].m_next.m_ptr = &m_freep[i]; + } + // Clear the next pointer of the last entry + m_freep[ALLOC_CHUNK_SIZE - 1].m_next.m_ptr = nullptr; + } + // Free nodes are available, pick up the first one + PendingHeapNode* const resultp = m_freep; + m_freep = resultp->m_next.m_ptr; + resultp->m_next.m_ptr = nullptr; + return resultp; + } + + // Release a heap node (make it available for future allocation) + void freeNode(PendingHeapNode* nodep) { + // Re-use the existing link pointers and simply prepend it to the free list + nodep->m_next.m_ptr = m_freep; + m_freep = nodep; + } + +public: void cpHasIncreased(V3GraphVertex* vxp, uint32_t newInclusiveCp) { + constexpr GraphWay way{T_Way}; + constexpr GraphWay inv{way.invert()}; + // For *vxp, whose 
CP-inclusive has just increased to // newInclusiveCp, iterate to all wayward nodes, update the edges // of each, and add each to m_pending if its overall CP has grown. - for (V3GraphEdge* edgep = vxp->beginp(m_way); edgep; edgep = edgep->nextp(m_way)) { - if (!m_edgeFuncp(edgep)) continue; - LogicMTask* const relativep = static_cast(edgep->furtherp(m_way)); - m_access.notifyEdgeCp(relativep, m_way, vxp, newInclusiveCp); + for (MTaskEdge *edgep = static_cast(vxp->beginp(way)), *nextp; edgep; + edgep = nextp) { + // Fetch early as likely cache miss + nextp = static_cast(edgep->nextp(way)); - if (m_access.critPathCost(relativep, m_way) < newInclusiveCp) { - // relativep's critPathCost() is out of step with its - // longest !wayward edge. Schedule that to be resolved. - const uint32_t newPendingVal - = newInclusiveCp - m_access.critPathCost(relativep, m_way); - const auto pair = m_pending.emplace(relativep, newPendingVal); - if (!pair.second && (newPendingVal > pair.first->second)) { - m_pending.update(pair.first, newPendingVal); - } + LogicMTask* const relativep = edgep->furtherMTaskp(way); + EdgeHeap::Node& edgeHeapNode = edgep->m_edgeHeapNode[inv]; + if (newInclusiveCp > edgeHeapNode.key().m_score) { + relativep->m_edgeHeap[inv].increaseKey(&edgeHeapNode, newInclusiveCp); } + + const uint32_t critPathCost = relativep->critPathCost(way); + + if (critPathCost >= newInclusiveCp) continue; + + // relativep's critPathCost() is out of step with its longest !wayward edge. + // Schedule that to be resolved. + const uint32_t newVal = newInclusiveCp - critPathCost; + + if (PendingHeapNode* const nodep = static_cast(relativep->userp())) { + // Already in heap. Increase score if needed. 
+ if (newVal > nodep->key().m_score) m_pendingHeap.increaseKey(nodep, newVal); + continue; + } + + // Add to heap + PendingHeapNode* const nodep = allocNode(); + relativep->userp(nodep); + m_pendingHeap.insert(nodep, {relativep, newVal}); } } void go() { + constexpr GraphWay way{T_Way}; + constexpr GraphWay inv{way.invert()}; + // m_pending maps each pending vertex to the amount that it wayward // CP will grow. // @@ -886,27 +1025,34 @@ public: // once. And so on. // // This generalizes to multiple seed nodes also. - while (!m_pending.empty()) { - const auto it = m_pending.rbegin(); - LogicMTask* const updateMep = it->first; - const uint32_t cpGrowBy = it->second; - m_pending.erase(it); - - // For *updateMep, whose critPathCost was out-of-date with respect - // to its edges, update the critPathCost. - const uint32_t startCp = m_access.critPathCost(updateMep, m_way); + while (!m_pendingHeap.empty()) { + // Pop max element from heap + PendingHeapNode* const maxp = m_pendingHeap.max(); + m_pendingHeap.remove(maxp); + // Pick up values + LogicMTask* const mtaskp = maxp->key().m_mtaskp; + const uint32_t cpGrowBy = maxp->key().m_score; + // Free the heap node, we are done with it + freeNode(maxp); + mtaskp->userp(nullptr); + // Update the critPathCost of mtaskp, that was out-of-date with respect to its edges + const uint32_t startCp = mtaskp->critPathCost(way); const uint32_t newCp = startCp + cpGrowBy; if (VL_UNLIKELY(m_slowAsserts)) { - m_access.checkNewCpVersusEdges(updateMep, m_way, newCp); + // Check that CP matches that of the longest edge wayward of vxp. + const uint32_t edgeCp = mtaskp->m_edgeHeap[inv].max()->key().m_score; + UASSERT_OBJ(edgeCp == newCp, mtaskp, "CP doesn't match longest wayward edge"); // Confirm that we only set each node's CP once. That's an // important property of PartPropagateCp which allows it to be far // faster than a recursive algorithm on some graphs. 
- const bool first = m_seen.insert(updateMep).second; - UASSERT_OBJ(first, updateMep, "Set CP on node twice"); + const bool first = m_seen.insert(mtaskp).second; + UASSERT_OBJ(first, mtaskp, "Set CP on node twice"); } - m_access.setCritPathCost(updateMep, m_way, newCp); - cpHasIncreased(updateMep, newCp + m_access.cost(updateMep)); + mtaskp->setCritPathCost(way, newCp); + cpHasIncreased(mtaskp, newCp + mtaskp->stepCost()); } + + if (VL_UNLIKELY(m_slowAsserts)) m_seen.clear(); } private: @@ -939,11 +1085,11 @@ private: const unsigned idx1 = V3Os::rand64(rngState) % 50; const unsigned idx2 = V3Os::rand64(rngState) % 50; if (idx1 > idx2) { - if (!m_vx[idx2]->hasRelative(GraphWay::FORWARD, m_vx[idx1])) { + if (!m_vx[idx2]->hasRelativeMTask(m_vx[idx1])) { new MTaskEdge{&m_graph, m_vx[idx2], m_vx[idx1], 1}; } } else if (idx2 > idx1) { - if (!m_vx[idx1]->hasRelative(GraphWay::FORWARD, m_vx[idx2])) { + if (!m_vx[idx1]->hasRelativeMTask(m_vx[idx2])) { new MTaskEdge{&m_graph, m_vx[idx1], m_vx[idx2], 1}; } } @@ -952,7 +1098,7 @@ private: partInitCriticalPaths(&m_graph); // This SelfTest class is also the T_CostAccessor - PartPropagateCp prop(&m_graph, GraphWay::FORWARD, true); + PartPropagateCp prop(true); // Seed the propagator with every input node; // This should result in the complete graph getting all CP's assigned. @@ -961,9 +1107,6 @@ private: } // Run the propagator. - // * The setCritPathCost() routine checks that each node's CP changes - // at most once. - // * The notifyEdgeCp routine is also self checking. prop.go(); // Finally, confirm that the entire graph appears to have correct CPs. @@ -976,7 +1119,7 @@ public: // Merge edges from a LogicMtask. // -// This code removes 'hasRelative' edges. When this occurs, mark it in need +// This code removes adjacent edges. When this occurs, mark it in need // of a rescore, in case its score has fallen and we need to move it up // toward the front of the scoreboard. 
// @@ -1007,51 +1150,90 @@ public: // // Another way of stating this: this code ensures that scores of // non-transitive edges only ever increase. -static void partRedirectEdgesFrom(LogicMTask* recipientp, LogicMTask* donorp, - V3Scoreboard* sbp) { - for (const auto& way : {GraphWay::FORWARD, GraphWay::REVERSE}) { - for (V3GraphEdge *edgep = donorp->beginp(way), *nextp; edgep; edgep = nextp) { - nextp = edgep->nextp(way); - MTaskEdge* const tedgep = MTaskEdge::cast(edgep); - LogicMTask* const relativep = tedgep->furtherMTaskp(way); - if (recipientp->hasRelative(way, relativep)) { - // An edge already exists between recipient and relative of donor. - // Mark it in need of a rescore - if (sbp) { - if (!tedgep->removedFromSb()) sbp->removeElem(tedgep); - const MTaskEdge* const existMTaskEdgep - = MTaskEdge::cast(recipientp->findConnectingEdgep(way, relativep)); - UASSERT(existMTaskEdgep, "findConnectingEdge didn't find edge"); - if (!existMTaskEdgep->removedFromSb()) { - sbp->hintScoreChanged(existMTaskEdgep); - } - } - VL_DO_DANGLING(edgep->unlinkDelete(), edgep); - } else { - // No existing edge between recipient and relative of donor. - // Redirect the edge from donor<->relative to recipient<->relative. 
- if (way == GraphWay::REVERSE) { - tedgep->relinkTop(recipientp); - relativep->removeRelative(GraphWay::FORWARD, donorp); - relativep->addRelative(GraphWay::FORWARD, recipientp); - recipientp->addRelative(GraphWay::REVERSE, relativep); +static void partRedirectEdgesFrom(V3Graph* graphp, LogicMTask* recipientp, LogicMTask* donorp, + MergeCandidateScoreboard* sbp) { + + // Process outgoing edges + MTaskEdge* outNextp = static_cast(donorp->outBeginp()); + while (outNextp) { + MTaskEdge* const edgep = outNextp; + LogicMTask* const relativep = outNextp->toMTaskp(); + outNextp = static_cast(outNextp->outNextp()); + + relativep->removeRelativeEdge(edgep); + + if (recipientp->hasRelativeMTask(relativep)) { + // An edge already exists between recipient and relative of donor. + // Mark it in need of a rescore + if (sbp) { + if (sbp->contains(edgep)) sbp->remove(edgep); + MTaskEdge* const existMTaskEdgep = static_cast( + recipientp->findConnectingEdgep(GraphWay::FORWARD, relativep)); +#if VL_DEBUG + UASSERT(existMTaskEdgep, "findConnectingEdge didn't find edge"); +#endif + if (sbp->contains(existMTaskEdgep)) sbp->hintScoreChanged(existMTaskEdgep); + } + VL_DO_DANGLING(edgep->unlinkDelete(), edgep); + } else { + // No existing edge between recipient and relative of donor. + // Redirect the edge from donor<->relative to recipient<->relative. 
+ edgep->relinkFromp(recipientp); + recipientp->addRelativeMTask(relativep); + recipientp->stealRelativeEdge(edgep); + relativep->addRelativeEdge(edgep); + if (sbp) { + if (!sbp->contains(edgep)) { + sbp->add(edgep); } else { - tedgep->relinkFromp(recipientp); - relativep->removeRelative(GraphWay::REVERSE, donorp); - relativep->addRelative(GraphWay::REVERSE, recipientp); - recipientp->addRelative(GraphWay::FORWARD, relativep); - } - if (sbp) { - if (tedgep->removedFromSb()) { - tedgep->clearRemovedFromSb(); - sbp->addElem(tedgep); - } else { - sbp->hintScoreChanged(tedgep); - } + sbp->hintScoreChanged(edgep); } } } } + + // Process incoming edges + MTaskEdge* inNextp = static_cast(donorp->inBeginp()); + while (inNextp) { + MTaskEdge* const edgep = inNextp; + LogicMTask* const relativep = inNextp->fromMTaskp(); + inNextp = static_cast(inNextp->inNextp()); + + relativep->removeRelativeMTask(donorp); + relativep->removeRelativeEdge(edgep); + + if (relativep->hasRelativeMTask(recipientp)) { + // An edge already exists between recipient and relative of donor. + // Mark it in need of a rescore + if (sbp) { + if (sbp->contains(edgep)) sbp->remove(edgep); + MTaskEdge* const existMTaskEdgep = static_cast( + recipientp->findConnectingEdgep(GraphWay::REVERSE, relativep)); +#if VL_DEBUG + UASSERT(existMTaskEdgep, "findConnectingEdge didn't find edge"); +#endif + if (sbp->contains(existMTaskEdgep)) sbp->hintScoreChanged(existMTaskEdgep); + } + VL_DO_DANGLING(edgep->unlinkDelete(), edgep); + } else { + // No existing edge between recipient and relative of donor. + // Redirect the edge from donor<->relative to recipient<->relative. 
+ edgep->relinkTop(recipientp); + relativep->addRelativeMTask(recipientp); + relativep->addRelativeEdge(edgep); + recipientp->stealRelativeEdge(edgep); + if (sbp) { + if (!sbp->contains(edgep)) { + sbp->add(edgep); + } else { + sbp->hintScoreChanged(edgep); + } + } + } + } + + // Remove donorp from the graph + VL_DO_DANGLING(donorp->unlinkDelete(graphp), donorp); } //###################################################################### @@ -1061,14 +1243,6 @@ static void partRedirectEdgesFrom(LogicMTask* recipientp, LogicMTask* donorp, class PartContraction final { private: // TYPES - - // TODO: might get a little more speed by making this a - // std::unordered_set and defining hash and equal_to functors for the - // SiblingMC: - using SibSet = std::set; - using SibpSet = std::unordered_set; - using MTask2Sibs = std::unordered_map; - // New CP information for mtaskp reflecting an upcoming merge struct NewCp { uint32_t cp; @@ -1082,17 +1256,17 @@ private: uint32_t m_scoreLimitBeforeRescore = 0xffffffff; // Next score rescore at unsigned m_mergesSinceRescore = 0; // Merges since last rescore const bool m_slowAsserts; // Take extra time to validate algorithm - V3Scoreboard m_sb; // Scoreboard - SibSet m_pairs; // Storage for each SiblingMC - MTask2Sibs m_mtask2sibs; // SiblingMC set for each mtask + MergeCandidateScoreboard m_sb; // Scoreboard + + PartPropagateCp m_forwardPropagator{m_slowAsserts}; // Forward propagator + PartPropagateCp m_reversePropagator{m_slowAsserts}; // Reverse propagator public: // CONSTRUCTORS PartContraction(V3Graph* mtasksp, uint32_t scoreLimit, bool slowAsserts) : m_mtasksp{mtasksp} , m_scoreLimit{scoreLimit} - , m_slowAsserts{slowAsserts} - , m_sb{&mergeCandidateScore, slowAsserts} {} + , m_slowAsserts{slowAsserts} {} // METHODS void go() { @@ -1116,17 +1290,18 @@ public: // - Incrementally recompute critical paths near the merged mtask. 
for (V3GraphVertex* itp = m_mtasksp->verticesBeginp(); itp; itp = itp->verticesNextp()) { + itp->userp(nullptr); // Reset user value. Used by PartPropagateCp. std::unordered_set neighbors; for (V3GraphEdge* edgep = itp->outBeginp(); edgep; edgep = edgep->outNextp()) { - m_sb.addElem(MTaskEdge::cast(edgep)); + m_sb.add(static_cast(edgep)); if (m_slowAsserts) { UASSERT_OBJ(neighbors.find(edgep->top()) == neighbors.end(), itp, "Redundant edge found in input to PartContraction()"); } neighbors.insert(edgep->top()); } - siblingPairFromRelatives(GraphWay::REVERSE, itp, true); - siblingPairFromRelatives(GraphWay::FORWARD, itp, true); + siblingPairFromRelatives(itp); + siblingPairFromRelatives(itp); } doRescore(); // Set initial scores in scoreboard @@ -1134,7 +1309,7 @@ public: while (true) { // This is the best edge to merge, with the lowest // score (shortest local critical path) - MergeCandidate* const mergeCanp = const_cast(m_sb.bestp()); + MergeCandidate* const mergeCanp = m_sb.best(); if (!mergeCanp) { // Scoreboard found no eligible merges. Maybe a rescore // will produce some merge-able pairs? @@ -1149,8 +1324,9 @@ public: UASSERT(!m_sb.needsRescore(mergeCanp), "Need-rescore items should not be returned by bestp"); } - const uint32_t cachedScore = m_sb.cachedScore(mergeCanp); - const uint32_t actualScore = mergeCandidateScore(mergeCanp); + const uint32_t cachedScore = mergeCanp->score(); + mergeCanp->rescore(); + const uint32_t actualScore = mergeCanp->score(); if (actualScore > cachedScore) { // Cached score is out-of-date. @@ -1211,8 +1387,11 @@ public: if (mergeCanp->mergeWouldCreateCycle()) { // Remove this edge from scoreboard so we don't keep // reconsidering it on every loop. 
- m_sb.removeElem(mergeCanp); - mergeCanp->removedFromSb(true); + m_sb.remove(mergeCanp); + if (SiblingMC* const smcp = mergeCanp->toSiblingMC()) { + smcp->bp()->farSibs().erase(smcp); + smcp->ap()->ownSibs().erase(*smcp); // Kills *smcp, so do last + } continue; } @@ -1245,7 +1424,9 @@ public: } private: - NewCp newCp(GraphWay way, LogicMTask* mtaskp, LogicMTask* otherp, MTaskEdge* mergeEdgep) { + template + NewCp newCp(LogicMTask* mtaskp, LogicMTask* otherp, MTaskEdge* mergeEdgep) { + constexpr GraphWay way{T_Way}; // Return new wayward-CP for mtaskp reflecting its upcoming merge // with otherp. Set 'result.propagate' if mtaskp's wayward // relatives will see a new wayward CP from this merge. @@ -1274,31 +1455,29 @@ private: } void removeSiblingMCsWith(LogicMTask* mtaskp) { - for (SibpSet::iterator it = m_mtask2sibs[mtaskp].begin(); it != m_mtask2sibs[mtaskp].end(); - ++it) { - const SiblingMC* const pairp = *it; - if (!pairp->removedFromSb()) m_sb.removeElem(pairp); - const LogicMTask* const otherp = (pairp->bp() == mtaskp) ? 
pairp->ap() : pairp->bp(); - size_t erased = m_mtask2sibs[otherp].erase(pairp); - UASSERT_OBJ(erased > 0, otherp, "Expected existing mtask"); - erased = m_pairs.erase(*pairp); - UASSERT_OBJ(erased > 0, mtaskp, "Expected existing mtask"); + for (const SiblingMC& pair : mtaskp->ownSibs()) { + m_sb.remove(const_cast(&pair)); + // Owner is always ap(), remove from the opposite side + pair.bp()->farSibs().erase(&pair); } - const size_t erased = m_mtask2sibs.erase(mtaskp); - UASSERT_OBJ(erased > 0, mtaskp, "Expected existing mtask"); + for (const SiblingMC* const pairp : mtaskp->farSibs()) { + m_sb.remove(const_cast(pairp)); + // Owner is always ap(), remove from the opposite side + pairp->ap()->ownSibs().erase(*pairp); + } + mtaskp->ownSibs().clear(); + mtaskp->farSibs().clear(); } void contract(MergeCandidate* mergeCanp) { LogicMTask* top = nullptr; LogicMTask* fromp = nullptr; MTaskEdge* mergeEdgep = mergeCanp->toMTaskEdge(); - const SiblingMC* mergeSibsp = nullptr; if (mergeEdgep) { top = static_cast(mergeEdgep->top()); fromp = static_cast(mergeEdgep->fromp()); } else { - mergeSibsp = mergeCanp->toSiblingMC(); - UASSERT(mergeSibsp, "Failed to cast mergeCanp to either MTaskEdge or SiblingMC"); + const SiblingMC* mergeSibsp = static_cast(mergeCanp); top = mergeSibsp->ap(); fromp = mergeSibsp->bp(); } @@ -1329,15 +1508,18 @@ private: // // These 'NewCp' objects carry a bit indicating whether we must // propagate CP for each of the four cases: - const NewCp recipientNewCpFwd = newCp(GraphWay::FORWARD, recipientp, donorp, mergeEdgep); - const NewCp donorNewCpFwd = newCp(GraphWay::FORWARD, donorp, recipientp, mergeEdgep); - const NewCp recipientNewCpRev = newCp(GraphWay::REVERSE, recipientp, donorp, mergeEdgep); - const NewCp donorNewCpRev = newCp(GraphWay::REVERSE, donorp, recipientp, mergeEdgep); + const NewCp recipientNewCpFwd = newCp(recipientp, donorp, mergeEdgep); + const NewCp donorNewCpFwd = newCp(donorp, recipientp, mergeEdgep); + const NewCp recipientNewCpRev = 
newCp(recipientp, donorp, mergeEdgep); + const NewCp donorNewCpRev = newCp(donorp, recipientp, mergeEdgep); if (mergeEdgep) { // Remove and free the connecting edge. Must do this before // propagating CP's below. - m_sb.removeElem(mergeCanp); + m_sb.remove(mergeCanp); + mergeEdgep->fromMTaskp()->removeRelativeMTask(mergeEdgep->toMTaskp()); + mergeEdgep->fromMTaskp()->removeRelativeEdge(mergeEdgep); + mergeEdgep->toMTaskp()->removeRelativeEdge(mergeEdgep); VL_DO_CLEAR(mergeEdgep->unlinkDelete(), mergeEdgep = nullptr); } @@ -1353,25 +1535,22 @@ private: << (donorNewCpFwd.propagate ? " true " : " false ") << donorNewCpFwd.propagateCp << endl); - PartPropagateCp forwardPropagator(m_mtasksp, GraphWay::FORWARD, m_slowAsserts); - PartPropagateCp reversePropagator(m_mtasksp, GraphWay::REVERSE, m_slowAsserts); - recipientp->setCritPathCost(GraphWay::FORWARD, recipientNewCpFwd.cp); if (recipientNewCpFwd.propagate) { - forwardPropagator.cpHasIncreased(recipientp, recipientNewCpFwd.propagateCp); + m_forwardPropagator.cpHasIncreased(recipientp, recipientNewCpFwd.propagateCp); } recipientp->setCritPathCost(GraphWay::REVERSE, recipientNewCpRev.cp); if (recipientNewCpRev.propagate) { - reversePropagator.cpHasIncreased(recipientp, recipientNewCpRev.propagateCp); + m_reversePropagator.cpHasIncreased(recipientp, recipientNewCpRev.propagateCp); } if (donorNewCpFwd.propagate) { - forwardPropagator.cpHasIncreased(donorp, donorNewCpFwd.propagateCp); + m_forwardPropagator.cpHasIncreased(donorp, donorNewCpFwd.propagateCp); } if (donorNewCpRev.propagate) { - reversePropagator.cpHasIncreased(donorp, donorNewCpRev.propagateCp); + m_reversePropagator.cpHasIncreased(donorp, donorNewCpRev.propagateCp); } - forwardPropagator.go(); - reversePropagator.go(); + m_forwardPropagator.go(); + m_reversePropagator.go(); // Remove all SiblingMCs that include donorp. This Includes the one // we're merging, if we're merging a SiblingMC. @@ -1381,11 +1560,8 @@ private: // to a bounded number. 
removeSiblingMCsWith(recipientp); - // Redirect all edges - partRedirectEdgesFrom(recipientp, donorp, &m_sb); - - // Delete the donorp mtask from the graph - VL_DO_CLEAR(donorp->unlinkDelete(m_mtasksp), donorp = nullptr); + // Redirect all edges, delete donorp + partRedirectEdgesFrom(m_mtasksp, recipientp, donorp, &m_sb); ++m_mergesSinceRescore; @@ -1398,21 +1574,21 @@ private: // - prereqs of recipientp's postreqs // - postreqs of recipientp's prereqs // Note that this depends on the updated critical paths (above). - siblingPairFromRelatives(GraphWay::REVERSE, recipientp, true); - siblingPairFromRelatives(GraphWay::FORWARD, recipientp, true); + siblingPairFromRelatives(recipientp); + siblingPairFromRelatives(recipientp); unsigned edges = 0; for (V3GraphEdge* edgep = recipientp->outBeginp(); edgep; edgep = edgep->outNextp()) { LogicMTask* const postreqp = static_cast(edgep->top()); - siblingPairFromRelatives(GraphWay::REVERSE, postreqp, false); + siblingPairFromRelatives(postreqp); ++edges; - if (edges > PART_SIBLING_EDGE_LIMIT) break; + if (edges >= PART_SIBLING_EDGE_LIMIT) break; } edges = 0; for (V3GraphEdge* edgep = recipientp->inBeginp(); edgep; edgep = edgep->inNextp()) { LogicMTask* const prereqp = static_cast(edgep->fromp()); - siblingPairFromRelatives(GraphWay::FORWARD, prereqp, false); + siblingPairFromRelatives(prereqp); ++edges; - if (edges > PART_SIBLING_EDGE_LIMIT) break; + if (edges >= PART_SIBLING_EDGE_LIMIT) break; } } @@ -1429,111 +1605,86 @@ private: m_scoreLimitBeforeRescore = 0xffffffff; } - static uint32_t mergeCandidateScore(const MergeCandidate* pairp) { - if (const MTaskEdge* const edgep = pairp->toMTaskEdge()) { - // The '1 +' favors merging a SiblingMC over an otherwise- - // equal-scoring MTaskEdge. The comment on selfTest() talks - // about why. 
- return 1 + edgeScore(edgep); - } else { - return siblingScore(pairp->toSiblingMC()); - } - v3fatalSrc("Failed to cast pairp to either MTaskEdge or SiblingMC in mergeCandidateScore"); - return 0; - } - - VL_ATTR_NOINLINE - static uint32_t siblingScore(const SiblingMC* sibsp) { - const LogicMTask* const ap = sibsp->ap(); - const LogicMTask* const bp = sibsp->bp(); - const uint32_t mergedCpCostFwd - = std::max(ap->critPathCost(GraphWay::FORWARD), bp->critPathCost(GraphWay::FORWARD)); - const uint32_t mergedCpCostRev - = std::max(ap->critPathCost(GraphWay::REVERSE), bp->critPathCost(GraphWay::REVERSE)); - return mergedCpCostRev + mergedCpCostFwd + LogicMTask::stepCost(ap->cost() + bp->cost()); - } - - VL_ATTR_NOINLINE - static uint32_t edgeScore(const V3GraphEdge* edgep) { - // Score this edge. Lower is better. The score is the new local CP - // length if we merge these mtasks. ("Local" means the longest - // critical path running through the merged node.) - const LogicMTask* const top = static_cast(edgep->top()); - const LogicMTask* const fromp = static_cast(edgep->fromp()); - const uint32_t mergedCpCostFwd - = std::max(fromp->critPathCost(GraphWay::FORWARD), - top->critPathCostWithout(GraphWay::FORWARD, edgep)); - const uint32_t mergedCpCostRev - = std::max(fromp->critPathCostWithout(GraphWay::REVERSE, edgep), - top->critPathCost(GraphWay::REVERSE)); - return mergedCpCostRev + mergedCpCostFwd - + LogicMTask::stepCost(fromp->cost() + top->cost()); - } - void makeSiblingMC(LogicMTask* ap, LogicMTask* bp) { - const SiblingMC newSibs(ap, bp); - const std::pair insertResult = m_pairs.insert(newSibs); - if (insertResult.second) { - const SiblingMC* const newSibsp = &(*insertResult.first); - m_mtask2sibs[ap].insert(newSibsp); - m_mtask2sibs[bp].insert(newSibsp); - m_sb.addElem(newSibsp); + if (ap->id() < bp->id()) std::swap(ap, bp); + // The higher id vertex owns the storage + const auto emplaceResult = ap->ownSibs().emplace(ap, bp); + if (emplaceResult.second) { + 
SiblingMC* const newSibsp = const_cast(&(*emplaceResult.first)); + bp->farSibs().insert(newSibsp); + m_sb.add(newSibsp); } else if (m_slowAsserts) { // It's fine if we already have this SiblingMC, we may have // created it earlier. Just confirm that we have associated data. - UASSERT_OBJ(m_mtask2sibs.find(ap) != m_mtask2sibs.end(), ap, "Sibling not found"); - UASSERT_OBJ(m_mtask2sibs.find(bp) != m_mtask2sibs.end(), bp, "Sibling not found"); bool found = false; - for (SibpSet::iterator it = m_mtask2sibs[ap].begin(); it != m_mtask2sibs[ap].end(); - ++it) { - const SiblingMC* const sibsp = *it; - UASSERT_OBJ(!(!sibsp->removedFromSb() && !m_sb.contains(sibsp)), ap, - "One sibling must be the one we collided with"); - if ((sibsp->ap() == ap && sibsp->bp() == bp) - || (sibsp->bp() == ap && sibsp->ap() == bp)) - found = true; + for (const SiblingMC& sibs : ap->ownSibs()) { + UASSERT_OBJ(sibs.ap() == ap, ap, "Inconsistent SiblingMC"); + UASSERT_OBJ(m_sb.contains(&sibs), ap, "Must be on the scoreboard"); + if (sibs.bp() == bp) found = true; } UASSERT_OBJ(found, ap, "Sibling not found"); } } - void siblingPairFromRelatives(GraphWay way, V3GraphVertex* mtaskp, bool exhaustive) { - std::vector shortestPrereqs; + template + void siblingPairFromRelatives(V3GraphVertex* mtaskp) { + constexpr GraphWay way{T_Way}; + // Need at least 2 edges + if (!mtaskp->beginp(way) || !mtaskp->beginp(way)->nextp(way)) return; - for (V3GraphEdge* edgep = mtaskp->beginp(way); edgep; edgep = edgep->nextp(way)) { - LogicMTask* const prereqp = static_cast(edgep->furtherp(way)); - shortestPrereqs.push_back(prereqp); - // Prevent nodes with huge numbers of edges from massively - // slowing down the partitioner: - if (shortestPrereqs.size() > PART_SIBLING_EDGE_LIMIT) break; + std::array neighbours; + + // This is a hot method, so we want to sort as efficiently as possible. We pre-load + // all data (critical path cost and id) required for determining ordering into an aligned + // structure. 
There is not enough space next to these to keep a whole pointer within 16 + // bytes, so we store an index into the neighbours buffer instead. We can then compare + // and swap these sorting records very efficiently. With this the standard library sorting + // functions are efficient enough and using more optimized methods (e.g.: sorting networks) + // has no measurable benefit. + struct alignas(16) SortingRecord { + uint64_t m_id; + uint32_t m_cp; + uint8_t m_idx; + static_assert(PART_SIBLING_EDGE_LIMIT <= std::numeric_limits::max(), + "m_idx must fit all indices into 'neighbours'"); + bool operator<(const SortingRecord& that) const { + return m_cp < that.m_cp || (m_cp == that.m_cp && m_id < that.m_id); + } + }; + static_assert(sizeof(SortingRecord) <= 16, "How could this be padded to more than 16?"); + + std::array sortRecs; + size_t n = 0; + + // Populate the buffers + for (V3GraphEdge *edgep = mtaskp->beginp(way), *nextp; edgep; edgep = nextp) { + nextp = edgep->nextp(way); // Fetch next first as likely cache miss + LogicMTask* const otherp = static_cast(edgep->furtherp(way)); + neighbours[n] = otherp; + sortRecs[n].m_id = otherp->id(); + sortRecs[n].m_cp = otherp->critPathCost(way) + otherp->cost(); + sortRecs[n].m_idx = n; + ++n; + // Prevent nodes with huge numbers of edges from massively slowing us down + if (n >= PART_SIBLING_EDGE_LIMIT) break; } - if (shortestPrereqs.size() <= 1) return; - - const auto cmp = [way](const LogicMTask* ap, const LogicMTask* bp) { - const uint32_t aCp = ap->critPathCost(way) + ap->cost(); - const uint32_t bCp = bp->critPathCost(way) + bp->cost(); - if (aCp != bCp) return aCp < bCp; - return ap->id() < bp->id(); - }; - - // Don't make all possible pairs of prereqs when not requested (non-exhaustive). + // Don't make all possible pairs of siblings when not requested (non-exhaustive). // Just make a few pairs. 
constexpr size_t MAX_NONEXHAUSTIVE_PAIRS = 3; - size_t end; // End index of pairs to add to candidates (exclusive) - - if (exhaustive || (shortestPrereqs.size() <= 2 * MAX_NONEXHAUSTIVE_PAIRS)) { - end = shortestPrereqs.size() & ~static_cast(1); // Round down to even - std::sort(shortestPrereqs.begin(), shortestPrereqs.end(), cmp); + if (Exhaustive || n <= 2 * MAX_NONEXHAUSTIVE_PAIRS) { + const size_t end = n & ~static_cast(1); // Round down to even, (we want pairs) + std::sort(sortRecs.begin(), sortRecs.begin() + n); + for (size_t i = 0; i < end; i += 2) { + makeSiblingMC(neighbours[sortRecs[i].m_idx], neighbours[sortRecs[i + 1].m_idx]); + } } else { - end = 2 * MAX_NONEXHAUSTIVE_PAIRS; - std::partial_sort(shortestPrereqs.begin(), shortestPrereqs.begin() + end, - shortestPrereqs.end(), cmp); - } - - for (size_t i = 0; i < end; i += 2) { - makeSiblingMC(shortestPrereqs[i], shortestPrereqs[i + 1]); + constexpr size_t end = 2 * MAX_NONEXHAUSTIVE_PAIRS; + std::partial_sort(sortRecs.begin(), sortRecs.begin() + end, sortRecs.begin() + n); + for (size_t i = 0; i < end; i += 2) { + makeSiblingMC(neighbours[sortRecs[i].m_idx], neighbours[sortRecs[i + 1].m_idx]); + } } } @@ -1850,17 +2001,15 @@ private: } // Move all vertices from donorp to mergedp mergedp->moveAllVerticesFrom(donorp); - // Redirect edges from donorp to recipientp - partRedirectEdgesFrom(mergedp, donorp, nullptr); - // Remove donorp from the graph - VL_DO_DANGLING(donorp->unlinkDelete(m_mtasksp), donorp); + // Redirect edges from donorp to recipientp, delete donorp + partRedirectEdgesFrom(m_mtasksp, mergedp, donorp, nullptr); ++m_mergesDone; } if (lastMergedp) { UASSERT_OBJ(lastMergedp->rank() < mergedp->rank(), mergedp, "Merging must be on lower rank"); - if (!lastMergedp->hasRelative(GraphWay::FORWARD, mergedp)) { + if (!lastMergedp->hasRelativeMTask(mergedp)) { new MTaskEdge(m_mtasksp, lastMergedp, mergedp, 1); } } @@ -2506,9 +2655,8 @@ void V3Partition::setupMTaskDeps(V3Graph* mtasksp, const 
Vx2MTaskMap* vx2mtaskp) UASSERT_OBJ(otherMTaskp != mtaskp, mtaskp, "Would create a cycle edge"); // Don't create redundant edges. - if (mtaskp->hasRelative(GraphWay::FORWARD, otherMTaskp)) { // - continue; - } + if (mtaskp->hasRelativeMTask(otherMTaskp)) continue; + new MTaskEdge(mtasksp, mtaskp, otherMTaskp, 1); } } diff --git a/src/V3Scoreboard.cpp b/src/V3Scoreboard.cpp index 78d466596..d21422a81 100644 --- a/src/V3Scoreboard.cpp +++ b/src/V3Scoreboard.cpp @@ -19,26 +19,42 @@ #include "V3Scoreboard.h" -class ScoreboardTestElem final { +class ScoreboardTestElem; + +struct Key { + // Note: Structure layout chosen to minimize padding in PairingHeap<*>::Node + uint64_t m_id; // Unique ID part of edge score + uint32_t m_score; // Score part of ID + bool operator<(const Key& other) const { + // First by Score then by ID, but notice that we want minimums using a max-heap, so reverse + return m_score > other.m_score || (m_score == other.m_score && m_id > other.m_id); + } +}; + +using Scoreboard = V3Scoreboard; + +class ScoreboardTestElem final : public Scoreboard::Node { public: - // MEMBERS - uint32_t m_score; - uint32_t m_id; + uint32_t m_newScore; // CONSTRUCTORS explicit ScoreboardTestElem(uint32_t score) - : m_score{score} { + : m_newScore{score} { + m_key.m_score = m_newScore; static uint32_t s_serial = 0; - m_id = ++s_serial; + m_key.m_id = ++s_serial; } ScoreboardTestElem() = default; - // METHODS - static uint32_t scoreFn(const ScoreboardTestElem* elp) { return elp->m_score; } - bool operator<(const ScoreboardTestElem& other) const { return m_id < other.m_id; } + uint64_t id() const { return m_key.m_id; } + void rescore() { m_key.m_score = m_newScore; } + uint32_t score() const { return m_key.m_score; } + static ScoreboardTestElem* heapNodeToElem(Scoreboard::Node* nodep) { + return static_cast(nodep); + } }; void V3ScoreboardBase::selfTest() { - V3Scoreboard sb(ScoreboardTestElem::scoreFn, true); + Scoreboard sb; UASSERT(!sb.needsRescore(), "SelfTest: Empty sb 
should not need rescore."); @@ -46,13 +62,13 @@ void V3ScoreboardBase::selfTest() { ScoreboardTestElem e2(20); ScoreboardTestElem e3(30); - sb.addElem(&e1); - sb.addElem(&e2); - sb.addElem(&e3); + sb.add(&e1); + sb.add(&e2); + sb.add(&e3); UASSERT(sb.needsRescore(), "SelfTest: Newly filled sb should need a rescore."); UASSERT(sb.needsRescore(&e1), "SelfTest: Individual newly-added element should need rescore"); - UASSERT(nullptr == sb.bestp(), + UASSERT(nullptr == sb.best(), "SelfTest: Newly filled sb should have nothing eligible for Bestp()"); sb.rescore(); @@ -60,24 +76,22 @@ void V3ScoreboardBase::selfTest() { UASSERT(!sb.needsRescore(), "SelfTest: Newly rescored sb should not need rescore"); UASSERT(!sb.needsRescore(&e1), "SelfTest: Newly rescored sb should not need an element rescored"); - UASSERT(e2.m_score == sb.cachedScore(&e2), - "SelfTest: Cached score should match current score"); - UASSERT(&e1 == sb.bestp(), "SelfTest: Should return element with lowest (best) score"); + UASSERT(&e1 == sb.best(), "SelfTest: Should return element with lowest (best) score"); // Change one element's score sb.hintScoreChanged(&e2); - e2.m_score = 21; + e2.m_newScore = 21; UASSERT(sb.needsRescore(&e2), "SelfTest: Should need rescore on elem after hintScoreChanged"); // Remove an element UASSERT(sb.contains(&e1), "SelfTest: e1 should be there"); - sb.removeElem(&e1); + sb.remove(&e1); UASSERT(!sb.contains(&e1), "SelfTest: e1 should be gone"); UASSERT(sb.contains(&e2), "SelfTest: e2 should be there, despite needing rescore"); // Now e3 should be our best-scoring element, even though // e2 has a better score, since e2 is pending rescore. 
- UASSERT(&e3 == sb.bestp(), "SelfTest: Expect e3 as best element with known score."); + UASSERT(&e3 == sb.best(), "SelfTest: Expect e3 as best element with known score."); sb.rescore(); - UASSERT(&e2 == sb.bestp(), "SelfTest: Expect e2 as best element again after Rescore"); + UASSERT(&e2 == sb.best(), "SelfTest: Expect e2 as best element again after Rescore"); } diff --git a/src/V3Scoreboard.h b/src/V3Scoreboard.h index dc5fce0b0..4bf915431 100644 --- a/src/V3Scoreboard.h +++ b/src/V3Scoreboard.h @@ -1,13 +1,6 @@ // -*- mode: C++; c-file-style: "cc-mode" -*- //************************************************************************* -// DESCRIPTION: Verilator: Scoreboards for thread partitioner -// -// Provides scoreboard classes: -// -// * SortByValueMap -// * V3Scoreboard -// -// See details below +// DESCRIPTION: Verilator: Scoreboard for mtask coarsening // // Code available from: https://verilator.org // @@ -28,248 +21,122 @@ #include "verilatedos.h" #include "V3Error.h" +#include "V3PairingHeap.h" -#include -#include -#include -#include +//=============================================================================================== +// V3Scoreboard is essentially a heap that can be hinted that some elements have changed keys, at +// which points those elements will be deferred as 'unknown' until the next 'rescore' call. We +// largely reuse the implementation of the slightly more generic PairingHeap, but we do rely on the +// internal structure of the PairingHeap so changing that class requires changing this. +// +// For efficiency, the elements themselves must be the heap nodes, by deriving them from +// V3Scoreboard::Node. This also means a single element can only be associated with +// a single scoreboard. -// ###################################################################### -// SortByValueMap - -// A generic key-value map, except iteration is in *value* sorted order. Values need not be unique. 
-// Uses T_KeyCompare to break ties in the sort when values collide. Note: Only const iteration is -// possible, as updating mapped values via iterators is not safe. - -template > -class SortByValueMap final { - // Current implementation is a std::set of key/value pairs, plus a std_unordered_map from keys - // to iterators into the set. This keeps most operations fairly cheap and also has the benefit - // of being able to re-use the std::set iterators. - - // TYPES - - using Pair = std::pair; - - struct PairCmp final { - bool operator()(const Pair& a, const Pair& b) const { - // First compare values - if (a.second != b.second) return a.second < b.second; - // Then compare keys - return T_KeyCompare{}(a.first, b.first); - } - }; - - using PairSet = std::set; - -public: - using const_iterator = typename PairSet::const_iterator; - using const_reverse_iterator = typename PairSet::const_reverse_iterator; - -private: - // MEMBERS - PairSet m_pairs; // The contents of the map, stored directly as key-value pairs - std::unordered_map m_kiMap; // Key to iterator map - - VL_UNCOPYABLE(SortByValueMap); - -public: - // CONSTRUCTORS - SortByValueMap() = default; - - // Only const iteration is possible - const_iterator begin() const { return m_pairs.begin(); } - const_iterator end() const { return m_pairs.end(); } - const_iterator cbegin() const { m_pairs.cbegin(); } - const_iterator cend() const { return m_pairs.cend(); } - const_reverse_iterator rbegin() const { return m_pairs.rbegin(); } - const_reverse_iterator rend() const { return m_pairs.rend(); } - const_reverse_iterator crbegin() const { return m_pairs.crbegin(); } - const_reverse_iterator crend() const { return m_pairs.crend(); } - - const_iterator find(const T_Key& key) const { - const auto kiIt = m_kiMap.find(key); - if (kiIt == m_kiMap.end()) return cend(); - return kiIt->second; - } - size_t erase(const T_Key& key) { - const auto kiIt = m_kiMap.find(key); - if (kiIt == m_kiMap.end()) return 0; - 
m_pairs.erase(kiIt->second); - m_kiMap.erase(kiIt); - return 1; - } - void erase(const_iterator it) { - m_kiMap.erase(it->first); - m_pairs.erase(it); - } - void erase(const_reverse_iterator rit) { - m_kiMap.erase(rit->first); - m_pairs.erase(std::next(rit).base()); - } - bool has(const T_Key& key) const { return m_kiMap.count(key); } - bool empty() const { return m_pairs.empty(); } - // Returns const reference. - const T_Value& at(const T_Key& key) const { return m_kiMap.at(key)->second; } - // Note this returns const_iterator - template - std::pair emplace(const T_Key& key, Args&&... args) { - const auto kiEmp = m_kiMap.emplace(key, end()); - if (kiEmp.second) { - const auto result = m_pairs.emplace(key, std::forward(args)...); -#if VL_DEBUG - UASSERT(result.second, "Should not be in set yet"); -#endif - kiEmp.first->second = result.first; - return result; - } - return {kiEmp.first->second, false}; - } - // Invalidates iterators - void update(const_iterator it, T_Value value) { - const auto kiIt = m_kiMap.find(it->first); - m_pairs.erase(it); - kiIt->second = m_pairs.emplace(kiIt->first, value).first; - } -}; - -//###################################################################### - -/// V3Scoreboard takes a set of Elem*'s, each having some score. -/// Scores are assigned by a user-supplied scoring function. -/// -/// At any time, the V3Scoreboard can return the elem with the "best" score -/// among those elements whose scores are known. -/// -/// The best score is the _lowest_ score. This makes sense in contexts -/// where scores represent costs. -/// -/// The Scoreboard supports mutating element scores efficiently. The client -/// must hint to the V3Scoreboard when an element's score may have -/// changed. When it receives this hint, the V3Scoreboard will move the -/// element into the set of elements whose scores are unknown. 
Later the -/// client can tell V3Scoreboard to re-sort the list, which it does -/// incrementally, by re-scoring all elements whose scores are unknown, and -/// then moving these back into the score-sorted map. This is efficient -/// when the subset of elements whose scores change is much smaller than -/// the full set size. - -template > +template class V3Scoreboard final { -private: // TYPES - class CmpElems final { - public: - bool operator()(const T_Elem* const& ap, const T_Elem* const& bp) const { - const T_ElemCompare cmp; - return cmp.operator()(*ap, *bp); - } - }; - using SortedMap = SortByValueMap; - using UserScoreFnp = T_Score (*)(const T_Elem*); + using Heap = PairingHeap; + +public: + using Node = typename Heap::Node; + +private: + using Link = typename Heap::Link; + + // Note: T_Elem is incomplete here, so we cannot assert 'std::is_base_of::value' // MEMBERS - // Below uses set<> not an unordered_set<>. unordered_set::clear() and - // construction results in a 491KB clear operation to zero all the - // buckets. Since the set size is generally small, and we iterate the - // set members, set is better performant. - std::set m_unknown; // Elements with unknown scores - SortedMap m_sorted; // Set of elements with known scores - const UserScoreFnp m_scoreFnp; // Scoring function - const bool m_slowAsserts; // Do some asserts that require extra lookups + Heap m_known; // The heap of entries with known scores + Link m_unknown; // List of entries with unknown scores public: // CONSTRUCTORS - explicit V3Scoreboard(UserScoreFnp scoreFnp, bool slowAsserts) - : m_scoreFnp{scoreFnp} - , m_slowAsserts{slowAsserts} {} + explicit V3Scoreboard() = default; ~V3Scoreboard() = default; - // METHODS - - // Add an element to the scoreboard. - // Element begins in needs-rescore state; it won't be returned by - // bestp() until after the next rescore(). 
- void addElem(const T_Elem* elp) { - if (m_slowAsserts) { - UASSERT(!contains(elp), "Adding element to scoreboard that was already in scoreboard"); - } - m_unknown.insert(elp); - } - - // Remove elp from scoreboard. - void removeElem(const T_Elem* elp) { - if (0 == m_sorted.erase(elp)) { - UASSERT(m_unknown.erase(elp), - "Could not find requested elem to remove from scoreboard"); - } - } - - // Returns true if elp is present in the scoreboard, false otherwise. - // - // Note: every other V3Scoreboard routine that takes an T_Elem* has - // undefined behavior if the element is not in the scoreboard. - bool contains(const T_Elem* elp) const { - if (m_unknown.find(elp) != m_unknown.end()) return true; - return (m_sorted.find(elp) != m_sorted.end()); - } - - // Get the best element, with the lowest score (lower is better), among - // elements whose scores are known. Returns nullptr if no elements with - // known scores exist. - // - // Note: This does not automatically rescore. Client must call - // rescore() periodically to ensure all elems in the scoreboard are - // reflected in the result of bestp(). Otherwise, bestp() only - // considers elements that aren't pending rescore. - const T_Elem* bestp() { - const auto it = m_sorted.begin(); - if (VL_UNLIKELY(it == m_sorted.end())) return nullptr; - return it->first; - } - - // Tell the scoreboard that this element's score may have changed. - // - // At the time of this call, the element's score becomes "unknown" - // to the V3Scoreboard. Unknown elements won't be returned by bestp(). - // The element's score will remain unknown until the next rescore(). - // - // The client MUST call this for each element whose score has changed. - // - // The client MAY call this for elements whose score has not changed. - // Doing so incurs some compute cost (to re-sort the element back to - // its original location) and still makes it ineligible to be returned - // by bestp() until the next rescore(). 
- void hintScoreChanged(const T_Elem* elp) { - m_unknown.insert(elp); - m_sorted.erase(elp); - } - - // True if any element's score is unknown to V3Scoreboard. - bool needsRescore() { return !m_unknown.empty(); } - // False if elp's score is known to V3Scoreboard, - // else true if elp's score is unknown until the next rescore(). - bool needsRescore(const T_Elem* elp) { return m_unknown.count(elp); } - // Retrieve the last known score for an element. - T_Score cachedScore(const T_Elem* elp) { return m_sorted.at(elp); } - // For each element whose score is unknown to V3Scoreboard, - // call the client's scoring function to get a new score, - // and sort all elements by their current score. - void rescore() { - for (const T_Elem* elp : m_unknown) { - VL_ATTR_UNUSED const bool exists = !m_sorted.emplace(elp, m_scoreFnp(elp)).second; -#if VL_DEBUG - UASSERT(!exists, "Should not be in both m_unknown and m_sorted"); -#endif - } - m_unknown.clear(); - } - private: VL_UNCOPYABLE(V3Scoreboard); + + // METHODS + void addUnknown(T_Elem* nodep) { + // Just prepend it to the list of unknown entries + nodep->m_next.link(m_unknown.unlink()); + m_unknown.linkNonNull(nodep); + // We mark nodes on the unknown list by making their child pointer point to themselves + nodep->m_kids.m_ptr = nodep; + } + +public: + // Returns true if the element is present in the scoreboard, false otherwise. Every other + // method that takes a T_Elem* (except for 'add') has undefined behavior if the element is not + // in this scoreboard. Furthermore, this method is only valid if the element can only possibly + // be in this scoreboard. That is: if the element might be in another scoreboard, the behaviour + // of this method is undefined. + static bool contains(const T_Elem* nodep) { return nodep->m_ownerpp; } + + // Add an element to the scoreboard. This will not be returned before the next 'rescore' call. 
+ void add(T_Elem* nodep) { +#if VL_DEBUG + UASSERT(!contains(nodep), "Adding element to scoreboard that was already in a scoreboard"); +#endif + addUnknown(nodep); + } + + // Remove element from scoreboard. + void remove(T_Elem* nodep) { + if (nodep->m_kids.m_ptr == nodep) { + // Node is on the unknown list, replace with next + nodep->replaceWith(nodep->m_next.unlink()); + return; + } + // Node is in the known heap, remove it + m_known.remove(nodep); + } + + // Get the known element with the highest score (as we are using a max-heap), or nullptr if + // there are no elements with known scores. This does not automatically 'rescore'. The client + // must call 'rescore' appropriately to ensure all elements in the scoreboard are reflected in + // the result of this method. + T_Elem* best() const { return T_Elem::heapNodeToElem(m_known.max()); } + + // Tell the scoreboard that this element's score may have changed. At the time of this call, + // the element's score becomes 'unknown' to the scoreboard. Unknown elements will not be + // returned by 'best' until the next call to 'rescore'. + void hintScoreChanged(T_Elem* nodep) { + // If it's already in the unknown list, then nothing to do + if (nodep->m_kids.m_ptr == nodep) return; + // Otherwise it was in the heap, remove it + m_known.remove(nodep); + // Prepend it to the unknown list + addUnknown(nodep); + } + + // True if we have elements with unknown score + bool needsRescore() const { return m_unknown; } + + // True if the element's score is unknown, false otherwise. 
+ static bool needsRescore(const T_Elem* nodep) { return nodep->m_kids.m_ptr == nodep; } + + // For each element whose score is unknown, recompute the score and add to the known heap + void rescore() { + // Rescore and insert all unknown elements + for (Node *nodep = m_unknown.unlink(), *nextp; nodep; nodep = nextp) { + // Pick up next + nextp = nodep->m_next.ptr(); + // Reset pointers + nodep->m_next.m_ptr = nullptr; + nodep->m_kids.m_ptr = nullptr; + nodep->m_ownerpp = nullptr; + // Re-compute the score of the element + T_Elem::heapNodeToElem(nodep)->rescore(); + // re-insert into the heap + m_known.insert(nodep); + } + } }; -//###################################################################### +// ###################################################################### namespace V3ScoreboardBase { void selfTest();