From 9ac64d0b929cff7231c736ded0be906191b6a777 Mon Sep 17 00:00:00 2001
From: Geza Lore <gezalore@gmail.com>
Date: Sun, 7 Aug 2022 14:11:58 +0100
Subject: [PATCH] Improve performance of MTask coarsening

Various optimizations to speed up MTask coarsening (which is the long
pole in the multi-threaded scheduling of very large designs).

The biggest impact ones:
- Use efficient hand written Pairing Heaps for implementing priority
  queues and the scoreboard, instead of the old SortByValueMap. This
  helps us avoid having to sort a lot of merge candidates that we will
  never actually consider and helps a lot in performance.
- Remove unnecessary associative containers and store data structures
  (the heap nodes in particular) directly in the object they relate to.
  This eliminates a huge amount of lookups and helps a lot in
  performance.
- Distribute storage for SiblingMC instances into the LogicMTask
  instances, and combine with the sibling maps. This again eliminates
  hash table lookups and makes storage structures smaller.
- Remove some now bidirectional edge maps, keep only the forward map.

There are also some other smaller optimizations:
- Replaced more unnecessary dynamic_casts with static_casts
- Templated some functions/classes to reduce the number of static
  branches in loops.
- Improve sorting of edges for sibling candidate creation
- Various micro-optimizations here and there

This speeds up MTask coarsening by 3.8x on a large design, which
translates to a 2.5x speedup of the ordering pass in multi-threaded
mode. (Combined with the earlier optimizations, ordering is now 3x
faster.)

Due to the elimination of a lot of the auxiliary data structures, and
ensuring a minimal size for the necessary ones, memory consumption of
the MTask coarsening is also reduced (measured up to 4.4x reduction
though the accuracy of this is low).

The algorithm is identical except for minor alterations of the order in
which some candidates are added or removed. This can cause perturbation
in the output due to tied scores being broken based on IDs.
---
 Changes               |    1 +
 include/verilatedos.h |    7 +
 src/V3Graph.h         |    2 +-
 src/V3PairingHeap.h   |  303 ++++++++++++
 src/V3Partition.cpp   | 1044 +++++++++++++++++++++++------------------
 src/V3Scoreboard.cpp  |   56 ++-
 src/V3Scoreboard.h    |  337 ++++---------
 7 files changed, 1045 insertions(+), 705 deletions(-)
 create mode 100644 src/V3PairingHeap.h

diff --git a/Changes b/Changes
index 752f4c15c..ff4509bcc 100644
--- a/Changes
+++ b/Changes
@@ -21,6 +21,7 @@ Verilator 4.225 devel
 * Fix incorrect tristate logic (#3399) [shareefj, Vighnesh Iyer]
 * Fix segfault exporting non-existant package (#3535).
 * Fix case statement comparing string literal (#3544). [Gustav Svensk]
+* Improve Verilation speed with --threads on large designs. [Geza Lore]
 
 
 Verilator 4.224 2022-06-19
diff --git a/include/verilatedos.h b/include/verilatedos.h
index c89b4c6dc..12763f815 100644
--- a/include/verilatedos.h
+++ b/include/verilatedos.h
@@ -530,6 +530,13 @@ using ssize_t = uint32_t;  ///< signed size_t; returned from read()
 #define VL_STRINGIFY(x) VL_STRINGIFY2(x)
 #define VL_STRINGIFY2(x) #x
 
+//=========================================================================
+// Offset of field in type
+
+// Address zero can cause compiler problems
+#define VL_OFFSETOF(type, field) \
+    (reinterpret_cast<size_t>(&(reinterpret_cast<type*>(0x10000000)->field)) - 0x10000000)
+
 //=========================================================================
 // Conversions
 
diff --git a/src/V3Graph.h b/src/V3Graph.h
index da096ab2f..a18fb5dfc 100644
--- a/src/V3Graph.h
+++ b/src/V3Graph.h
@@ -67,7 +67,7 @@ public:
         return names[m_e];
     }
     // METHODS unique to this class
-    constexpr GraphWay invert() const { return m_e == FORWARD ? REVERSE : FORWARD; }
+    constexpr GraphWay invert() const { return GraphWay{m_e ^ 1}; }
     constexpr bool forward() const { return m_e == FORWARD; }
     constexpr bool reverse() const { return m_e != FORWARD; }
 };
diff --git a/src/V3PairingHeap.h b/src/V3PairingHeap.h
new file mode 100644
index 000000000..9904225f3
--- /dev/null
+++ b/src/V3PairingHeap.h
@@ -0,0 +1,303 @@
+// -*- mode: C++; c-file-style: "cc-mode" -*-
+//*************************************************************************
+// DESCRIPTION: Verilator: Pairing Heap data structure
+//
+// Code available from: https://verilator.org
+//
+//*************************************************************************
+//
+// Copyright 2003-2022 by Wilson Snyder. This program is free software; you
+// can redistribute it and/or modify it under the terms of either the GNU
+// Lesser General Public License Version 3 or the Perl Artistic License
+// Version 2.0.
+// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+//
+//*************************************************************************
+
+#ifndef VERILATOR_V3PAIRINGHEAP_H_
+#define VERILATOR_V3PAIRINGHEAP_H_
+
+#include "config_build.h"
+#include "verilatedos.h"
+
+#include "V3Error.h"
+
+//=============================================================================
+// Pairing heap (max-heap) with increase key and delete.
+//
+// While this is written as a generic data structure, its interface and
+// implementation are finely tuned for its use by V3Partition, and are critical
+// to Verilation performance, so be very careful changing anything or adding any
+// new operations that would impact either memory usage, or performance of the
+// existing operations. This data structure is fully deterministic, meaning
+// the order in which elements with equal keys are retrieved only depends on
+// the order of operations performed on the heap.
+//=============================================================================
+
+template <typename T_Key>
+class PairingHeap final {
+public:
+    struct Node;
+
+    // Just a pointer to a heap Node, but with special accessors to help keep back pointers
+    // consistent.
+    struct Link {
+        Node* m_ptr = nullptr;  // The managed pointer
+
+        Link() = default;
+        VL_UNCOPYABLE(Link);
+
+        // Make the pointer point to the target, and the target's owner pointer to this pointer
+        VL_ATTR_ALWINLINE void link(Node* targetp) {
+            m_ptr = targetp;
+            if (!targetp) return;
+#if VL_DEBUG
+            UASSERT(!targetp->m_ownerpp, "Already linked");
+#endif
+            targetp->m_ownerpp = &m_ptr;
+        }
+
+        // Same as 'link', but the target must not be null (there is no null check here)
+        VL_ATTR_ALWINLINE void linkNonNull(Node* targetp) {
+            m_ptr = targetp;
+#if VL_DEBUG
+            UASSERT(!targetp->m_ownerpp, "Already linked");
+#endif
+            targetp->m_ownerpp = &m_ptr;
+        }
+
+        // Clear the pointer and return its previous value
+        VL_ATTR_ALWINLINE Node* unlink() {
+            Node* const result = m_ptr;
+#if VL_DEBUG
+            if (result) {
+                UASSERT(m_ptr->m_ownerpp == &m_ptr, "Bad back link");
+                // Not strictly necessary to clear this, but helps debugging
+                m_ptr->m_ownerpp = nullptr;
+            }
+#endif
+            m_ptr = nullptr;
+            return result;
+        }
+
+        // Minimal convenience accessors and operators
+        VL_ATTR_ALWINLINE Node* ptr() const { return m_ptr; }
+        VL_ATTR_ALWINLINE operator bool() const { return m_ptr; }
+        VL_ATTR_ALWINLINE bool operator!() const { return !m_ptr; }
+        VL_ATTR_ALWINLINE Node* operator->() const { return m_ptr; }
+        VL_ATTR_ALWINLINE Node& operator*() const { return *m_ptr; }
+    };
+
+    // A single node in the pairing heap tree
+    struct Node {
+        Link m_next;  // Next in list of sibling heaps
+        Link m_kids;  // Head of list of child heaps
+        Node** m_ownerpp = nullptr;  // Pointer to the Link pointer pointing to this heap
+        T_Key m_key;  // The key in the heap
+
+        // CONSTRUCTOR
+        explicit Node() = default;
+        VL_UNCOPYABLE(Node);
+
+        // METHODS
+        VL_ATTR_ALWINLINE const T_Key& key() const { return m_key; }
+        VL_ATTR_ALWINLINE bool operator<(const Node& that) const { return m_key < that.m_key; }
+        VL_ATTR_ALWINLINE bool operator>(const Node& that) const { return that.m_key < m_key; }
+
+        // Make newp take the place of this in the tree
+        VL_ATTR_ALWINLINE void replaceWith(Node* newp) {
+            *m_ownerpp = newp;  // The owner pointer needs to point to the new node
+            if (newp) newp->m_ownerpp = m_ownerpp;  // The new node needs to point to its owner
+            m_ownerpp = nullptr;  // This node has no owner anymore
+        }
+
+        // Make newp take the place of this in the tree (newp must not be null)
+        VL_ATTR_ALWINLINE void replaceWithNonNull(Node* newp) {
+            *m_ownerpp = newp;  // The owner pointer needs to point to the new node
+            newp->m_ownerpp = m_ownerpp;  // The new node needs to point to its owner
+            m_ownerpp = nullptr;  // This node has no owner anymore
+        }
+
+        // Yank this node out of the heap it currently is in. This node can then be safely inserted
+        // into another heap. Note that this leaves the heap the node is currently under in an
+        // inconsistent state, so you cannot access it anymore. Still this can save a remove if we
+        // don't care about the state of the source heap.
+        VL_ATTR_ALWINLINE void yank() {
+            m_next.link(nullptr);
+            m_kids.link(nullptr);
+            m_ownerpp = nullptr;
+        }
+    };
+
+private:
+    // MEMBERS
+
+    // The root of the heap. Note: We do not reduce lists during insertion/removal etc, unless we
+    // absolutely have to. This means the root can become a list. This is ok, we will reduce
+    // lazily when requesting the maximum element.
+    mutable Link m_root;
+
+    // CONSTRUCTORS
+    VL_UNCOPYABLE(PairingHeap);
+
+public:
+    explicit PairingHeap() = default;
+
+    // METHODS
+    bool empty() const { return !m_root; }
+
+    // Insert given node into this heap with given key.
+    void insert(Node* nodep, T_Key key) {
+        // Update key of node
+        nodep->m_key = key;
+        insert(nodep);
+    }
+
+    // Insert given node into this heap with key already set in the node
+    void insert(Node* nodep) {
+#if VL_DEBUG
+        UASSERT(!nodep->m_ownerpp && !nodep->m_next && !nodep->m_kids, "Already linked");
+#endif
+        // Just stick it at the front of the root list
+        nodep->m_next.link(m_root.unlink());
+        m_root.linkNonNull(nodep);
+    }
+
+    // Remove given node only from the heap it is contained in
+    void remove(Node* nodep) {
+        if (!nodep->m_next) {
+            // If the node does not have siblings, replace it with its children (might be empty).
+            nodep->replaceWith(nodep->m_kids.unlink());
+        } else if (!nodep->m_kids) {
+            // If it has siblings but no children, replace it with the siblings.
+            nodep->replaceWithNonNull(nodep->m_next.unlink());
+        } else {
+            // If it has both siblings and children, reduce the children and splice that
+            // reduced heap in place of this node
+            Node* const reducedKidsp = reduce(nodep->m_kids.unlink());
+            reducedKidsp->m_next.linkNonNull(nodep->m_next.unlink());
+            nodep->replaceWithNonNull(reducedKidsp);
+        }
+    }
+
+    // Returns the largest element in the heap
+    Node* max() const {
+        // Heap might be empty
+        if (!m_root) return nullptr;
+        // If the root has siblings, reduce them
+        if (m_root->m_next) m_root.linkNonNull(reduce(m_root.unlink()));
+        // The root element is the largest
+        return m_root.ptr();
+    }
+
+    // Returns the second-largest element in the heap.
+    // This is only valid to call if 'max' returned a valid element.
+    Node* secondMax() const {
+#if VL_DEBUG
+        UASSERT(m_root, "'max' would have returned nullptr");
+        UASSERT(!m_root->m_next, "'max' would have reduced");
+#endif
+        // If there are no children, there is no second element
+        if (!m_root->m_kids) return nullptr;
+        // If there are multiple children, reduce them
+        if (m_root->m_kids->m_next) m_root->m_kids.linkNonNull(reduce(m_root->m_kids.unlink()));
+        // Return the now singular child, which is the second-largest element
+        return m_root->m_kids.ptr();
+    }
+
+    // Increase the key of the given node to the given new value
+    template <typename T_Update>
+    void increaseKey(Node* nodep, T_Update value) {
+        // Update the key
+        nodep->m_key.increase(value);
+        // Increasing the key of the root is easy
+        if (nodep == m_root.ptr()) return;
+        // Otherwise we do have a little work to do
+        if (!nodep->m_kids) {
+            // If the node has no children, replace it with its siblings (might be null)
+            nodep->replaceWith(nodep->m_next.unlink());
+        } else if (!nodep->m_next) {
+            // If the node has no siblings, replace it with its children
+            nodep->replaceWithNonNull(nodep->m_kids.unlink());
+        } else {
+            // The node has both children and siblings. Splice the first child in the place of the
+            // node, and extract the rest of the children with the node
+            Node* const kidsp = nodep->m_kids.unlink();
+            nodep->m_kids.link(kidsp->m_next.unlink());
+            kidsp->m_next.linkNonNull(nodep->m_next.unlink());
+            nodep->replaceWithNonNull(kidsp);
+        }
+        // Just stick the increased node at the front of the root list
+        nodep->m_next.linkNonNull(m_root.unlink());
+        m_root.linkNonNull(nodep);
+    }
+
+private:
+    // Meld (merge) two heaps rooted at the given nodes, return the root of the new heap
+    VL_ATTR_ALWINLINE static Node* merge(Node* ap, Node* bp) {
+#if VL_DEBUG
+        UASSERT(!ap->m_ownerpp && !ap->m_next, "Not root a");
+        UASSERT(!bp->m_ownerpp && !bp->m_next, "Not root b");
+#endif
+        if (*ap > *bp) {  // bp goes under ap
+            bp->m_next.link(ap->m_kids.unlink());
+            ap->m_kids.linkNonNull(bp);
+            return ap;
+        } else {  // ap goes under bp
+            ap->m_next.link(bp->m_kids.unlink());
+            bp->m_kids.linkNonNull(ap);
+            return bp;
+        }
+    }
+
+    // Reduces the list of nodes starting at the given node into a single node that is returned
+    VL_ATTR_NOINLINE static Node* reduce(Node* nodep) {
+#if VL_DEBUG
+        UASSERT(!nodep->m_ownerpp, "Node is linked");
+#endif
+        // If there is only one node in the list, then there is nothing to do
+        if (!nodep->m_next) return nodep;
+        // The result node
+        Node* resultp = nullptr;
+        // Pairwise merge the child nodes
+        while (nodep) {
+            // Pop off the first node
+            Node* const ap = nodep;
+            // If we have an odd number of nodes, prepend the unpaired one onto the result list
+            if (!nodep->m_next) {
+                ap->m_next.link(resultp);
+                resultp = ap;
+                break;
+            }
+            // Pop off the second node
+            Node* const bp = nodep->m_next.unlink();
+            // Keep hold of the rest of the list
+            nodep = bp->m_next.unlink();
+            // Merge the current pair
+            Node* const mergedp = merge(ap, bp);
+            // Prepend the merged pair to the result list
+            mergedp->m_next.link(resultp);
+            resultp = mergedp;
+        }
+        // Now merge-reduce the merged pairs
+        while (resultp->m_next) {
+            // Pop first two results
+            Node* const ap = resultp;
+            Node* const bp = resultp->m_next.unlink();
+            // Keep hold of the rest of the list
+            resultp = bp->m_next.unlink();
+            // Merge the current pair
+            Node* const mergedp = merge(ap, bp);
+            // Prepend the merged pair to the result list
+            mergedp->m_next.link(resultp);
+            resultp = mergedp;
+        }
+        // Done
+        return resultp;
+    }
+};
+
+// The PairingHeap itself should be a simple pointer and nothing more
+static_assert(sizeof(PairingHeap<int>) == sizeof(PairingHeap<int>::Node*), "Should be a pointer");
+
+#endif  // Guard
diff --git a/src/V3Partition.cpp b/src/V3Partition.cpp
index 5b1474e91..1b11a00b5 100644
--- a/src/V3Partition.cpp
+++ b/src/V3Partition.cpp
@@ -22,23 +22,29 @@
 #include "V3Config.h"
 #include "V3EmitCBase.h"
 #include "V3File.h"
-#include "V3GraphAlg.h"
 #include "V3GraphStream.h"
 #include "V3InstrCount.h"
 #include "V3Os.h"
+#include "V3PairingHeap.h"
 #include "V3PartitionGraph.h"
 #include "V3Scoreboard.h"
 #include "V3Stats.h"
 #include "V3UniqueNames.h"
 
 #include <algorithm>
+#include <array>
 #include <list>
 #include <memory>
+#include <unordered_map>
 #include <unordered_set>
+#include <vector>
 
+class LogicMTask;
+class MTaskEdge;
 class MergeCandidate;
+class SiblingMC;
 
-//######################################################################
+// ######################################################################
 // Partitioner tunable settings:
 //
 // Before describing these settings, a bit of background:
@@ -70,14 +76,14 @@ class MergeCandidate;
 // skipping the enumeration of some siblings on a few vertices does not
 // have a large impact on the result of the partitioner.
 //
-// If your vertices are small, the limit (at 25) approaches a no-op.  Hence
+// If your vertices are small, the limit (at 26) approaches a no-op.  Hence
 // there's basically no cost to applying this limit even when we don't
 // expect huge vertices.
 //
 // If you don't care about partitioner runtime and you want the most
 // aggressive partition, set the limit very high.  If you have huge
 // vertices, leave this as is.
-constexpr unsigned PART_SIBLING_EDGE_LIMIT = 25;
+constexpr unsigned PART_SIBLING_EDGE_LIMIT = 26;
 
 //   PART_STEPPED_COST (defined/undef)
 //
@@ -143,10 +149,34 @@ static void partCheckCachedScoreVsActual(uint32_t cached, uint32_t actual) {
 #endif
 }
 
-//######################################################################
+//=============================================================================
+// We keep MTaskEdge graph edges in a PairingHeap, sorted by score and id
+
+struct EdgeKey {
+    // Note: Structure layout chosen to minimize padding in PairingHeap<*>::Node
+    uint64_t m_id;  // Unique ID part of edge score
+    uint32_t m_score;  // Score part of ID
+    void increase(uint32_t score) {
+#if VL_DEBUG
+        UASSERT(score >= m_score, "Must increase");
+#endif
+        m_score = score;
+    }
+    bool operator<(const EdgeKey& other) const {
+        // First by Score then by ID
+        return m_score < other.m_score || (m_score == other.m_score && m_id < other.m_id);
+    }
+};
+
+using EdgeHeap = PairingHeap<EdgeKey>;
+
+//=============================================================================
 // LogicMTask
 
 class LogicMTask final : public AbstractLogicMTask {
+    template <GraphWay::en T_Way>
+    friend class PartPropagateCp;
+
 public:
     // TYPES
     using VxList = std::list<MTaskMoveVertex*>;
@@ -157,55 +187,6 @@ public:
         }
     };
 
-    // This adaptor class allows the PartPropagateCp class to be somewhat
-    // independent of the LogicMTask class
-    //  - PartPropagateCp can thus be declared before LogicMTask
-    //  - PartPropagateCp could be reused with graphs of other node types
-    //    in the future, using another Accessor adaptor.
-    class CpCostAccessor final {
-    public:
-        CpCostAccessor() = default;
-        ~CpCostAccessor() = default;
-        // Return cost of this node
-        uint32_t cost(const V3GraphVertex* vxp) const {
-            const LogicMTask* const mtaskp = static_cast<const LogicMTask*>(vxp);
-            return mtaskp->stepCost();
-        }
-        // Return stored CP to this node
-        uint32_t critPathCost(const V3GraphVertex* vxp, GraphWay way) const {
-            const LogicMTask* const mtaskp = static_cast<const LogicMTask*>(vxp);
-            return mtaskp->critPathCost(way);
-        }
-        // Store a new CP to this node
-        void setCritPathCost(V3GraphVertex* vxp, GraphWay way, uint32_t cost) const {
-            LogicMTask* const mtaskp = static_cast<LogicMTask*>(vxp);
-            mtaskp->setCritPathCost(way, cost);
-        }
-        // Notify vxp that the wayward CP at the throughp-->vxp edge
-        // has increased to 'cp'. (vxp is wayward from throughp.)
-        // This is our cue to update vxp's m_edges[!way][throughp].
-        void notifyEdgeCp(V3GraphVertex* vxp, GraphWay way, V3GraphVertex* throuvhVxp,
-                          uint32_t cp) const {
-            LogicMTask* const updateVxp = static_cast<LogicMTask*>(vxp);
-            LogicMTask* const lthrouvhVxp = static_cast<LogicMTask*>(throuvhVxp);
-            EdgeSet& edges = updateVxp->m_edges[way.invert()];
-            const auto it = edges.find(lthrouvhVxp);
-            if (cp > it->second) edges.update(it, cp);
-        }
-        // Check that CP matches that of the longest edge wayward of vxp.
-        void checkNewCpVersusEdges(V3GraphVertex* vxp, GraphWay way, uint32_t cp) const {
-            LogicMTask* const mtaskp = static_cast<LogicMTask*>(vxp);
-            const EdgeSet& edges = mtaskp->m_edges[way.invert()];
-            // This is mtaskp's relative with longest !wayward inclusive CP:
-            const auto edgeIt = edges.rbegin();
-            const uint32_t edgeCp = edgeIt->second;
-            UASSERT_OBJ(edgeCp == cp, vxp, "CP doesn't match longest wayward edge");
-        }
-
-    private:
-        VL_UNCOPYABLE(CpCostAccessor);
-    };
-
 private:
     // MEMBERS
 
@@ -231,21 +212,21 @@ private:
     // while searching for a path.
     uint64_t m_generation = 0;
 
-    // Redundant with the V3GraphEdge's, store a map of relatives so we can
-    // quickly check if we have a given parent or child.
-    //
-    // 'm_edges[way]' maps a wayward relative to the !way critical path at
-    // our edge with them. The SortByValueMap supports iterating over
-    // relatives in longest-to-shortest CP order.  We rely on this ordering
-    // in more than one place.
-    using EdgeSet = SortByValueMap<LogicMTask*, uint32_t, CmpLogicMTask>;
-    std::array<EdgeSet, GraphWay::NUM_WAYS> m_edges;
+    // Store a set of forward relatives so we can quickly check if we have a given child
+    std::unordered_set<LogicMTask*> m_edgeSet;
+    // Store the outgoing and incoming edges in a heap sorted by the critical path length
+    std::array<EdgeHeap, GraphWay::NUM_WAYS> m_edgeHeap;
+
+    // SiblingMC for which storage is owned by this MTask
+    std::set<SiblingMC> m_ownSibs;
+    // SiblingMC for which storage is owned by the opposite MTask
+    std::set<const SiblingMC*> m_farSibps;
 
 public:
     // CONSTRUCTORS
     LogicMTask(V3Graph* graphp, MTaskMoveVertex* mtmvVxp)
         : AbstractLogicMTask{graphp} {
-        for (unsigned int& i : m_critPathCost) i = 0;
+        for (uint32_t& item : m_critPathCost) item = 0;
         if (mtmvVxp) {  // Else null for test
             m_vertices.push_back(mtmvVxp);
             if (const OrderLogicVertex* const olvp = mtmvVxp->logicp()) {
@@ -259,6 +240,9 @@ public:
     }
 
     // METHODS
+    std::set<SiblingMC>& ownSibs() { return m_ownSibs; };
+    std::set<const SiblingMC*>& farSibs() { return m_farSibps; };
+
     void moveAllVerticesFrom(LogicMTask* otherp) {
         // splice() is constant time
         m_vertices.splice(m_vertices.end(), otherp->m_vertices);
@@ -296,32 +280,39 @@ public:
         logcost = logcost / 20.0;
 
         const uint32_t stepCost = static_cast<uint32_t>(exp(logcost));
+#if VL_DEBUG
         UASSERT_STATIC(stepCost >= cost, "stepped cost error exceeded");
         UASSERT_STATIC(stepCost <= ((cost * 11 / 10)), "stepped cost error exceeded");
+#endif
         return stepCost;
 #else
         return cost;
 #endif
     }
 
-    void addRelative(GraphWay way, LogicMTask* relativep) {
-        // value is !way cp to this edge
-        const uint32_t cp = relativep->stepCost() + relativep->critPathCost(way.invert());
-        VL_ATTR_UNUSED const bool exits = !m_edges[way].emplace(relativep, cp).second;
+    template <GraphWay::en T_Way>
+    void addRelativeEdge(MTaskEdge* edgep);
+    template <GraphWay::en T_Way>
+    void stealRelativeEdge(MTaskEdge* edgep);
+    template <GraphWay::en T_Way>
+    void removeRelativeEdge(MTaskEdge* edgep);
+
+    void addRelativeMTask(LogicMTask* relativep) {
+        // Add the relative to connecting edge map
+        VL_ATTR_UNUSED const bool exits = !m_edgeSet.emplace(relativep).second;
 #if VL_DEBUG
-        UASSERT(!exits, "Adding existing edge");
+        UASSERT(!exits, "Adding existing relative");
 #endif
     }
-    void removeRelative(GraphWay way, LogicMTask* relativep) { m_edges[way].erase(relativep); }
-    bool hasRelative(GraphWay way, LogicMTask* relativep) { return m_edges[way].has(relativep); }
-    void checkRelativesCp(GraphWay way) const {
-        for (const auto& edge : vlstd::reverse_view(m_edges[way])) {
-            const LogicMTask* const relativep = edge.first;
-            const uint32_t cachedCp = edge.second;
-            const uint32_t cp = relativep->critPathCost(way.invert()) + relativep->stepCost();
-            partCheckCachedScoreVsActual(cachedCp, cp);
-        }
+    void removeRelativeMTask(LogicMTask* relativep) {
+        VL_ATTR_UNUSED const size_t removed = m_edgeSet.erase(relativep);
+#if VL_DEBUG
+        UASSERT(removed, "Relative should have been in set");
+#endif
     }
+    bool hasRelativeMTask(LogicMTask* relativep) const { return m_edgeSet.count(relativep); }
+
+    void checkRelativesCp(GraphWay way) const;
 
     virtual string name() const override {
         // Display forward and reverse critical path costs. This gives a quick
@@ -334,27 +325,7 @@ public:
 
     void setCritPathCost(GraphWay way, uint32_t cost) { m_critPathCost[way] = cost; }
     uint32_t critPathCost(GraphWay way) const { return m_critPathCost[way]; }
-    uint32_t critPathCostWithout(GraphWay way, const V3GraphEdge* withoutp) const {
-        // Compute the critical path cost wayward to this node, without
-        // considering edge 'withoutp'
-        UASSERT(this == withoutp->furtherp(way), "In critPathCostWithout(), edge 'withoutp' must "
-                                                 "further to 'this'");
-
-        // Iterate through edges until we get a relative other than
-        // wayEdgeEndp(way, withoutp). This should take 2 iterations max.
-        const EdgeSet& edges = m_edges[way.invert()];
-        uint32_t result = 0;
-        for (const auto& edge : vlstd::reverse_view(edges)) {
-            if (edge.first != withoutp->furtherp(way.invert())) {
-                // Use the cached cost. It could be a small overestimate
-                // due to stepping. This is consistent with critPathCost()
-                // which also returns the cached cost.
-                result = edge.second;
-                break;
-            }
-        }
-        return result;
-    }
+    uint32_t critPathCostWithout(GraphWay way, const V3GraphEdge* withoutp) const;
 
 private:
     static bool pathExistsFromInternal(LogicMTask* fromp, LogicMTask* top,
@@ -411,65 +382,7 @@ public:
         return pathExistsFromInternal(fromp, top, excludedEdgep, incGeneration());
     }
 
-    static void dumpCpFilePrefixed(const V3Graph* graphp, const string& nameComment) {
-        const string filename = v3Global.debugFilename(nameComment) + ".txt";
-        UINFO(1, "Writing " << filename << endl);
-        const std::unique_ptr<std::ofstream> ofp{V3File::new_ofstream(filename)};
-        std::ostream* const osp = &(*ofp);  // &* needed to deref unique_ptr
-        if (osp->fail()) v3fatalStatic("Can't write " << filename);
-
-        // Find start vertex with longest CP
-        const LogicMTask* startp = nullptr;
-        for (const V3GraphVertex* vxp = graphp->verticesBeginp(); vxp;
-             vxp = vxp->verticesNextp()) {
-            const LogicMTask* const mtaskp = static_cast<const LogicMTask*>(vxp);
-            if (!startp) {
-                startp = mtaskp;
-                continue;
-            }
-            if (mtaskp->cost() + mtaskp->critPathCost(GraphWay::REVERSE)
-                > startp->cost() + startp->critPathCost(GraphWay::REVERSE)) {
-                startp = mtaskp;
-            }
-        }
-
-        // Follow the entire critical path
-        std::vector<const LogicMTask*> path;
-        uint32_t totalCost = 0;
-        for (const LogicMTask* nextp = startp; nextp;) {
-            path.push_back(nextp);
-            totalCost += nextp->cost();
-
-            const EdgeSet& children = nextp->m_edges[GraphWay::FORWARD];
-            const EdgeSet::const_reverse_iterator it = children.rbegin();
-            if (it == children.rend()) {
-                nextp = nullptr;
-            } else {
-                nextp = it->first;
-            }
-        }
-
-        *osp << "totalCost = " << totalCost
-             << " (should match the computed critical path cost (CP) for the graph)\n";
-
-        // Dump
-        for (const LogicMTask* mtaskp : path) {
-            *osp << "begin mtask with cost " << mtaskp->cost() << '\n';
-            for (VxList::const_iterator lit = mtaskp->vertexListp()->begin();
-                 lit != mtaskp->vertexListp()->end(); ++lit) {
-                const OrderLogicVertex* const logicp = (*lit)->logicp();
-                if (!logicp) continue;
-                if (false) {
-                    // Show nodes only
-                    *osp << "> ";
-                    logicp->nodep()->dumpTree(*osp);
-                } else {
-                    // Show nodes with hierarchical costs
-                    V3InstrCount::count(logicp->nodep(), false, osp);
-                }
-            }
-        }
-    }
+    static void dumpCpFilePrefixed(const V3Graph* graphp, const string& nameComment);
 
 private:
     VL_DEBUG_FUNC;  // Declare debug()
@@ -490,11 +403,20 @@ public:
     }
 };
 
-class SiblingMC;
-class MTaskEdge;
+struct MergeCandidateKey {
+    // Note: Structure layout chosen to minimize padding in PairingHeap<*>::Node
+    uint64_t m_id;  // Unique ID part of edge score
+    uint32_t m_score;  // Score part of ID
+    bool operator<(const MergeCandidateKey& other) const {
+        // First by Score then by ID, but notice that we want minimums using a max-heap, so reverse
+        return m_score > other.m_score || (m_score == other.m_score && m_id > other.m_id);
+    }
+};
 
-// Information associated with scoreboarding an MTask
-class MergeCandidate VL_NOT_FINAL {
+using MergeCandidateScoreboard = V3Scoreboard<MergeCandidate, MergeCandidateKey>;
+
+// Information associated with scoreboarding a merge candidate
+class MergeCandidate VL_NOT_FINAL : public MergeCandidateScoreboard::Node {
 private:
     // Only the known subclasses can create or delete one of these
     friend class SiblingMC;
@@ -507,18 +429,17 @@ private:
     // using another bit of the id to denote the actual subtype.
 
     // By using the bottom bits for flags, we can still use < to compare IDs without masking.
-    uint64_t m_id;  // <63:2> Serial number for ordering, <1> subtype (SiblingMC), <0> removed
-    static constexpr uint64_t REMOVED_MASK = 1ULL << 0;
-    static constexpr uint64_t IS_SIBLING_MASK = 1ULL << 1;
-    static constexpr uint64_t ID_INCREMENT = 1ULL << 2;
+    // <63:1> Serial number for ordering, <0> subtype (SiblingMC)
+    static constexpr uint64_t IS_SIBLING_MASK = 1ULL << 0;
+    static constexpr uint64_t ID_INCREMENT = 1ULL << 1;
 
-    bool isSiblingMC() const { return m_id & IS_SIBLING_MASK; }
+    bool isSiblingMC() const { return m_key.m_id & IS_SIBLING_MASK; }
 
     // CONSTRUCTORS
     explicit MergeCandidate(bool isSiblingMC) {
         static uint64_t serial = 0;
         serial += ID_INCREMENT;  // +ID_INCREMENT so doesn't set the special bottom bits
-        m_id = serial | (isSiblingMC * IS_SIBLING_MASK);
+        m_key.m_id = serial | (isSiblingMC * IS_SIBLING_MASK);
     }
     ~MergeCandidate() = default;
 
@@ -530,35 +451,33 @@ public:
     const MTaskEdge* toMTaskEdge() const;  // Instead of dynamic_cast
     bool mergeWouldCreateCycle() const;  // Instead of virtual method
 
-    bool removedFromSb() const { return (m_id & REMOVED_MASK) != 0; }
-    void removedFromSb(bool /*removed*/) { m_id |= REMOVED_MASK; }
-    void clearRemovedFromSb() { m_id &= ~REMOVED_MASK; }
-    bool operator<(const MergeCandidate& other) const { return m_id < other.m_id; }
+    inline void rescore();
+    uint32_t score() const { return m_key.m_score; }
+
+    static MergeCandidate* heapNodeToElem(MergeCandidateScoreboard::Node* nodep) {
+        return static_cast<MergeCandidate*>(nodep);
+    }
 };
 
-static_assert(sizeof(MergeCandidate) == sizeof(uint64_t), "Should not have a vtable");
+static_assert(sizeof(MergeCandidate) == sizeof(MergeCandidateScoreboard::Node),
+              "Should not have a vtable");
 
 // A pair of associated LogicMTask's that are merge candidates for sibling
 // contraction
 class SiblingMC final : public MergeCandidate {
 private:
-    LogicMTask* m_ap;
-    LogicMTask* m_bp;
+    LogicMTask* const m_ap;
+    LogicMTask* const m_bp;
 
 public:
     // CONSTRUCTORS
     SiblingMC() = delete;
     SiblingMC(LogicMTask* ap, LogicMTask* bp)
-        : MergeCandidate{/* isSiblingMC: */ true} {
-        // Assign 'ap' and 'bp' in a canonical order, so we can more easily
-        // compare pairs of SiblingMCs
-        if (ap->id() > bp->id()) {
-            m_ap = ap;
-            m_bp = bp;
-        } else {
-            m_ap = bp;
-            m_bp = ap;
-        }
+        : MergeCandidate{/* isSiblingMC: */ true}
+        , m_ap{ap}
+        , m_bp{bp} {
+        // operator< and storage management depend on this
+        UASSERT(ap->id() > bp->id(), "Should be ordered");
     }
     ~SiblingMC() = default;
     // METHODS
@@ -580,17 +499,23 @@ static_assert(sizeof(SiblingMC) == sizeof(MergeCandidate) + 2 * sizeof(LogicMTas
 
 // GraphEdge for the MTask graph
 class MTaskEdge final : public V3GraphEdge, public MergeCandidate {
+    friend class LogicMTask;
+    template <GraphWay::en T_Way>
+    friend class PartPropagateCp;
+
+    // MEMBERS
+    // This edge can be in 2 EdgeHeaps, one forward and one reverse. We allocate the heap nodes
+    // directly within the edge as they are always required and this makes association cheap.
+    EdgeHeap::Node m_edgeHeapNode[GraphWay::NUM_WAYS];
+
 public:
     // CONSTRUCTORS
     MTaskEdge(V3Graph* graphp, LogicMTask* fromp, LogicMTask* top, int weight)
         : V3GraphEdge{graphp, fromp, top, weight}
         , MergeCandidate{/* isSiblingMC: */ false} {
-        fromp->addRelative(GraphWay::FORWARD, top);
-        top->addRelative(GraphWay::REVERSE, fromp);
-    }
-    virtual ~MTaskEdge() override {
-        fromMTaskp()->removeRelative(GraphWay::FORWARD, toMTaskp());
-        toMTaskp()->removeRelative(GraphWay::REVERSE, fromMTaskp());
+        fromp->addRelativeMTask(top);
+        fromp->addRelativeEdge<GraphWay::FORWARD>(this);
+        top->addRelativeEdge<GraphWay::REVERSE>(this);
     }
     // METHODS
     LogicMTask* furtherMTaskp(GraphWay way) const {
@@ -601,28 +526,142 @@ public:
     bool mergeWouldCreateCycle() const {
         return LogicMTask::pathExistsFrom(fromMTaskp(), toMTaskp(), this);
     }
-    static MTaskEdge* cast(V3GraphEdge* edgep) {
-        if (!edgep) return nullptr;
-        MTaskEdge* const resultp = dynamic_cast<MTaskEdge*>(edgep);
-        UASSERT(resultp, "Failed to cast in MTaskEdge::cast");
-        return resultp;
-    }
     // Following initial assignment of critical paths, clear this MTaskEdge
     // out of the edge-map for each node and reinsert at a new location
     // with updated critical path.
     void resetCriticalPaths() {
         LogicMTask* const fromp = fromMTaskp();
         LogicMTask* const top = toMTaskp();
-        fromp->removeRelative(GraphWay::FORWARD, top);
-        top->removeRelative(GraphWay::REVERSE, fromp);
-        fromp->addRelative(GraphWay::FORWARD, top);
-        top->addRelative(GraphWay::REVERSE, fromp);
+        fromp->removeRelativeEdge<GraphWay::FORWARD>(this);
+        top->removeRelativeEdge<GraphWay::REVERSE>(this);
+        fromp->addRelativeEdge<GraphWay::FORWARD>(this);
+        top->addRelativeEdge<GraphWay::REVERSE>(this);
+    }
+
+    uint32_t cachedCp(GraphWay way) const { return m_edgeHeapNode[way].key().m_score; }
+
+    // Convert from the address of the m_edgeHeapNode[way] in an MTaskEdge back to the MTaskEdge
+    static const MTaskEdge* toMTaskEdge(GraphWay way, const EdgeHeap::Node* nodep) {
+        const size_t offset = VL_OFFSETOF(MTaskEdge, m_edgeHeapNode[way]);
+        return reinterpret_cast<const MTaskEdge*>(reinterpret_cast<uintptr_t>(nodep) - offset);
     }
 
 private:
     VL_UNCOPYABLE(MTaskEdge);
 };
 
+template <GraphWay::en T_Way>
+void LogicMTask::addRelativeEdge(MTaskEdge* edgep) {
+    constexpr GraphWay way{T_Way};
+    constexpr GraphWay inv = way.invert();
+    // Add to the edge heap
+    LogicMTask* const relativep = edgep->furtherMTaskp(way);
+    // Value is !way cp to this edge
+    const uint32_t cp = relativep->stepCost() + relativep->critPathCost(inv);
+    //
+    m_edgeHeap[way].insert(&edgep->m_edgeHeapNode[way], {relativep->id(), cp});
+}
+
+template <GraphWay::en T_Way>
+void LogicMTask::stealRelativeEdge(MTaskEdge* edgep) {
+    constexpr GraphWay way{T_Way};
+    // Make heap node insertable, ruining the heap it is currently in.
+    edgep->m_edgeHeapNode[way].yank();
+    // Add the edge as new
+    addRelativeEdge<T_Way>(edgep);
+}
+
+template <GraphWay::en T_Way>
+void LogicMTask::removeRelativeEdge(MTaskEdge* edgep) {
+    constexpr GraphWay way{T_Way};
+    // Remove from the edge heap
+    m_edgeHeap[way].remove(&edgep->m_edgeHeapNode[way]);
+}
+
+void LogicMTask::checkRelativesCp(GraphWay way) const {
+    for (V3GraphEdge* edgep = beginp(way); edgep; edgep = edgep->nextp(way)) {
+        const LogicMTask* const relativep = static_cast<const LogicMTask*>(edgep->furtherp(way));
+        const uint32_t cachedCp = static_cast<MTaskEdge*>(edgep)->cachedCp(way);
+        const uint32_t cp = relativep->critPathCost(way.invert()) + relativep->stepCost();
+        partCheckCachedScoreVsActual(cachedCp, cp);
+    }
+}
+
+uint32_t LogicMTask::critPathCostWithout(GraphWay way, const V3GraphEdge* withoutp) const {
+    // Compute the critical path cost wayward to this node, without considering edge 'withoutp'.
+    // We need to look at two edges at most, the critical path if that is not via 'withoutp',
+    // or the second-worst path, if the critical path is via 'withoutp'.
+#if VL_DEBUG
+    UASSERT(withoutp->furtherp(way) == this,
+            "In critPathCostWithout(), edge 'withoutp' must further to 'this'");
+#endif
+    const GraphWay inv = way.invert();
+    const EdgeHeap& edgeHeap = m_edgeHeap[inv];
+    const EdgeHeap::Node* const maxp = edgeHeap.max();
+    if (!maxp) return 0;
+    if (MTaskEdge::toMTaskEdge(inv, maxp) != withoutp) return maxp->key().m_score;
+    const EdgeHeap::Node* const secp = edgeHeap.secondMax();
+    if (!secp) return 0;
+    return secp->key().m_score;
+}
+
+void LogicMTask::dumpCpFilePrefixed(const V3Graph* graphp, const string& nameComment) {
+    const string filename = v3Global.debugFilename(nameComment) + ".txt";
+    UINFO(1, "Writing " << filename << endl);
+    const std::unique_ptr<std::ofstream> ofp{V3File::new_ofstream(filename)};
+    std::ostream* const osp = &(*ofp);  // &* needed to deref unique_ptr
+    if (osp->fail()) v3fatalStatic("Can't write " << filename);
+
+    // Find start vertex with longest CP
+    LogicMTask* startp = nullptr;
+    for (V3GraphVertex* vxp = graphp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
+        LogicMTask* const mtaskp = static_cast<LogicMTask*>(vxp);
+        if (!startp) {
+            startp = mtaskp;
+            continue;
+        }
+        if (mtaskp->cost() + mtaskp->critPathCost(GraphWay::REVERSE)
+            > startp->cost() + startp->critPathCost(GraphWay::REVERSE)) {
+            startp = mtaskp;
+        }
+    }
+
+    // Follow the entire critical path
+    std::vector<const LogicMTask*> path;
+    uint32_t totalCost = 0;
+    for (LogicMTask* nextp = startp; nextp;) {
+        path.push_back(nextp);
+        totalCost += nextp->cost();
+
+        if (EdgeHeap::Node* const maxp = nextp->m_edgeHeap[GraphWay::FORWARD].max()) {
+            nextp = MTaskEdge::toMTaskEdge(GraphWay::FORWARD, maxp)->toMTaskp();
+        } else {
+            nextp = nullptr;
+        }
+    }
+
+    *osp << "totalCost = " << totalCost
+         << " (should match the computed critical path cost (CP) for the graph)\n";
+
+    // Dump
+    for (const LogicMTask* mtaskp : path) {
+        *osp << "begin mtask with cost " << mtaskp->cost() << '\n';
+        for (VxList::const_iterator lit = mtaskp->vertexListp()->begin();
+             lit != mtaskp->vertexListp()->end(); ++lit) {
+            const OrderLogicVertex* const logicp = (*lit)->logicp();
+            if (!logicp) continue;
+            if (false) {
+                // Show nodes only
+                *osp << "> ";
+                logicp->nodep()->dumpTree(*osp);
+            } else {
+                // Show nodes with hierarchical costs
+                V3InstrCount::count(logicp->nodep(), false, osp);
+            }
+        }
+    }
+}
+
 // Instead of dynamic cast
 SiblingMC* MergeCandidate::toSiblingMC() {
     return isSiblingMC() ? static_cast<SiblingMC*>(this) : nullptr;
@@ -647,6 +686,40 @@ bool MergeCandidate::mergeWouldCreateCycle() const {
                          : static_cast<const MTaskEdge*>(this)->mergeWouldCreateCycle();
 }
 
+static uint32_t siblingScore(const SiblingMC* sibsp) {
+    const LogicMTask* const ap = sibsp->ap();
+    const LogicMTask* const bp = sibsp->bp();
+    const uint32_t mergedCpCostFwd
+        = std::max(ap->critPathCost(GraphWay::FORWARD), bp->critPathCost(GraphWay::FORWARD));
+    const uint32_t mergedCpCostRev
+        = std::max(ap->critPathCost(GraphWay::REVERSE), bp->critPathCost(GraphWay::REVERSE));
+    return mergedCpCostRev + mergedCpCostFwd + LogicMTask::stepCost(ap->cost() + bp->cost());
+}
+
+static uint32_t edgeScore(const MTaskEdge* edgep) {
+    // Score this edge. Lower is better. The score is the new local CP
+    // length if we merge these mtasks.  ("Local" means the longest
+    // critical path running through the merged node.)
+    const LogicMTask* const top = static_cast<LogicMTask*>(edgep->top());
+    const LogicMTask* const fromp = static_cast<LogicMTask*>(edgep->fromp());
+    const uint32_t mergedCpCostFwd = std::max(fromp->critPathCost(GraphWay::FORWARD),
+                                              top->critPathCostWithout(GraphWay::FORWARD, edgep));
+    const uint32_t mergedCpCostRev = std::max(fromp->critPathCostWithout(GraphWay::REVERSE, edgep),
+                                              top->critPathCost(GraphWay::REVERSE));
+    return mergedCpCostRev + mergedCpCostFwd + LogicMTask::stepCost(fromp->cost() + top->cost());
+}
+
+void MergeCandidate::rescore() {
+    if (const SiblingMC* const sibp = toSiblingMC()) {
+        m_key.m_score = siblingScore(sibp);
+    } else {
+        // The '1 +' favors merging a SiblingMC over an otherwise-
+        // equal-scoring MTaskEdge. The comment on selfTest() talks
+        // about why.
+        m_key.m_score = 1 + edgeScore(static_cast<const MTaskEdge*>(this));
+    }
+}
+
 // ######################################################################
 //  Vertex utility classes
 
@@ -813,7 +886,6 @@ static void partCheckCriticalPaths(V3Graph* mtasksp) {
 // Usage:
 //  * Client increases the cost and/or CP at a node or small set of nodes
 //    (often a pair in practice, eg. edge contraction.)
-//  * Client instances a PartPropagateCp object
 //  * Client calls PartPropagateCp::cpHasIncreased() one or more times.
 //    Each call indicates that the inclusive CP of some "seed" vertex
 //    has increased to a given value.
@@ -823,53 +895,120 @@ static void partCheckCriticalPaths(V3Graph* mtasksp) {
 //  * Client calls PartPropagateCp::go(). Internally, this iteratively
 //    propagates the new CPs wayward through the graph.
 //
+template <GraphWay::en T_Way>
+class PartPropagateCp final {
+    // TYPES
+
+    // We keep pending vertices in a heap during critical path propagation
+    struct PendingKey {
+        LogicMTask* m_mtaskp;  // The vertex in the heap
+        uint32_t m_score;  // The score of this entry
+        void increase(uint32_t score) {
+#if VL_DEBUG
+            UASSERT(score >= m_score, "Must increase");
+#endif
+            m_score = score;
+        }
+        bool operator<(const PendingKey& other) const {
+            if (m_score != other.m_score) return m_score < other.m_score;
+            return LogicMTask::CmpLogicMTask{}(m_mtaskp, other.m_mtaskp);
+        }
+    };
+
+    using PendingHeap = PairingHeap<PendingKey>;
+    using PendingHeapNode = typename PendingHeap::Node;
 
-class PartPropagateCp final : GraphAlg<> {
-private:
     // MEMBERS
-    const GraphWay m_way;  // CPs oriented in this direction: either FORWARD
-    //               // from graph-start to current node, or REVERSE
-    //               // from graph-end to current node.
-    LogicMTask::CpCostAccessor m_access;  // Access cost and CPs on V3GraphVertex's.
-    //                        // confirm we only process each vertex once.
-    const bool m_slowAsserts;  // Enable nontrivial asserts
-    // Pending rescores
-    SortByValueMap<LogicMTask*, uint32_t, LogicMTask::CmpLogicMTask> m_pending;
+    PendingHeap m_pendingHeap;  // Heap of pending rescores
 
+    // We allocate this many heap nodes at once
+    static constexpr size_t ALLOC_CHUNK_SIZE = 128;
+    PendingHeapNode* m_freep = nullptr;  // List of free heap nodes
+    std::vector<std::unique_ptr<PendingHeapNode[]>> m_allocated;  // Allocated heap nodes
+
+    const bool m_slowAsserts;  // Enable nontrivial asserts
     std::set<LogicMTask*> m_seen;  // Used only with slow asserts to check mtasks visited only once
 
 public:
     // CONSTRUCTORS
-    PartPropagateCp(V3Graph* graphp, GraphWay way, bool slowAsserts,
-                    V3EdgeFuncP edgeFuncp = &V3GraphEdge::followAlwaysTrue)
-        : GraphAlg<>{graphp, edgeFuncp}
-        , m_way{way}
-        , m_slowAsserts{slowAsserts} {}
+    PartPropagateCp(bool slowAsserts)
+        : m_slowAsserts{slowAsserts} {}
 
     // METHODS
+private:
+    // Allocate a heap node from the free list, refilling the list in chunks when it is empty
+    PendingHeapNode* allocNode() {
+        // If no free nodes available, then make some
+        if (!m_freep) {
+            // Allocate in chunks for efficiency
+            m_allocated.emplace_back(new PendingHeapNode[ALLOC_CHUNK_SIZE]);
+            // Set up free list pointer
+            m_freep = m_allocated.back().get();
+            // Set up free list chain
+            for (size_t i = 1; i < ALLOC_CHUNK_SIZE; ++i) {
+                m_freep[i - 1].m_next.m_ptr = &m_freep[i];
+            }
+            // Clear the next pointer of the last entry
+            m_freep[ALLOC_CHUNK_SIZE - 1].m_next.m_ptr = nullptr;
+        }
+        // Free nodes are available, pick up the first one
+        PendingHeapNode* const resultp = m_freep;
+        m_freep = resultp->m_next.m_ptr;
+        resultp->m_next.m_ptr = nullptr;
+        return resultp;
+    }
+
+    // Release a heap node (make it available for future allocation)
+    void freeNode(PendingHeapNode* nodep) {
+        // Re-use the existing link pointers and simply prepend it to the free list
+        nodep->m_next.m_ptr = m_freep;
+        m_freep = nodep;
+    }
+
+public:
     void cpHasIncreased(V3GraphVertex* vxp, uint32_t newInclusiveCp) {
+        constexpr GraphWay way{T_Way};
+        constexpr GraphWay inv{way.invert()};
+
         // For *vxp, whose CP-inclusive has just increased to
         // newInclusiveCp, iterate to all wayward nodes, update the edges
         // of each, and add each to m_pending if its overall CP has grown.
-        for (V3GraphEdge* edgep = vxp->beginp(m_way); edgep; edgep = edgep->nextp(m_way)) {
-            if (!m_edgeFuncp(edgep)) continue;
-            LogicMTask* const relativep = static_cast<LogicMTask*>(edgep->furtherp(m_way));
-            m_access.notifyEdgeCp(relativep, m_way, vxp, newInclusiveCp);
+        for (MTaskEdge *edgep = static_cast<MTaskEdge*>(vxp->beginp(way)), *nextp; edgep;
+             edgep = nextp) {
+            // Fetch early as likely cache miss
+            nextp = static_cast<MTaskEdge*>(edgep->nextp(way));
 
-            if (m_access.critPathCost(relativep, m_way) < newInclusiveCp) {
-                // relativep's critPathCost() is out of step with its
-                // longest !wayward edge. Schedule that to be resolved.
-                const uint32_t newPendingVal
-                    = newInclusiveCp - m_access.critPathCost(relativep, m_way);
-                const auto pair = m_pending.emplace(relativep, newPendingVal);
-                if (!pair.second && (newPendingVal > pair.first->second)) {
-                    m_pending.update(pair.first, newPendingVal);
-                }
+            LogicMTask* const relativep = edgep->furtherMTaskp(way);
+            EdgeHeap::Node& edgeHeapNode = edgep->m_edgeHeapNode[inv];
+            if (newInclusiveCp > edgeHeapNode.key().m_score) {
+                relativep->m_edgeHeap[inv].increaseKey(&edgeHeapNode, newInclusiveCp);
             }
+
+            const uint32_t critPathCost = relativep->critPathCost(way);
+
+            if (critPathCost >= newInclusiveCp) continue;
+
+            // relativep's critPathCost() is out of step with its longest !wayward edge.
+            // Schedule that to be resolved.
+            const uint32_t newVal = newInclusiveCp - critPathCost;
+
+            if (PendingHeapNode* const nodep = static_cast<PendingHeapNode*>(relativep->userp())) {
+                // Already in heap. Increase score if needed.
+                if (newVal > nodep->key().m_score) m_pendingHeap.increaseKey(nodep, newVal);
+                continue;
+            }
+
+            // Add to heap
+            PendingHeapNode* const nodep = allocNode();
+            relativep->userp(nodep);
+            m_pendingHeap.insert(nodep, {relativep, newVal});
         }
     }
 
     void go() {
+        constexpr GraphWay way{T_Way};
+        constexpr GraphWay inv{way.invert()};
+
         // m_pending maps each pending vertex to the amount that it wayward
         // CP will grow.
         //
@@ -886,27 +1025,34 @@ public:
         // once.  And so on.
         //
         // This generalizes to multiple seed nodes also.
-        while (!m_pending.empty()) {
-            const auto it = m_pending.rbegin();
-            LogicMTask* const updateMep = it->first;
-            const uint32_t cpGrowBy = it->second;
-            m_pending.erase(it);
-
-            // For *updateMep, whose critPathCost was out-of-date with respect
-            // to its edges, update the critPathCost.
-            const uint32_t startCp = m_access.critPathCost(updateMep, m_way);
+        while (!m_pendingHeap.empty()) {
+            // Pop max element from heap
+            PendingHeapNode* const maxp = m_pendingHeap.max();
+            m_pendingHeap.remove(maxp);
+            // Pick up values
+            LogicMTask* const mtaskp = maxp->key().m_mtaskp;
+            const uint32_t cpGrowBy = maxp->key().m_score;
+            // Free the heap node, we are done with it
+            freeNode(maxp);
+            mtaskp->userp(nullptr);
+            // Update the critPathCost of mtaskp, that was out-of-date with respect to its edges
+            const uint32_t startCp = mtaskp->critPathCost(way);
             const uint32_t newCp = startCp + cpGrowBy;
             if (VL_UNLIKELY(m_slowAsserts)) {
-                m_access.checkNewCpVersusEdges(updateMep, m_way, newCp);
+                // Check that CP matches that of the longest edge wayward of mtaskp.
+                const uint32_t edgeCp = mtaskp->m_edgeHeap[inv].max()->key().m_score;
+                UASSERT_OBJ(edgeCp == newCp, mtaskp, "CP doesn't match longest wayward edge");
                 // Confirm that we only set each node's CP once.  That's an
                 // important property of PartPropagateCp which allows it to be far
                 // faster than a recursive algorithm on some graphs.
-                const bool first = m_seen.insert(updateMep).second;
-                UASSERT_OBJ(first, updateMep, "Set CP on node twice");
+                const bool first = m_seen.insert(mtaskp).second;
+                UASSERT_OBJ(first, mtaskp, "Set CP on node twice");
             }
-            m_access.setCritPathCost(updateMep, m_way, newCp);
-            cpHasIncreased(updateMep, newCp + m_access.cost(updateMep));
+            mtaskp->setCritPathCost(way, newCp);
+            cpHasIncreased(mtaskp, newCp + mtaskp->stepCost());
         }
+
+        if (VL_UNLIKELY(m_slowAsserts)) m_seen.clear();
     }
 
 private:
@@ -939,11 +1085,11 @@ private:
             const unsigned idx1 = V3Os::rand64(rngState) % 50;
             const unsigned idx2 = V3Os::rand64(rngState) % 50;
             if (idx1 > idx2) {
-                if (!m_vx[idx2]->hasRelative(GraphWay::FORWARD, m_vx[idx1])) {
+                if (!m_vx[idx2]->hasRelativeMTask(m_vx[idx1])) {
                     new MTaskEdge{&m_graph, m_vx[idx2], m_vx[idx1], 1};
                 }
             } else if (idx2 > idx1) {
-                if (!m_vx[idx1]->hasRelative(GraphWay::FORWARD, m_vx[idx2])) {
+                if (!m_vx[idx1]->hasRelativeMTask(m_vx[idx2])) {
                     new MTaskEdge{&m_graph, m_vx[idx1], m_vx[idx2], 1};
                 }
             }
@@ -952,7 +1098,7 @@ private:
         partInitCriticalPaths(&m_graph);
 
         // This SelfTest class is also the T_CostAccessor
-        PartPropagateCp prop(&m_graph, GraphWay::FORWARD, true);
+        PartPropagateCp<GraphWay::FORWARD> prop(true);
 
         // Seed the propagator with every input node;
         // This should result in the complete graph getting all CP's assigned.
@@ -961,9 +1107,6 @@ private:
         }
 
         // Run the propagator.
-        //  * The setCritPathCost() routine checks that each node's CP changes
-        //    at most once.
-        //  * The notifyEdgeCp routine is also self checking.
         prop.go();
 
         // Finally, confirm that the entire graph appears to have correct CPs.
@@ -976,7 +1119,7 @@ public:
 
 // Merge edges from a LogicMtask.
 //
-// This code removes 'hasRelative' edges. When this occurs, mark it in need
+// This code removes adjacent edges. When this occurs, mark it in need
 // of a rescore, in case its score has fallen and we need to move it up
 // toward the front of the scoreboard.
 //
@@ -1007,51 +1150,90 @@ public:
 //
 // Another way of stating this: this code ensures that scores of
 // non-transitive edges only ever increase.
-static void partRedirectEdgesFrom(LogicMTask* recipientp, LogicMTask* donorp,
-                                  V3Scoreboard<MergeCandidate, uint32_t>* sbp) {
-    for (const auto& way : {GraphWay::FORWARD, GraphWay::REVERSE}) {
-        for (V3GraphEdge *edgep = donorp->beginp(way), *nextp; edgep; edgep = nextp) {
-            nextp = edgep->nextp(way);
-            MTaskEdge* const tedgep = MTaskEdge::cast(edgep);
-            LogicMTask* const relativep = tedgep->furtherMTaskp(way);
-            if (recipientp->hasRelative(way, relativep)) {
-                // An edge already exists between recipient and relative of donor.
-                // Mark it in need of a rescore
-                if (sbp) {
-                    if (!tedgep->removedFromSb()) sbp->removeElem(tedgep);
-                    const MTaskEdge* const existMTaskEdgep
-                        = MTaskEdge::cast(recipientp->findConnectingEdgep(way, relativep));
-                    UASSERT(existMTaskEdgep, "findConnectingEdge didn't find edge");
-                    if (!existMTaskEdgep->removedFromSb()) {
-                        sbp->hintScoreChanged(existMTaskEdgep);
-                    }
-                }
-                VL_DO_DANGLING(edgep->unlinkDelete(), edgep);
-            } else {
-                // No existing edge between recipient and relative of donor.
-                // Redirect the edge from donor<->relative to recipient<->relative.
-                if (way == GraphWay::REVERSE) {
-                    tedgep->relinkTop(recipientp);
-                    relativep->removeRelative(GraphWay::FORWARD, donorp);
-                    relativep->addRelative(GraphWay::FORWARD, recipientp);
-                    recipientp->addRelative(GraphWay::REVERSE, relativep);
+static void partRedirectEdgesFrom(V3Graph* graphp, LogicMTask* recipientp, LogicMTask* donorp,
+                                  MergeCandidateScoreboard* sbp) {
+
+    // Process outgoing edges
+    MTaskEdge* outNextp = static_cast<MTaskEdge*>(donorp->outBeginp());
+    while (outNextp) {
+        MTaskEdge* const edgep = outNextp;
+        LogicMTask* const relativep = outNextp->toMTaskp();
+        outNextp = static_cast<MTaskEdge*>(outNextp->outNextp());
+
+        relativep->removeRelativeEdge<GraphWay::REVERSE>(edgep);
+
+        if (recipientp->hasRelativeMTask(relativep)) {
+            // An edge already exists between recipient and relative of donor.
+            // Mark it in need of a rescore
+            if (sbp) {
+                if (sbp->contains(edgep)) sbp->remove(edgep);
+                MTaskEdge* const existMTaskEdgep = static_cast<MTaskEdge*>(
+                    recipientp->findConnectingEdgep(GraphWay::FORWARD, relativep));
+#if VL_DEBUG
+                UASSERT(existMTaskEdgep, "findConnectingEdge didn't find edge");
+#endif
+                if (sbp->contains(existMTaskEdgep)) sbp->hintScoreChanged(existMTaskEdgep);
+            }
+            VL_DO_DANGLING(edgep->unlinkDelete(), edgep);
+        } else {
+            // No existing edge between recipient and relative of donor.
+            // Redirect the edge from donor<->relative to recipient<->relative.
+            edgep->relinkFromp(recipientp);
+            recipientp->addRelativeMTask(relativep);
+            recipientp->stealRelativeEdge<GraphWay::FORWARD>(edgep);
+            relativep->addRelativeEdge<GraphWay::REVERSE>(edgep);
+            if (sbp) {
+                if (!sbp->contains(edgep)) {
+                    sbp->add(edgep);
                 } else {
-                    tedgep->relinkFromp(recipientp);
-                    relativep->removeRelative(GraphWay::REVERSE, donorp);
-                    relativep->addRelative(GraphWay::REVERSE, recipientp);
-                    recipientp->addRelative(GraphWay::FORWARD, relativep);
-                }
-                if (sbp) {
-                    if (tedgep->removedFromSb()) {
-                        tedgep->clearRemovedFromSb();
-                        sbp->addElem(tedgep);
-                    } else {
-                        sbp->hintScoreChanged(tedgep);
-                    }
+                    sbp->hintScoreChanged(edgep);
                 }
             }
         }
     }
+
+    // Process incoming edges
+    MTaskEdge* inNextp = static_cast<MTaskEdge*>(donorp->inBeginp());
+    while (inNextp) {
+        MTaskEdge* const edgep = inNextp;
+        LogicMTask* const relativep = inNextp->fromMTaskp();
+        inNextp = static_cast<MTaskEdge*>(inNextp->inNextp());
+
+        relativep->removeRelativeMTask(donorp);
+        relativep->removeRelativeEdge<GraphWay::FORWARD>(edgep);
+
+        if (relativep->hasRelativeMTask(recipientp)) {
+            // An edge already exists between recipient and relative of donor.
+            // Mark it in need of a rescore
+            if (sbp) {
+                if (sbp->contains(edgep)) sbp->remove(edgep);
+                MTaskEdge* const existMTaskEdgep = static_cast<MTaskEdge*>(
+                    recipientp->findConnectingEdgep(GraphWay::REVERSE, relativep));
+#if VL_DEBUG
+                UASSERT(existMTaskEdgep, "findConnectingEdge didn't find edge");
+#endif
+                if (sbp->contains(existMTaskEdgep)) sbp->hintScoreChanged(existMTaskEdgep);
+            }
+            VL_DO_DANGLING(edgep->unlinkDelete(), edgep);
+        } else {
+            // No existing edge between recipient and relative of donor.
+            // Redirect the edge from donor<->relative to recipient<->relative.
+            edgep->relinkTop(recipientp);
+            relativep->addRelativeMTask(recipientp);
+            relativep->addRelativeEdge<GraphWay::FORWARD>(edgep);
+            recipientp->stealRelativeEdge<GraphWay::REVERSE>(edgep);
+            if (sbp) {
+                if (!sbp->contains(edgep)) {
+                    sbp->add(edgep);
+                } else {
+                    sbp->hintScoreChanged(edgep);
+                }
+            }
+        }
+    }
+
+    // Remove donorp from the graph
+    VL_DO_DANGLING(donorp->unlinkDelete(graphp), donorp);
 }
 
 //######################################################################
@@ -1061,14 +1243,6 @@ static void partRedirectEdgesFrom(LogicMTask* recipientp, LogicMTask* donorp,
 class PartContraction final {
 private:
     // TYPES
-
-    // TODO: might get a little more speed by making this a
-    // std::unordered_set and defining hash and equal_to functors for the
-    // SiblingMC:
-    using SibSet = std::set<SiblingMC>;
-    using SibpSet = std::unordered_set<const SiblingMC*>;
-    using MTask2Sibs = std::unordered_map<const LogicMTask*, SibpSet>;
-
     // New CP information for mtaskp reflecting an upcoming merge
     struct NewCp {
         uint32_t cp;
@@ -1082,17 +1256,17 @@ private:
     uint32_t m_scoreLimitBeforeRescore = 0xffffffff;  // Next score rescore at
     unsigned m_mergesSinceRescore = 0;  // Merges since last rescore
     const bool m_slowAsserts;  // Take extra time to validate algorithm
-    V3Scoreboard<MergeCandidate, uint32_t> m_sb;  // Scoreboard
-    SibSet m_pairs;  // Storage for each SiblingMC
-    MTask2Sibs m_mtask2sibs;  // SiblingMC set for each mtask
+    MergeCandidateScoreboard m_sb;  // Scoreboard
+
+    PartPropagateCp<GraphWay::FORWARD> m_forwardPropagator{m_slowAsserts};  // Forward propagator
+    PartPropagateCp<GraphWay::REVERSE> m_reversePropagator{m_slowAsserts};  // Reverse propagator
 
 public:
     // CONSTRUCTORS
     PartContraction(V3Graph* mtasksp, uint32_t scoreLimit, bool slowAsserts)
         : m_mtasksp{mtasksp}
         , m_scoreLimit{scoreLimit}
-        , m_slowAsserts{slowAsserts}
-        , m_sb{&mergeCandidateScore, slowAsserts} {}
+        , m_slowAsserts{slowAsserts} {}
 
     // METHODS
     void go() {
@@ -1116,17 +1290,18 @@ public:
         //  - Incrementally recompute critical paths near the merged mtask.
 
         for (V3GraphVertex* itp = m_mtasksp->verticesBeginp(); itp; itp = itp->verticesNextp()) {
+            itp->userp(nullptr);  // Reset user value. Used by PartPropagateCp.
             std::unordered_set<const V3GraphVertex*> neighbors;
             for (V3GraphEdge* edgep = itp->outBeginp(); edgep; edgep = edgep->outNextp()) {
-                m_sb.addElem(MTaskEdge::cast(edgep));
+                m_sb.add(static_cast<MTaskEdge*>(edgep));
                 if (m_slowAsserts) {
                     UASSERT_OBJ(neighbors.find(edgep->top()) == neighbors.end(), itp,
                                 "Redundant edge found in input to PartContraction()");
                 }
                 neighbors.insert(edgep->top());
             }
-            siblingPairFromRelatives(GraphWay::REVERSE, itp, true);
-            siblingPairFromRelatives(GraphWay::FORWARD, itp, true);
+            siblingPairFromRelatives<GraphWay::REVERSE, true>(itp);
+            siblingPairFromRelatives<GraphWay::FORWARD, true>(itp);
         }
 
         doRescore();  // Set initial scores in scoreboard
@@ -1134,7 +1309,7 @@ public:
         while (true) {
             // This is the best edge to merge, with the lowest
             // score (shortest local critical path)
-            MergeCandidate* const mergeCanp = const_cast<MergeCandidate*>(m_sb.bestp());
+            MergeCandidate* const mergeCanp = m_sb.best();
             if (!mergeCanp) {
                 // Scoreboard found no eligible merges. Maybe a rescore
                 // will produce some merge-able pairs?
@@ -1149,8 +1324,9 @@ public:
                 UASSERT(!m_sb.needsRescore(mergeCanp),
                         "Need-rescore items should not be returned by bestp");
             }
-            const uint32_t cachedScore = m_sb.cachedScore(mergeCanp);
-            const uint32_t actualScore = mergeCandidateScore(mergeCanp);
+            const uint32_t cachedScore = mergeCanp->score();
+            mergeCanp->rescore();
+            const uint32_t actualScore = mergeCanp->score();
 
             if (actualScore > cachedScore) {
                 // Cached score is out-of-date.
@@ -1211,8 +1387,11 @@ public:
             if (mergeCanp->mergeWouldCreateCycle()) {
                 // Remove this edge from scoreboard so we don't keep
                 // reconsidering it on every loop.
-                m_sb.removeElem(mergeCanp);
-                mergeCanp->removedFromSb(true);
+                m_sb.remove(mergeCanp);
+                if (SiblingMC* const smcp = mergeCanp->toSiblingMC()) {
+                    smcp->bp()->farSibs().erase(smcp);
+                    smcp->ap()->ownSibs().erase(*smcp);  // Kills *smcp, so do last
+                }
                 continue;
             }
 
@@ -1245,7 +1424,9 @@ public:
     }
 
 private:
-    NewCp newCp(GraphWay way, LogicMTask* mtaskp, LogicMTask* otherp, MTaskEdge* mergeEdgep) {
+    template <GraphWay::en T_Way>
+    NewCp newCp(LogicMTask* mtaskp, LogicMTask* otherp, MTaskEdge* mergeEdgep) {
+        constexpr GraphWay way{T_Way};
         // Return new wayward-CP for mtaskp reflecting its upcoming merge
         // with otherp. Set 'result.propagate' if mtaskp's wayward
         // relatives will see a new wayward CP from this merge.
@@ -1274,31 +1455,29 @@ private:
     }
 
     void removeSiblingMCsWith(LogicMTask* mtaskp) {
-        for (SibpSet::iterator it = m_mtask2sibs[mtaskp].begin(); it != m_mtask2sibs[mtaskp].end();
-             ++it) {
-            const SiblingMC* const pairp = *it;
-            if (!pairp->removedFromSb()) m_sb.removeElem(pairp);
-            const LogicMTask* const otherp = (pairp->bp() == mtaskp) ? pairp->ap() : pairp->bp();
-            size_t erased = m_mtask2sibs[otherp].erase(pairp);
-            UASSERT_OBJ(erased > 0, otherp, "Expected existing mtask");
-            erased = m_pairs.erase(*pairp);
-            UASSERT_OBJ(erased > 0, mtaskp, "Expected existing mtask");
+        for (const SiblingMC& pair : mtaskp->ownSibs()) {
+            m_sb.remove(const_cast<SiblingMC*>(&pair));
+            // mtaskp owns these pairs (it is their ap()), so drop the pointer kept by bp()
+            pair.bp()->farSibs().erase(&pair);
         }
-        const size_t erased = m_mtask2sibs.erase(mtaskp);
-        UASSERT_OBJ(erased > 0, mtaskp, "Expected existing mtask");
+        for (const SiblingMC* const pairp : mtaskp->farSibs()) {
+            m_sb.remove(const_cast<SiblingMC*>(pairp));
+            // The owner is ap(); erasing there destroys *pairp, so this must come last
+            pairp->ap()->ownSibs().erase(*pairp);
+        }
+        mtaskp->ownSibs().clear();  // Destroys the pairs owned by mtaskp
+        mtaskp->farSibs().clear();
     }
 
     void contract(MergeCandidate* mergeCanp) {
         LogicMTask* top = nullptr;
         LogicMTask* fromp = nullptr;
         MTaskEdge* mergeEdgep = mergeCanp->toMTaskEdge();
-        const SiblingMC* mergeSibsp = nullptr;
         if (mergeEdgep) {
             top = static_cast<LogicMTask*>(mergeEdgep->top());
             fromp = static_cast<LogicMTask*>(mergeEdgep->fromp());
         } else {
-            mergeSibsp = mergeCanp->toSiblingMC();
-            UASSERT(mergeSibsp, "Failed to cast mergeCanp to either MTaskEdge or SiblingMC");
+            const SiblingMC* mergeSibsp = static_cast<SiblingMC*>(mergeCanp);
             top = mergeSibsp->ap();
             fromp = mergeSibsp->bp();
         }
@@ -1329,15 +1508,18 @@ private:
         //
         // These 'NewCp' objects carry a bit indicating whether we must
         // propagate CP for each of the four cases:
-        const NewCp recipientNewCpFwd = newCp(GraphWay::FORWARD, recipientp, donorp, mergeEdgep);
-        const NewCp donorNewCpFwd = newCp(GraphWay::FORWARD, donorp, recipientp, mergeEdgep);
-        const NewCp recipientNewCpRev = newCp(GraphWay::REVERSE, recipientp, donorp, mergeEdgep);
-        const NewCp donorNewCpRev = newCp(GraphWay::REVERSE, donorp, recipientp, mergeEdgep);
+        const NewCp recipientNewCpFwd = newCp<GraphWay::FORWARD>(recipientp, donorp, mergeEdgep);
+        const NewCp donorNewCpFwd = newCp<GraphWay::FORWARD>(donorp, recipientp, mergeEdgep);
+        const NewCp recipientNewCpRev = newCp<GraphWay::REVERSE>(recipientp, donorp, mergeEdgep);
+        const NewCp donorNewCpRev = newCp<GraphWay::REVERSE>(donorp, recipientp, mergeEdgep);
 
         if (mergeEdgep) {
             // Remove and free the connecting edge. Must do this before
             // propagating CP's below.
-            m_sb.removeElem(mergeCanp);
+            m_sb.remove(mergeCanp);
+            mergeEdgep->fromMTaskp()->removeRelativeMTask(mergeEdgep->toMTaskp());
+            mergeEdgep->fromMTaskp()->removeRelativeEdge<GraphWay::FORWARD>(mergeEdgep);
+            mergeEdgep->toMTaskp()->removeRelativeEdge<GraphWay::REVERSE>(mergeEdgep);
             VL_DO_CLEAR(mergeEdgep->unlinkDelete(), mergeEdgep = nullptr);
         }
 
@@ -1353,25 +1535,22 @@ private:
                                 << (donorNewCpFwd.propagate ? " true " : " false ")
                                 << donorNewCpFwd.propagateCp << endl);
 
-        PartPropagateCp forwardPropagator(m_mtasksp, GraphWay::FORWARD, m_slowAsserts);
-        PartPropagateCp reversePropagator(m_mtasksp, GraphWay::REVERSE, m_slowAsserts);
-
         recipientp->setCritPathCost(GraphWay::FORWARD, recipientNewCpFwd.cp);
         if (recipientNewCpFwd.propagate) {
-            forwardPropagator.cpHasIncreased(recipientp, recipientNewCpFwd.propagateCp);
+            m_forwardPropagator.cpHasIncreased(recipientp, recipientNewCpFwd.propagateCp);
         }
         recipientp->setCritPathCost(GraphWay::REVERSE, recipientNewCpRev.cp);
         if (recipientNewCpRev.propagate) {
-            reversePropagator.cpHasIncreased(recipientp, recipientNewCpRev.propagateCp);
+            m_reversePropagator.cpHasIncreased(recipientp, recipientNewCpRev.propagateCp);
         }
         if (donorNewCpFwd.propagate) {
-            forwardPropagator.cpHasIncreased(donorp, donorNewCpFwd.propagateCp);
+            m_forwardPropagator.cpHasIncreased(donorp, donorNewCpFwd.propagateCp);
         }
         if (donorNewCpRev.propagate) {
-            reversePropagator.cpHasIncreased(donorp, donorNewCpRev.propagateCp);
+            m_reversePropagator.cpHasIncreased(donorp, donorNewCpRev.propagateCp);
         }
-        forwardPropagator.go();
-        reversePropagator.go();
+        m_forwardPropagator.go();
+        m_reversePropagator.go();
 
         // Remove all SiblingMCs that include donorp. This Includes the one
         // we're merging, if we're merging a SiblingMC.
@@ -1381,11 +1560,8 @@ private:
         // to a bounded number.
         removeSiblingMCsWith(recipientp);
 
-        // Redirect all edges
-        partRedirectEdgesFrom(recipientp, donorp, &m_sb);
-
-        // Delete the donorp mtask from the graph
-        VL_DO_CLEAR(donorp->unlinkDelete(m_mtasksp), donorp = nullptr);
+        // Redirect all edges, delete donorp
+        partRedirectEdgesFrom(m_mtasksp, recipientp, donorp, &m_sb);
 
         ++m_mergesSinceRescore;
 
@@ -1398,21 +1574,21 @@ private:
         //  - prereqs of recipientp's postreqs
         //  - postreqs of recipientp's prereqs
         // Note that this depends on the updated critical paths (above).
-        siblingPairFromRelatives(GraphWay::REVERSE, recipientp, true);
-        siblingPairFromRelatives(GraphWay::FORWARD, recipientp, true);
+        siblingPairFromRelatives<GraphWay::REVERSE, true>(recipientp);
+        siblingPairFromRelatives<GraphWay::FORWARD, true>(recipientp);
         unsigned edges = 0;
         for (V3GraphEdge* edgep = recipientp->outBeginp(); edgep; edgep = edgep->outNextp()) {
             LogicMTask* const postreqp = static_cast<LogicMTask*>(edgep->top());
-            siblingPairFromRelatives(GraphWay::REVERSE, postreqp, false);
+            siblingPairFromRelatives<GraphWay::REVERSE, false>(postreqp);
             ++edges;
-            if (edges > PART_SIBLING_EDGE_LIMIT) break;
+            if (edges >= PART_SIBLING_EDGE_LIMIT) break;
         }
         edges = 0;
         for (V3GraphEdge* edgep = recipientp->inBeginp(); edgep; edgep = edgep->inNextp()) {
             LogicMTask* const prereqp = static_cast<LogicMTask*>(edgep->fromp());
-            siblingPairFromRelatives(GraphWay::FORWARD, prereqp, false);
+            siblingPairFromRelatives<GraphWay::FORWARD, false>(prereqp);
             ++edges;
-            if (edges > PART_SIBLING_EDGE_LIMIT) break;
+            if (edges >= PART_SIBLING_EDGE_LIMIT) break;
         }
     }
 
@@ -1429,111 +1605,86 @@ private:
         m_scoreLimitBeforeRescore = 0xffffffff;
     }
 
-    static uint32_t mergeCandidateScore(const MergeCandidate* pairp) {
-        if (const MTaskEdge* const edgep = pairp->toMTaskEdge()) {
-            // The '1 +' favors merging a SiblingMC over an otherwise-
-            // equal-scoring MTaskEdge. The comment on selfTest() talks
-            // about why.
-            return 1 + edgeScore(edgep);
-        } else {
-            return siblingScore(pairp->toSiblingMC());
-        }
-        v3fatalSrc("Failed to cast pairp to either MTaskEdge or SiblingMC in mergeCandidateScore");
-        return 0;
-    }
-
-    VL_ATTR_NOINLINE
-    static uint32_t siblingScore(const SiblingMC* sibsp) {
-        const LogicMTask* const ap = sibsp->ap();
-        const LogicMTask* const bp = sibsp->bp();
-        const uint32_t mergedCpCostFwd
-            = std::max(ap->critPathCost(GraphWay::FORWARD), bp->critPathCost(GraphWay::FORWARD));
-        const uint32_t mergedCpCostRev
-            = std::max(ap->critPathCost(GraphWay::REVERSE), bp->critPathCost(GraphWay::REVERSE));
-        return mergedCpCostRev + mergedCpCostFwd + LogicMTask::stepCost(ap->cost() + bp->cost());
-    }
-
-    VL_ATTR_NOINLINE
-    static uint32_t edgeScore(const V3GraphEdge* edgep) {
-        // Score this edge. Lower is better. The score is the new local CP
-        // length if we merge these mtasks.  ("Local" means the longest
-        // critical path running through the merged node.)
-        const LogicMTask* const top = static_cast<LogicMTask*>(edgep->top());
-        const LogicMTask* const fromp = static_cast<LogicMTask*>(edgep->fromp());
-        const uint32_t mergedCpCostFwd
-            = std::max(fromp->critPathCost(GraphWay::FORWARD),
-                       top->critPathCostWithout(GraphWay::FORWARD, edgep));
-        const uint32_t mergedCpCostRev
-            = std::max(fromp->critPathCostWithout(GraphWay::REVERSE, edgep),
-                       top->critPathCost(GraphWay::REVERSE));
-        return mergedCpCostRev + mergedCpCostFwd
-               + LogicMTask::stepCost(fromp->cost() + top->cost());
-    }
-
     void makeSiblingMC(LogicMTask* ap, LogicMTask* bp) {
-        const SiblingMC newSibs(ap, bp);
-        const std::pair<SibSet::iterator, bool> insertResult = m_pairs.insert(newSibs);
-        if (insertResult.second) {
-            const SiblingMC* const newSibsp = &(*insertResult.first);
-            m_mtask2sibs[ap].insert(newSibsp);
-            m_mtask2sibs[bp].insert(newSibsp);
-            m_sb.addElem(newSibsp);
+        if (ap->id() < bp->id()) std::swap(ap, bp);  // Ensure ap has the higher id
+        // The higher id vertex (ap) owns the storage, bp only keeps a pointer to it
+        const auto emplaceResult = ap->ownSibs().emplace(ap, bp);
+        if (emplaceResult.second) {
+            SiblingMC* const newSibsp = const_cast<SiblingMC*>(&(*emplaceResult.first));
+            bp->farSibs().insert(newSibsp);
+            m_sb.add(newSibsp);
         } else if (m_slowAsserts) {
             // It's fine if we already have this SiblingMC, we may have
             // created it earlier. Just confirm that we have associated data.
-            UASSERT_OBJ(m_mtask2sibs.find(ap) != m_mtask2sibs.end(), ap, "Sibling not found");
-            UASSERT_OBJ(m_mtask2sibs.find(bp) != m_mtask2sibs.end(), bp, "Sibling not found");
             bool found = false;
-            for (SibpSet::iterator it = m_mtask2sibs[ap].begin(); it != m_mtask2sibs[ap].end();
-                 ++it) {
-                const SiblingMC* const sibsp = *it;
-                UASSERT_OBJ(!(!sibsp->removedFromSb() && !m_sb.contains(sibsp)), ap,
-                            "One sibling must be the one we collided with");
-                if ((sibsp->ap() == ap && sibsp->bp() == bp)
-                    || (sibsp->bp() == ap && sibsp->ap() == bp))
-                    found = true;
+            for (const SiblingMC& sibs : ap->ownSibs()) {
+                UASSERT_OBJ(sibs.ap() == ap, ap, "Inconsistent SiblingMC");
+                UASSERT_OBJ(m_sb.contains(&sibs), ap, "Must be on the scoreboard");
+                if (sibs.bp() == bp) found = true;
             }
             UASSERT_OBJ(found, ap, "Sibling not found");
         }
     }
 
-    void siblingPairFromRelatives(GraphWay way, V3GraphVertex* mtaskp, bool exhaustive) {
-        std::vector<LogicMTask*> shortestPrereqs;
+    template <GraphWay::en T_Way, bool Exhaustive>
+    void siblingPairFromRelatives(V3GraphVertex* mtaskp) {
+        constexpr GraphWay way{T_Way};
+        // Need at least 2 edges to form a pair
+        if (!mtaskp->beginp(way) || !mtaskp->beginp(way)->nextp(way)) return;
 
-        for (V3GraphEdge* edgep = mtaskp->beginp(way); edgep; edgep = edgep->nextp(way)) {
-            LogicMTask* const prereqp = static_cast<LogicMTask*>(edgep->furtherp(way));
-            shortestPrereqs.push_back(prereqp);
-            // Prevent nodes with huge numbers of edges from massively
-            // slowing down the partitioner:
-            if (shortestPrereqs.size() > PART_SIBLING_EDGE_LIMIT) break;
+        std::array<LogicMTask*, PART_SIBLING_EDGE_LIMIT> neighbours;
+
+        // This is a hot method, so we want to sort as efficiently as possible. We pre-load
+        // all data (critical path cost and id) required for determining ordering into an aligned
+        // structure. There is not enough space next to these to keep a whole pointer within 16
+        // bytes, so we store an index into the neighbours buffer instead. We can then compare
+        // and swap these sorting records very efficiently. With this the standard library sorting
+        // functions are efficient enough and using more optimized methods (e.g.: sorting networks)
+        // has no measurable benefit.
+        struct alignas(16) SortingRecord {
+            uint64_t m_id;  // Secondary sort key: id of the neighbour
+            uint32_t m_cp;  // Primary sort key: critPathCost(way) + cost()
+            uint8_t m_idx;  // Index of the neighbour in 'neighbours'
+            static_assert(PART_SIBLING_EDGE_LIMIT <= std::numeric_limits<uint8_t>::max(),
+                          "m_idx must fit all indices into 'neighbours'");
+            bool operator<(const SortingRecord& that) const {
+                return m_cp < that.m_cp || (m_cp == that.m_cp && m_id < that.m_id);
+            }
+        };
+        static_assert(sizeof(SortingRecord) <= 16, "How could this be padded to more than 16?");
+
+        std::array<SortingRecord, PART_SIBLING_EDGE_LIMIT> sortRecs;
+        size_t n = 0;  // Number of neighbours collected
+
+        // Populate the buffers
+        for (V3GraphEdge *edgep = mtaskp->beginp(way), *nextp; edgep; edgep = nextp) {
+            nextp = edgep->nextp(way);  // Fetch next first as likely cache miss
+            LogicMTask* const otherp = static_cast<LogicMTask*>(edgep->furtherp(way));
+            neighbours[n] = otherp;
+            sortRecs[n].m_id = otherp->id();
+            sortRecs[n].m_cp = otherp->critPathCost(way) + otherp->cost();
+            sortRecs[n].m_idx = n;
+            ++n;
+            // Prevent nodes with huge numbers of edges from massively slowing us down
+            if (n >= PART_SIBLING_EDGE_LIMIT) break;
         }
 
-        if (shortestPrereqs.size() <= 1) return;
-
-        const auto cmp = [way](const LogicMTask* ap, const LogicMTask* bp) {
-            const uint32_t aCp = ap->critPathCost(way) + ap->cost();
-            const uint32_t bCp = bp->critPathCost(way) + bp->cost();
-            if (aCp != bCp) return aCp < bCp;
-            return ap->id() < bp->id();
-        };
-
-        // Don't make all possible pairs of prereqs when not requested (non-exhaustive).
+        // Don't make all possible pairs of siblings when not requested (non-exhaustive).
         // Just make a few pairs.
         constexpr size_t MAX_NONEXHAUSTIVE_PAIRS = 3;
 
-        size_t end;  // End index of pairs to add to candidates (exclusive)
-
-        if (exhaustive || (shortestPrereqs.size() <= 2 * MAX_NONEXHAUSTIVE_PAIRS)) {
-            end = shortestPrereqs.size() & ~static_cast<size_t>(1);  // Round down to even
-            std::sort(shortestPrereqs.begin(), shortestPrereqs.end(), cmp);
+        if (Exhaustive || n <= 2 * MAX_NONEXHAUSTIVE_PAIRS) {
+            const size_t end = n & ~static_cast<size_t>(1);  // Round down to even (we want pairs)
+            std::sort(sortRecs.begin(), sortRecs.begin() + n);
+            for (size_t i = 0; i < end; i += 2) {
+                makeSiblingMC(neighbours[sortRecs[i].m_idx], neighbours[sortRecs[i + 1].m_idx]);
+            }
         } else {
-            end = 2 * MAX_NONEXHAUSTIVE_PAIRS;
-            std::partial_sort(shortestPrereqs.begin(), shortestPrereqs.begin() + end,
-                              shortestPrereqs.end(), cmp);
-        }
-
-        for (size_t i = 0; i < end; i += 2) {
-            makeSiblingMC(shortestPrereqs[i], shortestPrereqs[i + 1]);
+            constexpr size_t end = 2 * MAX_NONEXHAUSTIVE_PAIRS;
+            std::partial_sort(sortRecs.begin(), sortRecs.begin() + end, sortRecs.begin() + n);
+            for (size_t i = 0; i < end; i += 2) {
+                makeSiblingMC(neighbours[sortRecs[i].m_idx], neighbours[sortRecs[i + 1].m_idx]);
+            }
         }
     }
 
@@ -1850,17 +2001,15 @@ private:
                 }
                 // Move all vertices from donorp to mergedp
                 mergedp->moveAllVerticesFrom(donorp);
-                // Redirect edges from donorp to recipientp
-                partRedirectEdgesFrom(mergedp, donorp, nullptr);
-                // Remove donorp from the graph
-                VL_DO_DANGLING(donorp->unlinkDelete(m_mtasksp), donorp);
+                // Redirect edges from donorp to recipientp, delete donorp
+                partRedirectEdgesFrom(m_mtasksp, mergedp, donorp, nullptr);
                 ++m_mergesDone;
             }
 
             if (lastMergedp) {
                 UASSERT_OBJ(lastMergedp->rank() < mergedp->rank(), mergedp,
                             "Merging must be on lower rank");
-                if (!lastMergedp->hasRelative(GraphWay::FORWARD, mergedp)) {
+                if (!lastMergedp->hasRelativeMTask(mergedp)) {
                     new MTaskEdge(m_mtasksp, lastMergedp, mergedp, 1);
                 }
             }
@@ -2506,9 +2655,8 @@ void V3Partition::setupMTaskDeps(V3Graph* mtasksp, const Vx2MTaskMap* vx2mtaskp)
                 UASSERT_OBJ(otherMTaskp != mtaskp, mtaskp, "Would create a cycle edge");
 
                 // Don't create redundant edges.
-                if (mtaskp->hasRelative(GraphWay::FORWARD, otherMTaskp)) {  //
-                    continue;
-                }
+                if (mtaskp->hasRelativeMTask(otherMTaskp)) continue;
+
                 new MTaskEdge(mtasksp, mtaskp, otherMTaskp, 1);
             }
         }
diff --git a/src/V3Scoreboard.cpp b/src/V3Scoreboard.cpp
index 78d466596..d21422a81 100644
--- a/src/V3Scoreboard.cpp
+++ b/src/V3Scoreboard.cpp
@@ -19,26 +19,42 @@
 
 #include "V3Scoreboard.h"
 
-class ScoreboardTestElem final {
+class ScoreboardTestElem;
+
+struct Key {
+    // Note: Structure layout chosen to minimize padding in PairingHeap<*>::Node
+    uint64_t m_id;  // Unique ID part of the key (breaks ties between equal scores)
+    uint32_t m_score;  // Score part of the key
+    bool operator<(const Key& other) const {
+        // First by Score then by ID, but notice that we want minimums using a max-heap, so reverse
+        return m_score > other.m_score || (m_score == other.m_score && m_id > other.m_id);
+    }
+};
+
+using Scoreboard = V3Scoreboard<ScoreboardTestElem, Key>;
+
+class ScoreboardTestElem final : public Scoreboard::Node {
 public:
-    // MEMBERS
-    uint32_t m_score;
-    uint32_t m_id;
+    uint32_t m_newScore;  // Pending score, copied into the key by rescore()
     // CONSTRUCTORS
     explicit ScoreboardTestElem(uint32_t score)
-        : m_score{score} {
+        : m_newScore{score} {
+        m_key.m_score = m_newScore;
         static uint32_t s_serial = 0;
-        m_id = ++s_serial;
+        m_key.m_id = ++s_serial;  // Serial number gives each element a distinct id
     }
     ScoreboardTestElem() = default;
-    // METHODS
-    static uint32_t scoreFn(const ScoreboardTestElem* elp) { return elp->m_score; }
 
-    bool operator<(const ScoreboardTestElem& other) const { return m_id < other.m_id; }
+    uint64_t id() const { return m_key.m_id; }
+    void rescore() { m_key.m_score = m_newScore; }  // Apply the pending score to the key
+    uint32_t score() const { return m_key.m_score; }
+    static ScoreboardTestElem* heapNodeToElem(Scoreboard::Node* nodep) {
+        return static_cast<ScoreboardTestElem*>(nodep);
+    }
 };
 
 void V3ScoreboardBase::selfTest() {
-    V3Scoreboard<ScoreboardTestElem, uint32_t> sb(ScoreboardTestElem::scoreFn, true);
+    Scoreboard sb;
 
     UASSERT(!sb.needsRescore(), "SelfTest: Empty sb should not need rescore.");
 
@@ -46,13 +62,13 @@ void V3ScoreboardBase::selfTest() {
     ScoreboardTestElem e2(20);
     ScoreboardTestElem e3(30);
 
-    sb.addElem(&e1);
-    sb.addElem(&e2);
-    sb.addElem(&e3);
+    sb.add(&e1);
+    sb.add(&e2);
+    sb.add(&e3);
 
     UASSERT(sb.needsRescore(), "SelfTest: Newly filled sb should need a rescore.");
     UASSERT(sb.needsRescore(&e1), "SelfTest: Individual newly-added element should need rescore");
-    UASSERT(nullptr == sb.bestp(),
+    UASSERT(nullptr == sb.best(),
             "SelfTest: Newly filled sb should have nothing eligible for Bestp()");
 
     sb.rescore();
@@ -60,24 +76,22 @@ void V3ScoreboardBase::selfTest() {
     UASSERT(!sb.needsRescore(), "SelfTest: Newly rescored sb should not need rescore");
     UASSERT(!sb.needsRescore(&e1),
             "SelfTest: Newly rescored sb should not need an element rescored");
-    UASSERT(e2.m_score == sb.cachedScore(&e2),
-            "SelfTest: Cached score should match current score");
-    UASSERT(&e1 == sb.bestp(), "SelfTest: Should return element with lowest (best) score");
+    UASSERT(&e1 == sb.best(), "SelfTest: Should return element with lowest (best) score");
 
     // Change one element's score
     sb.hintScoreChanged(&e2);
-    e2.m_score = 21;
+    e2.m_newScore = 21;
     UASSERT(sb.needsRescore(&e2), "SelfTest: Should need rescore on elem after hintScoreChanged");
 
     // Remove an element
     UASSERT(sb.contains(&e1), "SelfTest: e1 should be there");
-    sb.removeElem(&e1);
+    sb.remove(&e1);
     UASSERT(!sb.contains(&e1), "SelfTest: e1 should be gone");
     UASSERT(sb.contains(&e2), "SelfTest: e2 should be there, despite needing rescore");
 
     // Now e3 should be our best-scoring element, even though
     // e2 has a better score, since e2 is pending rescore.
-    UASSERT(&e3 == sb.bestp(), "SelfTest: Expect e3 as best element with known score.");
+    UASSERT(&e3 == sb.best(), "SelfTest: Expect e3 as best element with known score.");
     sb.rescore();
-    UASSERT(&e2 == sb.bestp(), "SelfTest: Expect e2 as best element again after Rescore");
+    UASSERT(&e2 == sb.best(), "SelfTest: Expect e2 as best element again after Rescore");
 }
diff --git a/src/V3Scoreboard.h b/src/V3Scoreboard.h
index dc5fce0b0..4bf915431 100644
--- a/src/V3Scoreboard.h
+++ b/src/V3Scoreboard.h
@@ -1,13 +1,6 @@
 // -*- mode: C++; c-file-style: "cc-mode" -*-
 //*************************************************************************
-// DESCRIPTION: Verilator: Scoreboards for thread partitioner
-//
-// Provides scoreboard classes:
-//
-//  * SortByValueMap
-//  * V3Scoreboard
-//
-// See details below
+// DESCRIPTION: Verilator: Scoreboard for mtask coarsening
 //
 // Code available from: https://verilator.org
 //
@@ -28,248 +21,122 @@
 #include "verilatedos.h"
 
 #include "V3Error.h"
+#include "V3PairingHeap.h"
 
-#include <functional>
-#include <map>
-#include <set>
-#include <unordered_map>
+//===============================================================================================
+// V3Scoreboard is essentially a heap that can be hinted that some elements have changed keys, at
+// which point those elements will be deferred as 'unknown' until the next 'rescore' call. We
+// largely reuse the implementation of the slightly more generic PairingHeap, but we do rely on the
+// internal structure of the PairingHeap so changing that class requires changing this.
+//
+// For efficiency, the elements themselves must be the heap nodes, by deriving them from
+// V3Scoreboard<T_Elem, T_Key>::Node. This also means a single element can only be associated with
+// a single scoreboard.
 
-// ######################################################################
-//  SortByValueMap
-
-// A generic key-value map, except iteration is in *value* sorted order. Values need not be unique.
-// Uses T_KeyCompare to break ties in the sort when values collide. Note: Only const iteration is
-// possible, as updating mapped values via iterators is not safe.
-
-template <typename T_Key, typename T_Value, class T_KeyCompare = std::less<T_Key>>
-class SortByValueMap final {
-    // Current implementation is a std::set of key/value pairs, plus a std_unordered_map from keys
-    // to iterators into the set. This keeps most operations fairly cheap and also has the benefit
-    // of being able to re-use the std::set iterators.
-
-    // TYPES
-
-    using Pair = std::pair<T_Key, T_Value>;
-
-    struct PairCmp final {
-        bool operator()(const Pair& a, const Pair& b) const {
-            // First compare values
-            if (a.second != b.second) return a.second < b.second;
-            // Then compare keys
-            return T_KeyCompare{}(a.first, b.first);
-        }
-    };
-
-    using PairSet = std::set<Pair, PairCmp>;
-
-public:
-    using const_iterator = typename PairSet::const_iterator;
-    using const_reverse_iterator = typename PairSet::const_reverse_iterator;
-
-private:
-    // MEMBERS
-    PairSet m_pairs;  // The contents of the map, stored directly as key-value pairs
-    std::unordered_map<T_Key, const_iterator> m_kiMap;  // Key to iterator map
-
-    VL_UNCOPYABLE(SortByValueMap);
-
-public:
-    // CONSTRUCTORS
-    SortByValueMap() = default;
-
-    // Only const iteration is possible
-    const_iterator begin() const { return m_pairs.begin(); }
-    const_iterator end() const { return m_pairs.end(); }
-    const_iterator cbegin() const { m_pairs.cbegin(); }
-    const_iterator cend() const { return m_pairs.cend(); }
-    const_reverse_iterator rbegin() const { return m_pairs.rbegin(); }
-    const_reverse_iterator rend() const { return m_pairs.rend(); }
-    const_reverse_iterator crbegin() const { return m_pairs.crbegin(); }
-    const_reverse_iterator crend() const { return m_pairs.crend(); }
-
-    const_iterator find(const T_Key& key) const {
-        const auto kiIt = m_kiMap.find(key);
-        if (kiIt == m_kiMap.end()) return cend();
-        return kiIt->second;
-    }
-    size_t erase(const T_Key& key) {
-        const auto kiIt = m_kiMap.find(key);
-        if (kiIt == m_kiMap.end()) return 0;
-        m_pairs.erase(kiIt->second);
-        m_kiMap.erase(kiIt);
-        return 1;
-    }
-    void erase(const_iterator it) {
-        m_kiMap.erase(it->first);
-        m_pairs.erase(it);
-    }
-    void erase(const_reverse_iterator rit) {
-        m_kiMap.erase(rit->first);
-        m_pairs.erase(std::next(rit).base());
-    }
-    bool has(const T_Key& key) const { return m_kiMap.count(key); }
-    bool empty() const { return m_pairs.empty(); }
-    // Returns const reference.
-    const T_Value& at(const T_Key& key) const { return m_kiMap.at(key)->second; }
-    // Note this returns const_iterator
-    template <typename... Args>
-    std::pair<const_iterator, bool> emplace(const T_Key& key, Args&&... args) {
-        const auto kiEmp = m_kiMap.emplace(key, end());
-        if (kiEmp.second) {
-            const auto result = m_pairs.emplace(key, std::forward<Args>(args)...);
-#if VL_DEBUG
-            UASSERT(result.second, "Should not be in set yet");
-#endif
-            kiEmp.first->second = result.first;
-            return result;
-        }
-        return {kiEmp.first->second, false};
-    }
-    // Invalidates iterators
-    void update(const_iterator it, T_Value value) {
-        const auto kiIt = m_kiMap.find(it->first);
-        m_pairs.erase(it);
-        kiIt->second = m_pairs.emplace(kiIt->first, value).first;
-    }
-};
-
-//######################################################################
-
-/// V3Scoreboard takes a set of Elem*'s, each having some score.
-/// Scores are assigned by a user-supplied scoring function.
-///
-/// At any time, the V3Scoreboard can return the elem with the "best" score
-/// among those elements whose scores are known.
-///
-/// The best score is the _lowest_ score. This makes sense in contexts
-/// where scores represent costs.
-///
-/// The Scoreboard supports mutating element scores efficiently. The client
-/// must hint to the V3Scoreboard when an element's score may have
-/// changed. When it receives this hint, the V3Scoreboard will move the
-/// element into the set of elements whose scores are unknown. Later the
-/// client can tell V3Scoreboard to re-sort the list, which it does
-/// incrementally, by re-scoring all elements whose scores are unknown, and
-/// then moving these back into the score-sorted map. This is efficient
-/// when the subset of elements whose scores change is much smaller than
-/// the full set size.
-
-template <typename T_Elem, typename T_Score, class T_ElemCompare = std::less<T_Elem>>
+template <typename T_Elem, typename T_Key>
 class V3Scoreboard final {
-private:
     // TYPES
-    class CmpElems final {
-    public:
-        bool operator()(const T_Elem* const& ap, const T_Elem* const& bp) const {
-            const T_ElemCompare cmp;
-            return cmp.operator()(*ap, *bp);
-        }
-    };
-    using SortedMap = SortByValueMap<const T_Elem*, T_Score, CmpElems>;
-    using UserScoreFnp = T_Score (*)(const T_Elem*);
+    using Heap = PairingHeap<T_Key>;
+
+public:
+    using Node = typename Heap::Node;
+
+private:
+    using Link = typename Heap::Link;
+
+    // Note: T_Elem is incomplete here, so we cannot assert 'std::is_base_of<Node, T_Elem>::value'
 
     // MEMBERS
-    // Below uses set<> not an unordered_set<>. unordered_set::clear() and
-    // construction results in a 491KB clear operation to zero all the
-    // buckets. Since the set size is generally small, and we iterate the
-    // set members, set is better performant.
-    std::set<const T_Elem*> m_unknown;  // Elements with unknown scores
-    SortedMap m_sorted;  // Set of elements with known scores
-    const UserScoreFnp m_scoreFnp;  // Scoring function
-    const bool m_slowAsserts;  // Do some asserts that require extra lookups
+    Heap m_known;  // The heap of entries with known scores
+    Link m_unknown;  // List of entries with unknown scores
 
 public:
     // CONSTRUCTORS
-    explicit V3Scoreboard(UserScoreFnp scoreFnp, bool slowAsserts)
-        : m_scoreFnp{scoreFnp}
-        , m_slowAsserts{slowAsserts} {}
+    explicit V3Scoreboard() = default;
     ~V3Scoreboard() = default;
 
-    // METHODS
-
-    // Add an element to the scoreboard.
-    // Element begins in needs-rescore state; it won't be returned by
-    // bestp() until after the next rescore().
-    void addElem(const T_Elem* elp) {
-        if (m_slowAsserts) {
-            UASSERT(!contains(elp), "Adding element to scoreboard that was already in scoreboard");
-        }
-        m_unknown.insert(elp);
-    }
-
-    // Remove elp from scoreboard.
-    void removeElem(const T_Elem* elp) {
-        if (0 == m_sorted.erase(elp)) {
-            UASSERT(m_unknown.erase(elp),
-                    "Could not find requested elem to remove from scoreboard");
-        }
-    }
-
-    // Returns true if elp is present in the scoreboard, false otherwise.
-    //
-    // Note: every other V3Scoreboard routine that takes an T_Elem* has
-    // undefined behavior if the element is not in the scoreboard.
-    bool contains(const T_Elem* elp) const {
-        if (m_unknown.find(elp) != m_unknown.end()) return true;
-        return (m_sorted.find(elp) != m_sorted.end());
-    }
-
-    // Get the best element, with the lowest score (lower is better), among
-    // elements whose scores are known. Returns nullptr if no elements with
-    // known scores exist.
-    //
-    // Note: This does not automatically rescore. Client must call
-    // rescore() periodically to ensure all elems in the scoreboard are
-    // reflected in the result of bestp(). Otherwise, bestp() only
-    // considers elements that aren't pending rescore.
-    const T_Elem* bestp() {
-        const auto it = m_sorted.begin();
-        if (VL_UNLIKELY(it == m_sorted.end())) return nullptr;
-        return it->first;
-    }
-
-    // Tell the scoreboard that this element's score may have changed.
-    //
-    // At the time of this call, the element's score becomes "unknown"
-    // to the V3Scoreboard. Unknown elements won't be returned by bestp().
-    // The element's score will remain unknown until the next rescore().
-    //
-    // The client MUST call this for each element whose score has changed.
-    //
-    // The client MAY call this for elements whose score has not changed.
-    // Doing so incurs some compute cost (to re-sort the element back to
-    // its original location) and still makes it ineligible to be returned
-    // by bestp() until the next rescore().
-    void hintScoreChanged(const T_Elem* elp) {
-        m_unknown.insert(elp);
-        m_sorted.erase(elp);
-    }
-
-    // True if any element's score is unknown to V3Scoreboard.
-    bool needsRescore() { return !m_unknown.empty(); }
-    // False if elp's score is known to V3Scoreboard,
-    // else true if elp's score is unknown until the next rescore().
-    bool needsRescore(const T_Elem* elp) { return m_unknown.count(elp); }
-    // Retrieve the last known score for an element.
-    T_Score cachedScore(const T_Elem* elp) { return m_sorted.at(elp); }
-    // For each element whose score is unknown to V3Scoreboard,
-    // call the client's scoring function to get a new score,
-    // and sort all elements by their current score.
-    void rescore() {
-        for (const T_Elem* elp : m_unknown) {
-            VL_ATTR_UNUSED const bool exists = !m_sorted.emplace(elp, m_scoreFnp(elp)).second;
-#if VL_DEBUG
-            UASSERT(!exists, "Should not be in both m_unknown and m_sorted");
-#endif
-        }
-        m_unknown.clear();
-    }
-
 private:
     VL_UNCOPYABLE(V3Scoreboard);
+
+    // METHODS
+    void addUnknown(T_Elem* nodep) {
+        // Just prepend it to the list of unknown entries
+        nodep->m_next.link(m_unknown.unlink());
+        m_unknown.linkNonNull(nodep);
+        // We mark nodes on the unknown list by making their child pointer point to themselves
+        nodep->m_kids.m_ptr = nodep;
+    }
+
+public:
+    // Returns true if the element is present in the scoreboard, false otherwise. Every other
+    // method that takes a T_Elem* (except for 'add') has undefined behavior if the element is not
+    // in this scoreboard. Furthermore, this method is only valid if the element can only possibly
+    // be in this scoreboard. That is: if the element might be in another scoreboard, the behaviour
+    // of this method is undefined.
+    static bool contains(const T_Elem* nodep) { return nodep->m_ownerpp; }
+
+    // Add an element to the scoreboard. This will not be returned before the next 'rescore' call.
+    void add(T_Elem* nodep) {
+#if VL_DEBUG
+        UASSERT(!contains(nodep), "Adding element to scoreboard that was already in a scoreboard");
+#endif
+        addUnknown(nodep);
+    }
+
+    // Remove element from scoreboard.
+    void remove(T_Elem* nodep) {
+        if (nodep->m_kids.m_ptr == nodep) {
+            // Node is on the unknown list, replace with next
+            nodep->replaceWith(nodep->m_next.unlink());
+            return;
+        }
+        // Node is in the known heap, remove it
+        m_known.remove(nodep);
+    }
+
+    // Get the known element with the highest score (as we are using a max-heap), or nullptr if
+    // there are no elements with known scores. This does not automatically 'rescore'. The client
+    // must call 'rescore' appropriately to ensure all elements in the scoreboard are reflected in
+    // the result of this method.
+    T_Elem* best() const { return T_Elem::heapNodeToElem(m_known.max()); }
+
+    // Tell the scoreboard that this element's score may have changed. At the time of this call,
+    // the element's score becomes 'unknown' to the scoreboard. Unknown elements will not be
+    // returned by 'best' until the next call to 'rescore'.
+    void hintScoreChanged(T_Elem* nodep) {
+        // If it's already in the unknown list, then nothing to do
+        if (nodep->m_kids.m_ptr == nodep) return;
+        // Otherwise it was in the heap, remove it
+        m_known.remove(nodep);
+        // Prepend it to the unknown list
+        addUnknown(nodep);
+    }
+
+    // True if we have elements with unknown score
+    bool needsRescore() const { return m_unknown; }
+
+    // True if the element's score is unknown, false otherwise.
+    static bool needsRescore(const T_Elem* nodep) { return nodep->m_kids.m_ptr == nodep; }
+
+    // For each element whose score is unknown, recompute the score and add to the known heap
+    void rescore() {
+        // Rescore and insert all unknown elements
+        for (Node *nodep = m_unknown.unlink(), *nextp; nodep; nodep = nextp) {
+            // Pick up next
+            nextp = nodep->m_next.ptr();
+            // Reset pointers
+            nodep->m_next.m_ptr = nullptr;
+            nodep->m_kids.m_ptr = nullptr;
+            nodep->m_ownerpp = nullptr;
+            // Re-compute the score of the element
+            T_Elem::heapNodeToElem(nodep)->rescore();
+            // re-insert into the heap
+            m_known.insert(nodep);
+        }
+    }
 };
 
-//######################################################################
+// ######################################################################
 
 namespace V3ScoreboardBase {
 void selfTest();