Improve performance of MTask coarsening

Various optimizations to speed up MTask coarsening (which is the long pole in the multi-threaded scheduling of very large designs).

The biggest impact ones:
- Use efficient hand written Pairing Heaps for implementing priority queues and the scoreboard, instead of the old SortByValueMap. This helps us avoid having to sort a lot of merge candidates that we will never actually consider and helps a lot in performance.
- Remove unnecessary associative containers and store data structures (the heap nodes in particular) directly in the object they relate to. This eliminates a huge amount of lookups and helps a lot in performance.
- Distribute storage for SiblingMC instances into the LogicMTask instances, and combine with the sibling maps. This again eliminates hash table lookups and makes storage structures smaller.
- Remove some now bidirectional edge maps, keep only the forward map.

There are also some other smaller optimizations:
- Replaced more unnecessary dynamic_casts with static_casts
- Templated some functions/classes to reduce the number of static branches in loops.
- Improves sorting of edges for sibling candidate creation
- Various micro-optimizations here and there

This speeds up MTask coarsening by 3.8x on a large design, which translates to a 2.5x speedup of the ordering pass in multi-threaded mode. (Combined with the earlier optimizations, ordering is now 3x faster.)

Due to the elimination of a lot of the auxiliary data structures, and ensuring a minimal size for the necessary ones, memory consumption of the MTask coarsening is also reduced (measured up to 4.4x reduction though the accuracy of this is low).

The algorithm is identical except for minor alterations of the order some candidates are added or removed; this can cause perturbation in the output due to tied scores being broken based on IDs.
2025-04-16 01:26:54 +00:00 · 2022-08-07 14:11:58 +01:00 · 2022-08-07 14:11:58 +01:00 · 9ac64d0b92
commit 9ac64d0b92
parent c6607724cb
7 changed files with 1045 additions and 705 deletions
--- a/1
+++ b/1
@ -21,6 +21,7 @@ Verilator 4.225 devel
 * Fix incorrect tristate logic (#3399) [shareefj, Vighnesh Iyer]
 * Fix segfault exporting non-existant package (#3535).
 * Fix case statement comparing string literal (#3544). [Gustav Svensk]
+* Improve Verilation speed with --threads on large designs. [Geza Lore]


 Verilator 4.224 2022-06-19
--- a/include/verilatedos.h
+++ b/include/verilatedos.h
@ -530,6 +530,13 @@ using ssize_t = uint32_t;  ///< signed size_t; returned from read()
 #define VL_STRINGIFY(x) VL_STRINGIFY2(x)
 #define VL_STRINGIFY2(x) #x

+//=========================================================================
+// Offset of field in type
+
+// Address zero can cause compiler problems
+#define VL_OFFSETOF(type, field) \
+    (reinterpret_cast<size_t>(&(reinterpret_cast<type*>(0x10000000)->field)) - 0x10000000)
+
 //=========================================================================
 // Conversions

--- a/src/V3Graph.h
+++ b/src/V3Graph.h
@ -67,7 +67,7 @@ public:
        return names[m_e];
    }
    // METHODS unique to this class
-    constexpr GraphWay invert() const { return m_e == FORWARD ? REVERSE : FORWARD; }
+    constexpr GraphWay invert() const { return GraphWay{m_e ^ 1}; }
    constexpr bool forward() const { return m_e == FORWARD; }
    constexpr bool reverse() const { return m_e != FORWARD; }
 };
--- a/src/V3PairingHeap.h
+++ b/src/V3PairingHeap.h
@ -0,0 +1,303 @@
+// -*- mode: C++; c-file-style: "cc-mode" -*-
+//*************************************************************************
+// DESCRIPTION: Verilator: Pairing Heap data structure
+//
+// Code available from: https://verilator.org
+//
+//*************************************************************************
+//
+// Copyright 2003-2022 by Wilson Snyder. This program is free software; you
+// can redistribute it and/or modify it under the terms of either the GNU
+// Lesser General Public License Version 3 or the Perl Artistic License
+// Version 2.0.
+// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+//
+//*************************************************************************
+
+#ifndef VERILATOR_V3PAIRINGHEAP_H_
+#define VERILATOR_V3PAIRINGHEAP_H_
+
+#include "config_build.h"
+#include "verilatedos.h"
+
+#include "V3Error.h"
+
+//=============================================================================
+// Pairing heap (max-heap) with increase key and delete.
+//
+// While this is written as a generic data structure, its interface and
+// implementation are finely tuned for its use by V3Partition, and are critical
+// to Verilation performance, so be very careful changing anything or adding any
+// new operations that would impact either memory usage, or performance of the
+// existing operations. This data structure is fully deterministic, meaning
+// the order in which elements with equal keys are retrieved only depends on
+// the order of operations performed on the heap.
+//=============================================================================
+
+template <typename T_Key>
+class PairingHeap final {
+public:
+    struct Node;
+
+    // Just a pointer to a heap Node, but with special accessors to help keep back pointers
+    // consistent.
+    struct Link {
+        Node* m_ptr = nullptr;  // The managed pointer
+
+        Link() = default;
+        VL_UNCOPYABLE(Link);
+
+        // Make the pointer point to the target, and the target's owner pointer to this pointer
+        VL_ATTR_ALWINLINE void link(Node* targetp) {
+            m_ptr = targetp;
+            if (!targetp) return;
+#if VL_DEBUG
+            UASSERT(!targetp->m_ownerpp, "Already linked");
+#endif
+            targetp->m_ownerpp = &m_ptr;
+        }
+
+        // As 'link', but the target must not be null (it is dereferenced without a check)
+        VL_ATTR_ALWINLINE void linkNonNull(Node* targetp) {
+            m_ptr = targetp;
+#if VL_DEBUG
+            UASSERT(!targetp->m_ownerpp, "Already linked");
+#endif
+            targetp->m_ownerpp = &m_ptr;
+        }
+
+        // Clear the pointer and return its previous value
+        VL_ATTR_ALWINLINE Node* unlink() {
+            Node* const result = m_ptr;
+#if VL_DEBUG
+            if (result) {
+                UASSERT(m_ptr->m_ownerpp == &m_ptr, "Bad back link");
+                // Not strictly necessary to clear this, but helps debugging
+                m_ptr->m_ownerpp = nullptr;
+            }
+#endif
+            m_ptr = nullptr;
+            return result;
+        }
+
+        // Minimal convenience accessors and operators
+        VL_ATTR_ALWINLINE Node* ptr() const { return m_ptr; }
+        VL_ATTR_ALWINLINE operator bool() const { return m_ptr; }
+        VL_ATTR_ALWINLINE bool operator!() const { return !m_ptr; }
+        VL_ATTR_ALWINLINE Node* operator->() const { return m_ptr; }
+        VL_ATTR_ALWINLINE Node& operator*() const { return *m_ptr; }
+    };
+
+    // A single node in the pairing heap tree
+    struct Node {
+        Link m_next;  // Next in list of sibling heaps
+        Link m_kids;  // Head of list of child heaps
+        Node** m_ownerpp = nullptr;  // Pointer to the Link pointer pointing to this heap
+        T_Key m_key;  // The key in the heap
+
+        // CONSTRUCTOR
+        explicit Node() = default;
+        VL_UNCOPYABLE(Node);
+
+        // METHODS
+        VL_ATTR_ALWINLINE const T_Key& key() const { return m_key; }
+        VL_ATTR_ALWINLINE bool operator<(const Node& that) const { return m_key < that.m_key; }
+        VL_ATTR_ALWINLINE bool operator>(const Node& that) const { return that.m_key < m_key; }
+
+        // Make newp take the place of this in the tree
+        VL_ATTR_ALWINLINE void replaceWith(Node* newp) {
+            *m_ownerpp = newp;  // The owner pointer needs to point to the new node
+            if (newp) newp->m_ownerpp = m_ownerpp;  // The new node needs to point to its owner
+            m_ownerpp = nullptr;  // This node has no owner anymore
+        }
+
+        // As 'replaceWith', but newp must not be null (it is dereferenced without a check)
+        VL_ATTR_ALWINLINE void replaceWithNonNull(Node* newp) {
+            *m_ownerpp = newp;  // The owner pointer needs to point to the new node
+            newp->m_ownerpp = m_ownerpp;  // The new node needs to point to its owner
+            m_ownerpp = nullptr;  // This node has no owner anymore
+        }
+
+        // Yank this node out of the heap it currently is in. This node can then be safely inserted
+        // into another heap. Note that this leaves the heap the node is currently under in an
+        // inconsistent state, so you cannot access it anymore. Still this can save a remove if we
+        // don't care about the state of the source heap.
+        VL_ATTR_ALWINLINE void yank() {
+            m_next.link(nullptr);
+            m_kids.link(nullptr);
+            m_ownerpp = nullptr;
+        }
+    };
+
+private:
+    // MEMBERS
+
+    // The root of the heap. Note: We do not reduce lists during insertion/removal etc, unless we
+    // absolutely have to. This means the root can become a list. This is ok, we will reduce
+    // lazily when requesting the maximum element.
+    mutable Link m_root;
+
+    // CONSTRUCTORS
+    VL_UNCOPYABLE(PairingHeap);
+
+public:
+    explicit PairingHeap() = default;
+
+    // METHODS
+    bool empty() const { return !m_root; }
+
+    // Insert given node into this heap with given key.
+    void insert(Node* nodep, T_Key key) {
+        // Update key of node
+        nodep->m_key = key;
+        insert(nodep);
+    }
+
+    // Insert given node into this heap with key already set in the node
+    void insert(Node* nodep) {
+#if VL_DEBUG
+        UASSERT(!nodep->m_ownerpp && !nodep->m_next && !nodep->m_kids, "Already linked");
+#endif
+        // Just stick it at the front of the root list
+        nodep->m_next.link(m_root.unlink());
+        m_root.linkNonNull(nodep);
+    }
+
+    // Remove given node only from the heap it is contained in
+    void remove(Node* nodep) {
+        if (!nodep->m_next) {
+            // If the node does not have siblings, replace it with its children (might be empty).
+            nodep->replaceWith(nodep->m_kids.unlink());
+        } else if (!nodep->m_kids) {
+            // If it has siblings but no children, replace it with the siblings.
+            nodep->replaceWithNonNull(nodep->m_next.unlink());
+        } else {
+            // If it has both siblings and children, reduce the children and splice that
+            // reduced heap in place of this node
+            Node* const reducedKidsp = reduce(nodep->m_kids.unlink());
+            reducedKidsp->m_next.linkNonNull(nodep->m_next.unlink());
+            nodep->replaceWithNonNull(reducedKidsp);
+        }
+    }
+
+    // Returns the largest element in the heap
+    Node* max() const {
+        // Heap might be empty
+        if (!m_root) return nullptr;
+        // If the root has siblings, reduce them
+        if (m_root->m_next) m_root.linkNonNull(reduce(m_root.unlink()));
+        // The root element is the largest
+        return m_root.ptr();
+    }
+
+    // Returns the second-largest element in the heap.
+    // This is only valid to call if 'max' returned a valid element.
+    Node* secondMax() const {
+#if VL_DEBUG
+        UASSERT(m_root, "'max' would have returned nullptr");
+        UASSERT(!m_root->m_next, "'max' would have reduced");
+#endif
+        // If there are no children, there is no second element
+        if (!m_root->m_kids) return nullptr;
+        // If there are multiple children, reduce them
+        if (m_root->m_kids->m_next) m_root->m_kids.linkNonNull(reduce(m_root->m_kids.unlink()));
+        // Return the now singular child, which is the second-largest element
+        return m_root->m_kids.ptr();
+    }
+
+    // Increase the key of the given node to the given new value
+    template <typename T_Update>
+    void increaseKey(Node* nodep, T_Update value) {
+        // Update the key
+        nodep->m_key.increase(value);
+        // Increasing the key of the root is easy
+        if (nodep == m_root.ptr()) return;
+        // Otherwise we do have a little work to do
+        if (!nodep->m_kids) {
+            // If the node has no children, replace it with its siblings (might be null)
+            nodep->replaceWith(nodep->m_next.unlink());
+        } else if (!nodep->m_next) {
+            // If the node has no siblings, replace it with its children
+            nodep->replaceWithNonNull(nodep->m_kids.unlink());
+        } else {
+            // The node has both children and siblings. Splice the first child in the place of the
+            // node, and extract the rest of the children with the node
+            Node* const kidsp = nodep->m_kids.unlink();
+            nodep->m_kids.link(kidsp->m_next.unlink());
+            kidsp->m_next.linkNonNull(nodep->m_next.unlink());
+            nodep->replaceWithNonNull(kidsp);
+        }
+        // Just stick the increased node at the front of the root list
+        nodep->m_next.linkNonNull(m_root.unlink());
+        m_root.linkNonNull(nodep);
+    }
+
+private:
+    // Meld (merge) two heaps rooted at the given nodes, return the root of the new heap
+    VL_ATTR_ALWINLINE static Node* merge(Node* ap, Node* bp) {
+#if VL_DEBUG
+        UASSERT(!ap->m_ownerpp && !ap->m_next, "Not root a");
+        UASSERT(!bp->m_ownerpp && !bp->m_next, "Not root b");
+#endif
+        if (*ap > *bp) {  // bp goes under ap
+            bp->m_next.link(ap->m_kids.unlink());
+            ap->m_kids.linkNonNull(bp);
+            return ap;
+        } else {  // ap goes under bp
+            ap->m_next.link(bp->m_kids.unlink());
+            bp->m_kids.linkNonNull(ap);
+            return bp;
+        }
+    }
+
+    // Reduces the list of nodes starting at the given node into a single node that is returned
+    VL_ATTR_NOINLINE static Node* reduce(Node* nodep) {
+#if VL_DEBUG
+        UASSERT(!nodep->m_ownerpp, "Node is linked");
+#endif
+        // If there is only one node in the list, then there is nothing to do
+        if (!nodep->m_next) return nodep;
+        // The result node
+        Node* resultp = nullptr;
+        // Pairwise merge the child nodes
+        while (nodep) {
+            // Pop off the first node
+            Node* const ap = nodep;
+            // If we have an odd number of nodes, prepend the unpaired one onto the result list
+            if (!nodep->m_next) {
+                ap->m_next.link(resultp);
+                resultp = ap;
+                break;
+            }
+            // Pop off the second node
+            Node* const bp = nodep->m_next.unlink();
+            // Keep hold of the rest of the list
+            nodep = bp->m_next.unlink();
+            // Merge the current pair
+            Node* const mergedp = merge(ap, bp);
+            // Prepend the merged pair to the result list
+            mergedp->m_next.link(resultp);
+            resultp = mergedp;
+        }
+        // Now merge-reduce the merged pairs
+        while (resultp->m_next) {
+            // Pop first two results
+            Node* const ap = resultp;
+            Node* const bp = resultp->m_next.unlink();
+            // Keep hold of the rest of the list
+            resultp = bp->m_next.unlink();
+            // Merge the current pair
+            Node* const mergedp = merge(ap, bp);
+            // Prepend the merged pair to the result list
+            mergedp->m_next.link(resultp);
+            resultp = mergedp;
+        }
+        // Done
+        return resultp;
+    }
+};
+
+// The PairingHeap itself should be a simple pointer and nothing more
+static_assert(sizeof(PairingHeap<int>) == sizeof(PairingHeap<int>::Node*), "Should be a pointer");
+
+#endif  // Guard
--- a/src/V3Partition.cpp
+++ b/src/V3Partition.cpp
--- a/src/V3Scoreboard.cpp
+++ b/src/V3Scoreboard.cpp
@ -19,26 +19,42 @@

 #include "V3Scoreboard.h"

-class ScoreboardTestElem final {
+class ScoreboardTestElem;
+
+struct Key {
+    // Note: Structure layout chosen to minimize padding in PairingHeap<*>::Node
+    uint64_t m_id;  // Unique ID part of edge score
+    uint32_t m_score;  // Score part of ID
+    bool operator<(const Key& other) const {
+        // First by Score then by ID, but notice that we want minimums using a max-heap, so reverse
+        return m_score > other.m_score || (m_score == other.m_score && m_id > other.m_id);
+    }
+};
+
+using Scoreboard = V3Scoreboard<ScoreboardTestElem, Key>;
+
+class ScoreboardTestElem final : public Scoreboard::Node {
 public:
-    // MEMBERS
-    uint32_t m_score;
-    uint32_t m_id;
+    uint32_t m_newScore;
    // CONSTRUCTORS
    explicit ScoreboardTestElem(uint32_t score)
-        : m_score{score} {
+        : m_newScore{score} {
+        m_key.m_score = m_newScore;
        static uint32_t s_serial = 0;
-        m_id = ++s_serial;
+        m_key.m_id = ++s_serial;
    }
    ScoreboardTestElem() = default;
-    // METHODS
-    static uint32_t scoreFn(const ScoreboardTestElem* elp) { return elp->m_score; }

-    bool operator<(const ScoreboardTestElem& other) const { return m_id < other.m_id; }
+    uint64_t id() const { return m_key.m_id; }
+    void rescore() { m_key.m_score = m_newScore; }
+    uint32_t score() const { return m_key.m_score; }
+    static ScoreboardTestElem* heapNodeToElem(Scoreboard::Node* nodep) {
+        return static_cast<ScoreboardTestElem*>(nodep);
+    }
 };

 void V3ScoreboardBase::selfTest() {
-    V3Scoreboard<ScoreboardTestElem, uint32_t> sb(ScoreboardTestElem::scoreFn, true);
+    Scoreboard sb;

    UASSERT(!sb.needsRescore(), "SelfTest: Empty sb should not need rescore.");

@ -46,13 +62,13 @@ void V3ScoreboardBase::selfTest() {
    ScoreboardTestElem e2(20);
    ScoreboardTestElem e3(30);

-    sb.addElem(&e1);
-    sb.addElem(&e2);
-    sb.addElem(&e3);
+    sb.add(&e1);
+    sb.add(&e2);
+    sb.add(&e3);

    UASSERT(sb.needsRescore(), "SelfTest: Newly filled sb should need a rescore.");
    UASSERT(sb.needsRescore(&e1), "SelfTest: Individual newly-added element should need rescore");
-    UASSERT(nullptr == sb.bestp(),
+    UASSERT(nullptr == sb.best(),
            "SelfTest: Newly filled sb should have nothing eligible for Bestp()");

    sb.rescore();
@ -60,24 +76,22 @@ void V3ScoreboardBase::selfTest() {
    UASSERT(!sb.needsRescore(), "SelfTest: Newly rescored sb should not need rescore");
    UASSERT(!sb.needsRescore(&e1),
            "SelfTest: Newly rescored sb should not need an element rescored");
-    UASSERT(e2.m_score == sb.cachedScore(&e2),
-            "SelfTest: Cached score should match current score");
-    UASSERT(&e1 == sb.bestp(), "SelfTest: Should return element with lowest (best) score");
+    UASSERT(&e1 == sb.best(), "SelfTest: Should return element with lowest (best) score");

    // Change one element's score
    sb.hintScoreChanged(&e2);
-    e2.m_score = 21;
+    e2.m_newScore = 21;
    UASSERT(sb.needsRescore(&e2), "SelfTest: Should need rescore on elem after hintScoreChanged");

    // Remove an element
    UASSERT(sb.contains(&e1), "SelfTest: e1 should be there");
-    sb.removeElem(&e1);
+    sb.remove(&e1);
    UASSERT(!sb.contains(&e1), "SelfTest: e1 should be gone");
    UASSERT(sb.contains(&e2), "SelfTest: e2 should be there, despite needing rescore");

    // Now e3 should be our best-scoring element, even though
    // e2 has a better score, since e2 is pending rescore.
-    UASSERT(&e3 == sb.bestp(), "SelfTest: Expect e3 as best element with known score.");
+    UASSERT(&e3 == sb.best(), "SelfTest: Expect e3 as best element with known score.");
    sb.rescore();
-    UASSERT(&e2 == sb.bestp(), "SelfTest: Expect e2 as best element again after Rescore");
+    UASSERT(&e2 == sb.best(), "SelfTest: Expect e2 as best element again after Rescore");
 }
--- a/src/V3Scoreboard.h
+++ b/src/V3Scoreboard.h
@ -1,13 +1,6 @@
 // -*- mode: C++; c-file-style: "cc-mode" -*-
 //*************************************************************************
-// DESCRIPTION: Verilator: Scoreboards for thread partitioner
-//
-// Provides scoreboard classes:
-//
-//  * SortByValueMap
-//  * V3Scoreboard
-//
-// See details below
+// DESCRIPTION: Verilator: Scoreboard for mtask coarsening
 //
 // Code available from: https://verilator.org
 //
@ -28,248 +21,122 @@
 #include "verilatedos.h"

 #include "V3Error.h"
+#include "V3PairingHeap.h"

-#include <functional>
-#include <map>
-#include <set>
-#include <unordered_map>
+//===============================================================================================
+// V3Scoreboard is essentially a heap that can be hinted that some elements have changed keys, at
+// which point those elements will be deferred as 'unknown' until the next 'rescore' call. We
+// largely reuse the implementation of the slightly more generic PairingHeap, but we do rely on the
+// internal structure of the PairingHeap so changing that class requires changing this.
+//
+// For efficiency, the elements themselves must be the heap nodes, by deriving them from
+// V3Scoreboard<T_Elem, T_Key>::Node. This also means a single element can only be associated with
+// a single scoreboard.

-// ######################################################################
-//  SortByValueMap
-
-// A generic key-value map, except iteration is in *value* sorted order. Values need not be unique.
-// Uses T_KeyCompare to break ties in the sort when values collide. Note: Only const iteration is
-// possible, as updating mapped values via iterators is not safe.
-
-template <typename T_Key, typename T_Value, class T_KeyCompare = std::less<T_Key>>
-class SortByValueMap final {
-    // Current implementation is a std::set of key/value pairs, plus a std_unordered_map from keys
-    // to iterators into the set. This keeps most operations fairly cheap and also has the benefit
-    // of being able to re-use the std::set iterators.
-
-    // TYPES
-
-    using Pair = std::pair<T_Key, T_Value>;
-
-    struct PairCmp final {
-        bool operator()(const Pair& a, const Pair& b) const {
-            // First compare values
-            if (a.second != b.second) return a.second < b.second;
-            // Then compare keys
-            return T_KeyCompare{}(a.first, b.first);
-        }
-    };
-
-    using PairSet = std::set<Pair, PairCmp>;
-
-public:
-    using const_iterator = typename PairSet::const_iterator;
-    using const_reverse_iterator = typename PairSet::const_reverse_iterator;
-
-private:
-    // MEMBERS
-    PairSet m_pairs;  // The contents of the map, stored directly as key-value pairs
-    std::unordered_map<T_Key, const_iterator> m_kiMap;  // Key to iterator map
-
-    VL_UNCOPYABLE(SortByValueMap);
-
-public:
-    // CONSTRUCTORS
-    SortByValueMap() = default;
-
-    // Only const iteration is possible
-    const_iterator begin() const { return m_pairs.begin(); }
-    const_iterator end() const { return m_pairs.end(); }
-    const_iterator cbegin() const { m_pairs.cbegin(); }
-    const_iterator cend() const { return m_pairs.cend(); }
-    const_reverse_iterator rbegin() const { return m_pairs.rbegin(); }
-    const_reverse_iterator rend() const { return m_pairs.rend(); }
-    const_reverse_iterator crbegin() const { return m_pairs.crbegin(); }
-    const_reverse_iterator crend() const { return m_pairs.crend(); }
-
-    const_iterator find(const T_Key& key) const {
-        const auto kiIt = m_kiMap.find(key);
-        if (kiIt == m_kiMap.end()) return cend();
-        return kiIt->second;
-    }
-    size_t erase(const T_Key& key) {
-        const auto kiIt = m_kiMap.find(key);
-        if (kiIt == m_kiMap.end()) return 0;
-        m_pairs.erase(kiIt->second);
-        m_kiMap.erase(kiIt);
-        return 1;
-    }
-    void erase(const_iterator it) {
-        m_kiMap.erase(it->first);
-        m_pairs.erase(it);
-    }
-    void erase(const_reverse_iterator rit) {
-        m_kiMap.erase(rit->first);
-        m_pairs.erase(std::next(rit).base());
-    }
-    bool has(const T_Key& key) const { return m_kiMap.count(key); }
-    bool empty() const { return m_pairs.empty(); }
-    // Returns const reference.
-    const T_Value& at(const T_Key& key) const { return m_kiMap.at(key)->second; }
-    // Note this returns const_iterator
-    template <typename... Args>
-    std::pair<const_iterator, bool> emplace(const T_Key& key, Args&&... args) {
-        const auto kiEmp = m_kiMap.emplace(key, end());
-        if (kiEmp.second) {
-            const auto result = m_pairs.emplace(key, std::forward<Args>(args)...);
-#if VL_DEBUG
-            UASSERT(result.second, "Should not be in set yet");
-#endif
-            kiEmp.first->second = result.first;
-            return result;
-        }
-        return {kiEmp.first->second, false};
-    }
-    // Invalidates iterators
-    void update(const_iterator it, T_Value value) {
-        const auto kiIt = m_kiMap.find(it->first);
-        m_pairs.erase(it);
-        kiIt->second = m_pairs.emplace(kiIt->first, value).first;
-    }
-};
-
-//######################################################################
-
-/// V3Scoreboard takes a set of Elem*'s, each having some score.
-/// Scores are assigned by a user-supplied scoring function.
-///
-/// At any time, the V3Scoreboard can return the elem with the "best" score
-/// among those elements whose scores are known.
-///
-/// The best score is the _lowest_ score. This makes sense in contexts
-/// where scores represent costs.
-///
-/// The Scoreboard supports mutating element scores efficiently. The client
-/// must hint to the V3Scoreboard when an element's score may have
-/// changed. When it receives this hint, the V3Scoreboard will move the
-/// element into the set of elements whose scores are unknown. Later the
-/// client can tell V3Scoreboard to re-sort the list, which it does
-/// incrementally, by re-scoring all elements whose scores are unknown, and
-/// then moving these back into the score-sorted map. This is efficient
-/// when the subset of elements whose scores change is much smaller than
-/// the full set size.
-
-template <typename T_Elem, typename T_Score, class T_ElemCompare = std::less<T_Elem>>
+template <typename T_Elem, typename T_Key>
 class V3Scoreboard final {
-private:
    // TYPES
-    class CmpElems final {
-    public:
-        bool operator()(const T_Elem* const& ap, const T_Elem* const& bp) const {
-            const T_ElemCompare cmp;
-            return cmp.operator()(*ap, *bp);
-        }
-    };
-    using SortedMap = SortByValueMap<const T_Elem*, T_Score, CmpElems>;
-    using UserScoreFnp = T_Score (*)(const T_Elem*);
+    using Heap = PairingHeap<T_Key>;
+
+public:
+    using Node = typename Heap::Node;
+
+private:
+    using Link = typename Heap::Link;
+
+    // Note: T_Elem is incomplete here, so we cannot assert 'std::is_base_of<Node, T_Elem>::value'

    // MEMBERS
-    // Below uses set<> not an unordered_set<>. unordered_set::clear() and
-    // construction results in a 491KB clear operation to zero all the
-    // buckets. Since the set size is generally small, and we iterate the
-    // set members, set is better performant.
-    std::set<const T_Elem*> m_unknown;  // Elements with unknown scores
-    SortedMap m_sorted;  // Set of elements with known scores
-    const UserScoreFnp m_scoreFnp;  // Scoring function
-    const bool m_slowAsserts;  // Do some asserts that require extra lookups
+    Heap m_known;  // The heap of entries with known scores
+    Link m_unknown;  // List of entries with unknown scores

 public:
    // CONSTRUCTORS
-    explicit V3Scoreboard(UserScoreFnp scoreFnp, bool slowAsserts)
-        : m_scoreFnp{scoreFnp}
-        , m_slowAsserts{slowAsserts} {}
+    explicit V3Scoreboard() = default;
    ~V3Scoreboard() = default;

-    // METHODS
-
-    // Add an element to the scoreboard.
-    // Element begins in needs-rescore state; it won't be returned by
-    // bestp() until after the next rescore().
-    void addElem(const T_Elem* elp) {
-        if (m_slowAsserts) {
-            UASSERT(!contains(elp), "Adding element to scoreboard that was already in scoreboard");
-        }
-        m_unknown.insert(elp);
-    }
-
-    // Remove elp from scoreboard.
-    void removeElem(const T_Elem* elp) {
-        if (0 == m_sorted.erase(elp)) {
-            UASSERT(m_unknown.erase(elp),
-                    "Could not find requested elem to remove from scoreboard");
-        }
-    }
-
-    // Returns true if elp is present in the scoreboard, false otherwise.
-    //
-    // Note: every other V3Scoreboard routine that takes an T_Elem* has
-    // undefined behavior if the element is not in the scoreboard.
-    bool contains(const T_Elem* elp) const {
-        if (m_unknown.find(elp) != m_unknown.end()) return true;
-        return (m_sorted.find(elp) != m_sorted.end());
-    }
-
-    // Get the best element, with the lowest score (lower is better), among
-    // elements whose scores are known. Returns nullptr if no elements with
-    // known scores exist.
-    //
-    // Note: This does not automatically rescore. Client must call
-    // rescore() periodically to ensure all elems in the scoreboard are
-    // reflected in the result of bestp(). Otherwise, bestp() only
-    // considers elements that aren't pending rescore.
-    const T_Elem* bestp() {
-        const auto it = m_sorted.begin();
-        if (VL_UNLIKELY(it == m_sorted.end())) return nullptr;
-        return it->first;
-    }
-
-    // Tell the scoreboard that this element's score may have changed.
-    //
-    // At the time of this call, the element's score becomes "unknown"
-    // to the V3Scoreboard. Unknown elements won't be returned by bestp().
-    // The element's score will remain unknown until the next rescore().
-    //
-    // The client MUST call this for each element whose score has changed.
-    //
-    // The client MAY call this for elements whose score has not changed.
-    // Doing so incurs some compute cost (to re-sort the element back to
-    // its original location) and still makes it ineligible to be returned
-    // by bestp() until the next rescore().
-    void hintScoreChanged(const T_Elem* elp) {
-        m_unknown.insert(elp);
-        m_sorted.erase(elp);
-    }
-
-    // True if any element's score is unknown to V3Scoreboard.
-    bool needsRescore() { return !m_unknown.empty(); }
-    // False if elp's score is known to V3Scoreboard,
-    // else true if elp's score is unknown until the next rescore().
-    bool needsRescore(const T_Elem* elp) { return m_unknown.count(elp); }
-    // Retrieve the last known score for an element.
-    T_Score cachedScore(const T_Elem* elp) { return m_sorted.at(elp); }
-    // For each element whose score is unknown to V3Scoreboard,
-    // call the client's scoring function to get a new score,
-    // and sort all elements by their current score.
-    void rescore() {
-        for (const T_Elem* elp : m_unknown) {
-            VL_ATTR_UNUSED const bool exists = !m_sorted.emplace(elp, m_scoreFnp(elp)).second;
-#if VL_DEBUG
-            UASSERT(!exists, "Should not be in both m_unknown and m_sorted");
-#endif
-        }
-        m_unknown.clear();
-    }
-
 private:
    VL_UNCOPYABLE(V3Scoreboard);
+
+    // METHODS
+    void addUnknown(T_Elem* nodep) {
+        // Just prepend it to the list of unknown entries
+        nodep->m_next.link(m_unknown.unlink());
+        m_unknown.linkNonNull(nodep);
+        // We mark nodes on the unknown list by making their child pointer point to themselves
+        nodep->m_kids.m_ptr = nodep;
+    }
+
+public:
+    // Returns true if the element is present in the scoreboard, false otherwise. Every other
+    // method that takes a T_Elem* (except for 'add') has undefined behavior if the element is not
+    // in this scoreboard. Furthermore, this method is only valid if the element can only possibly
+    // be in this scoreboard. That is: if the element might be in another scoreboard, the behaviour
+    // of this method is undefined.
+    static bool contains(const T_Elem* nodep) { return nodep->m_ownerpp; }
+
+    // Add an element to the scoreboard. This will not be returned before the next 'rescore' call.
+    void add(T_Elem* nodep) {
+#if VL_DEBUG
+        UASSERT(!contains(nodep), "Adding element to scoreboard that was already in a scoreboard");
+#endif
+        addUnknown(nodep);
+    }
+
+    // Remove element from scoreboard.
+    void remove(T_Elem* nodep) {
+        if (nodep->m_kids.m_ptr == nodep) {
+            // Node is on the unknown list, replace with next
+            nodep->replaceWith(nodep->m_next.unlink());
+            return;
+        }
+        // Node is in the known heap, remove it
+        m_known.remove(nodep);
+    }
+
+    // Get the known element with the highest score (as we are using a max-heap), or nullptr if
+    // there are no elements with known scores. This does not automatically 'rescore'. The client
+    // must call 'rescore' appropriately to ensure all elements in the scoreboard are reflected in
+    // the result of this method.
+    T_Elem* best() const { return T_Elem::heapNodeToElem(m_known.max()); }
+
+    // Tell the scoreboard that this element's score may have changed. At the time of this call,
+    // the element's score becomes 'unknown' to the scoreboard. Unknown elements will not be
+    // returned by 'best' until the next call to 'rescore'.
+    void hintScoreChanged(T_Elem* nodep) {
+        // If it's already in the unknown list, then nothing to do
+        if (nodep->m_kids.m_ptr == nodep) return;
+        // Otherwise it was in the heap, remove it
+        m_known.remove(nodep);
+        // Prepend it to the unknown list
+        addUnknown(nodep);
+    }
+
+    // True if we have elements with unknown score
+    bool needsRescore() const { return m_unknown; }
+
+    // True if the element's score is unknown, false otherwise.
+    static bool needsRescore(const T_Elem* nodep) { return nodep->m_kids.m_ptr == nodep; }
+
+    // For each element whose score is unknown, recompute the score and add to the known heap
+    void rescore() {
+        // Rescore and insert all unknown elements
+        for (Node *nodep = m_unknown.unlink(), *nextp; nodep; nodep = nextp) {
+            // Pick up next
+            nextp = nodep->m_next.ptr();
+            // Reset pointers
+            nodep->m_next.m_ptr = nullptr;
+            nodep->m_kids.m_ptr = nullptr;
+            nodep->m_ownerpp = nullptr;
+            // Re-compute the score of the element
+            T_Elem::heapNodeToElem(nodep)->rescore();
+            // re-insert into the heap
+            m_known.insert(nodep);
+        }
+    }
 };

-//######################################################################
+// ######################################################################

 namespace V3ScoreboardBase {
 void selfTest();