From f073b278f91d87c27c3a2a1b1aa3560880264eea Mon Sep 17 00:00:00 2001 From: Geza Lore Date: Sat, 9 Nov 2024 18:14:19 +0000 Subject: [PATCH] Balance concatenations in DFG (#5598) The DFG peephole pass converts all associative trees into right-leaning form, which is good for simplifying pattern recognition, but can lead to an excessive amount of wide intermediate results being constructed for right-leaning concatenations. Add a new pass to balance concatenation trees by trying to: - Create VL_EDATASIZE (32-bit) sub-terms, so words can then be packed easily afterwards - Ensure the operands of a concat are roughly the same width within a concatenation tree. This does not yield the shortest tree, but it ensures it has many sub-nodes that are small enough to fit into machine registers. This can eliminate a lot of wide intermediate results, which would need temporaries, and also increases ILP within sub-expressions (assuming the C compiler can't figure that out itself). This is over 2x run-time speedup on the high_perf configuration of VeeR EH2 (which you could arguably also get with -fno-dfg, but oh well). 
--- src/CMakeLists.txt | 1 + src/Makefile_obj.in | 1 + src/V3Dfg.h | 3 + src/V3DfgBalanceTrees.cpp | 197 +++++++++++++++++++++++++++ src/V3DfgOptimizer.cpp | 4 +- src/V3DfgOptimizer.h | 2 +- src/V3DfgPasses.cpp | 11 +- src/V3DfgPasses.h | 16 ++- src/Verilator.cpp | 4 +- test_regress/t/t_dfg_balance_cats.py | 21 +++ test_regress/t/t_dfg_balance_cats.v | 35 +++++ test_regress/t/t_opt_const_dfg.py | 2 +- 12 files changed, 289 insertions(+), 8 deletions(-) create mode 100644 src/V3DfgBalanceTrees.cpp create mode 100755 test_regress/t/t_dfg_balance_cats.py create mode 100644 test_regress/t/t_dfg_balance_cats.v diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e0b1792a6..9049fc215 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -224,6 +224,7 @@ set(COMMON_SOURCES V3Descope.cpp V3Dfg.cpp V3DfgAstToDfg.cpp + V3DfgBalanceTrees.cpp V3DfgCache.cpp V3DfgDecomposition.cpp V3DfgDfgToAst.cpp diff --git a/src/Makefile_obj.in b/src/Makefile_obj.in index d29baa840..0e972fc71 100644 --- a/src/Makefile_obj.in +++ b/src/Makefile_obj.in @@ -237,6 +237,7 @@ RAW_OBJS_PCH_ASTNOMT = \ V3Descope.o \ V3Dfg.o \ V3DfgAstToDfg.o \ + V3DfgBalanceTrees.o \ V3DfgCache.o \ V3DfgDecomposition.o \ V3DfgDfgToAst.o \ diff --git a/src/V3Dfg.h b/src/V3Dfg.h index 5fab278ee..8b0978b97 100644 --- a/src/V3Dfg.h +++ b/src/V3Dfg.h @@ -274,6 +274,9 @@ public: // Predicate: has 1 or more sinks bool hasSinks() const { return m_sinksp != nullptr; } + // Predicate: has precisely 1 sink + bool hasSingleSink() const { return m_sinksp && !m_sinksp->m_nextp; } + // Predicate: has 2 or more sinks bool hasMultipleSinks() const { return m_sinksp && m_sinksp->m_nextp; } diff --git a/src/V3DfgBalanceTrees.cpp b/src/V3DfgBalanceTrees.cpp new file mode 100644 index 000000000..6b5eca2d8 --- /dev/null +++ b/src/V3DfgBalanceTrees.cpp @@ -0,0 +1,197 @@ +// -*- mode: C++; c-file-style: "cc-mode" -*- +//************************************************************************* +// DESCRIPTION: Verilator: Balance 
associative op trees in DfgGraphs +// +// Code available from: https://verilator.org +// +//************************************************************************* +// +// Copyright 2003-2024 by Wilson Snyder. This program is free software; you +// can redistribute it and/or modify it under the terms of either the GNU +// Lesser General Public License Version 3 or the Perl Artistic License +// Version 2.0. +// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 +// +//************************************************************************* +// +// - Convert concatenation trees into balanced form +// +//************************************************************************* + +#include "V3PchAstNoMT.h" // VL_MT_DISABLED_CODE_UNIT + +#include "V3Dfg.h" +#include "V3DfgPasses.h" + +VL_DEFINE_DEBUG_FUNCTIONS; + +class DfgBalanceTrees final { + // We keep the expressions, together with their offsets within a concatenation tree + struct ConcatTerm final { + DfgVertex* vtxp = nullptr; + size_t offset = 0; + + ConcatTerm() = default; + ConcatTerm(DfgVertex* vtxp, size_t offset) + : vtxp{vtxp} + , offset{offset} {} + }; + + DfgGraph& m_dfg; // The graph being processed + V3DfgBalanceTreesContext& m_ctx; // The optimization context for stats + + // Is the given vertex the root of a tree (of potentially size 1), of the given type? + template + static bool isRoot(const DfgVertex& vtx) { + static_assert(std::is_base_of::value, + "'Vertex' must be a 'DfgVertexBinary'"); + if (!vtx.is()) return false; + // Has a single sink, and that sink is not another vertex of the same type + return vtx.hasSingleSink() && !vtx.findSink(); + } + + // Recursive implementation of 'gatherTerms' below. 
+ template + static void gatherTermsImpl(DfgVertex* vtxp, std::vector& terms) { + // Base case: different type, or multiple sinks -> it's a term + if (!vtxp->is() || vtxp->hasMultipleSinks()) { + terms.emplace_back(vtxp); + return; + } + // Recursive case: gather sub terms, right to left + DfgVertexBinary* const binp = vtxp->as(); + gatherTermsImpl(binp->rhsp(), terms); + gatherTermsImpl(binp->lhsp(), terms); + } + + // Gather terms in the tree of given type, rooted at the given vertex. + // Results are right to left, that is, index 0 in the returned vector + // is the rightmost term, index size()-1 is the leftmost term. + template + static std::vector gatherTerms(Vertex& root) { + static_assert(std::is_base_of::value, + "'Vertex' must be a 'DfgVertexBinary'"); + std::vector terms; + gatherTermsImpl(root.rhsp(), terms); + gatherTermsImpl(root.lhsp(), terms); + return terms; + } + + // Construct a balanced concatenation from the given terms, + // between indices begin (inclusive), and end (exclusive). + // Note term[end].offset must be valid. term[end].vtxp is + // never referenced. 
+ DfgVertex* constructConcat(const std::vector& terms, const size_t begin, + const size_t end) { + UASSERT(end < terms.size(), "Invalid end"); + UASSERT(begin < end, "Invalid range"); + // Base case: just return the term + if (end == begin + 1) return terms[begin].vtxp; + + // Recursive case: + // Compute the mid-point, trying to create roughly equal width intermediates + const size_t width = terms[end].offset - terms[begin].offset; + const size_t midOffset = width / 2 + terms[begin].offset; + const auto beginIt = terms.begin() + begin; + const auto endIt = terms.begin() + end; + const auto midIt = std::lower_bound(beginIt + 1, endIt - 1, midOffset, // + [&](const ConcatTerm& term, size_t value) { // + return term.offset < value; + }); + const size_t mid = begin + std::distance(beginIt, midIt); + UASSERT(begin < mid && mid < end, "Must make some progress"); + // Construct the subtrees + DfgVertex* const rhsp = constructConcat(terms, begin, mid); + DfgVertex* const lhsp = constructConcat(terms, mid, end); + // Construct new node + AstNodeDType* const dtypep = DfgVertex::dtypeForWidth(lhsp->width() + rhsp->width()); + DfgConcat* const newp = new DfgConcat{m_dfg, lhsp->fileline(), dtypep}; + newp->rhsp(rhsp); + newp->lhsp(lhsp); + return newp; + } + + // Delete unused tree rooted at the given vertex + void deleteTree(DfgVertexBinary* const vtxp) { + UASSERT_OBJ(!vtxp->hasSinks(), vtxp, "Trying to remove used vertex"); + DfgVertexBinary* const lhsp = vtxp->lhsp()->cast(); + DfgVertexBinary* const rhsp = vtxp->rhsp()->cast(); + VL_DO_DANGLING(vtxp->unlinkDelete(m_dfg), vtxp); + if (lhsp && !lhsp->hasSinks()) deleteTree(lhsp); + if (rhsp && !rhsp->hasSinks()) deleteTree(rhsp); + } + + void balanceConcat(DfgConcat* const rootp) { + // Gather all input vertices of the tree + const std::vector vtxps = gatherTerms(*rootp); + // Don't bother with trivial trees + if (vtxps.size() <= 3) return; + + // Construct the terms Vector that we are going to do processing on + 
+ std::vector terms(vtxps.size() + 1); + // These are redundant (constructor does the same), but here they are for clarity + terms[0].offset = 0; + terms[vtxps.size()].vtxp = nullptr; + for (size_t i = 0; i < vtxps.size(); ++i) { + terms[i].vtxp = vtxps[i]; + terms[i + 1].offset = terms[i].offset + vtxps[i]->width(); + } + + // Round 1: try to create terms ending on VL_EDATASIZE boundaries. + // This ensures we pack bits within a VL_EDATASIZE first if possible, + // and then hopefully we can just assemble VL_EDATASIZE words afterward. + std::vector terms2; + { + terms2.reserve(terms.size()); + + size_t begin = 0; // Start of current range considered + size_t end = 0; // End of current range considered + size_t offset = 0; // Offset of current range considered + + // Create a term from the current range + const auto makeTerm = [&]() { + DfgVertex* const vtxp = constructConcat(terms, begin, end); + terms2.emplace_back(vtxp, offset); + offset += vtxp->width(); + begin = end; + }; + + // Create all terms ending on a boundary. + while (++end < terms.size() - 1) { + if (terms[end].offset % VL_EDATASIZE == 0) makeTerm(); + } + // Final term. Loop condition above ensures this always exists, + // and might or might not be on a boundary. + makeTerm(); + // Sentinel term + terms2.emplace_back(nullptr, offset); + // should have ended up with the same number of bits at least... 
+ UASSERT(terms2.back().offset == terms.back().offset, "Inconsitent terms"); + } + + // Round 2: Combine the partial terms + rootp->replaceWith(constructConcat(terms2, 0, terms2.size() - 1)); + VL_DO_DANGLING(deleteTree(rootp), rootp); + + ++m_ctx.m_balancedConcats; + } + + DfgBalanceTrees(DfgGraph& dfg, V3DfgBalanceTreesContext& ctx) + : m_dfg{dfg} + , m_ctx{ctx} { + // Find all roots + std::vector rootps; + for (DfgVertex& vtx : dfg.opVertices()) { + if (isRoot(vtx)) rootps.emplace_back(vtx.as()); + } + // Balance them + for (DfgConcat* const rootp : rootps) balanceConcat(rootp); + } + +public: + static void apply(DfgGraph& dfg, V3DfgBalanceTreesContext& ctx) { DfgBalanceTrees{dfg, ctx}; } +}; + +void V3DfgPasses::balanceTrees(DfgGraph& dfg, V3DfgBalanceTreesContext& ctx) { + DfgBalanceTrees::apply(dfg, ctx); +} diff --git a/src/V3DfgOptimizer.cpp b/src/V3DfgOptimizer.cpp index d6c6f1f30..7297cdd85 100644 --- a/src/V3DfgOptimizer.cpp +++ b/src/V3DfgOptimizer.cpp @@ -236,7 +236,7 @@ void V3DfgOptimizer::extract(AstNetlist* netlistp) { V3Global::dumpCheckGlobalTree("dfg-extract", 0, dumpTreeEitherLevel() >= 3); } -void V3DfgOptimizer::optimize(AstNetlist* netlistp, const string& label) { +void V3DfgOptimizer::optimize(AstNetlist* netlistp, const string& label, bool lastInvocation) { UINFO(2, __FUNCTION__ << ": " << endl); // NODE STATE @@ -282,7 +282,7 @@ void V3DfgOptimizer::optimize(AstNetlist* netlistp, const string& label) { for (auto& component : acyclicComponents) { if (dumpDfgLevel() >= 7) component->dumpDotFilePrefixed(ctx.prefix() + "source"); // Optimize the component - V3DfgPasses::optimize(*component, ctx); + V3DfgPasses::optimize(*component, ctx, lastInvocation); // Add back under the main DFG (we will convert everything back in one go) dfg->addGraph(*component); } diff --git a/src/V3DfgOptimizer.h b/src/V3DfgOptimizer.h index 067b5e801..df67c3e53 100644 --- a/src/V3DfgOptimizer.h +++ b/src/V3DfgOptimizer.h @@ -29,7 +29,7 @@ namespace V3DfgOptimizer { 
void extract(AstNetlist*) VL_MT_DISABLED; // Optimize the design -void optimize(AstNetlist*, const string& label) VL_MT_DISABLED; +void optimize(AstNetlist*, const string& label, bool lastInvocation) VL_MT_DISABLED; } // namespace V3DfgOptimizer #endif // Guard diff --git a/src/V3DfgPasses.cpp b/src/V3DfgPasses.cpp index d67642e8c..5b3f04041 100644 --- a/src/V3DfgPasses.cpp +++ b/src/V3DfgPasses.cpp @@ -42,6 +42,11 @@ V3DfgEliminateVarsContext::~V3DfgEliminateVarsContext() { m_varsRemoved); } +V3DfgBalanceTreesContext::~V3DfgBalanceTreesContext() { + V3Stats::addStat("Optimizations, DFG " + m_label + " BalanceTrees, concat trees balanced", + m_balancedConcats); +} + static std::string getPrefix(const std::string& label) { if (label.empty()) return ""; std::string str = VString::removeWhitespace(label); @@ -332,7 +337,7 @@ void V3DfgPasses::eliminateVars(DfgGraph& dfg, V3DfgEliminateVarsContext& ctx) { for (AstVar* const varp : replacedVariables) varp->unlinkFrBack()->deleteTree(); } -void V3DfgPasses::optimize(DfgGraph& dfg, V3DfgOptimizationContext& ctx) { +void V3DfgPasses::optimize(DfgGraph& dfg, V3DfgOptimizationContext& ctx, bool lastInvocation) { // There is absolutely nothing useful we can do with a graph of size 2 or less if (dfg.size() <= 2) return; @@ -360,6 +365,10 @@ void V3DfgPasses::optimize(DfgGraph& dfg, V3DfgOptimizationContext& ctx) { } // Accumulate patterns for reporting if (v3Global.opt.stats()) ctx.m_patternStats.accumulate(dfg); + // The peephole pass converts all trees to right leaning, so only do this on the last DFG run. 
+ if (lastInvocation) { + apply(4, "balanceTrees", [&]() { balanceTrees(dfg, ctx.m_balanceTreesContext); }); + } apply(4, "regularize", [&]() { regularize(dfg, ctx.m_regularizeContext); }); if (dumpDfgLevel() >= 8) dfg.dumpDotAllVarConesPrefixed(ctx.prefix() + "optimized"); } diff --git a/src/V3DfgPasses.h b/src/V3DfgPasses.h index 2b1e08aa6..d893c84ce 100644 --- a/src/V3DfgPasses.h +++ b/src/V3DfgPasses.h @@ -68,6 +68,17 @@ public: ~V3DfgEliminateVarsContext() VL_MT_DISABLED; }; +class V3DfgBalanceTreesContext final { + const std::string m_label; // Label to apply to stats + +public: + VDouble0 m_balancedConcats; // Number of concat trees balanced + + explicit V3DfgBalanceTreesContext(const std::string& label) + : m_label{label} {} + ~V3DfgBalanceTreesContext() VL_MT_DISABLED; +}; + class V3DfgOptimizationContext final { const std::string m_label; // Label to add to stats, etc. const std::string m_prefix; // Prefix to add to file dumps (derived from label) @@ -92,6 +103,7 @@ public: V3DfgPeepholeContext m_peepholeContext{m_label}; V3DfgRegularizeContext m_regularizeContext{m_label}; V3DfgEliminateVarsContext m_eliminateVarsContext{m_label}; + V3DfgBalanceTreesContext m_balanceTreesContext{m_label}; V3DfgPatternStats m_patternStats; @@ -112,7 +124,7 @@ namespace V3DfgPasses { DfgGraph* astToDfg(AstModule&, V3DfgOptimizationContext&) VL_MT_DISABLED; // Optimize the given DfgGraph -void optimize(DfgGraph&, V3DfgOptimizationContext&) VL_MT_DISABLED; +void optimize(DfgGraph&, V3DfgOptimizationContext&, bool lastInvocation) VL_MT_DISABLED; // Convert DfgGraph back into Ast, and insert converted graph back into its parent module. // Returns the parent module. @@ -134,6 +146,8 @@ void regularize(DfgGraph&, V3DfgRegularizeContext&) VL_MT_DISABLED; void removeUnused(DfgGraph&) VL_MT_DISABLED; // Eliminate (remove or replace) redundant variables. Also removes resulting unused logic. 
void eliminateVars(DfgGraph&, V3DfgEliminateVarsContext&) VL_MT_DISABLED; +// Make computation trees balanced +void balanceTrees(DfgGraph&, V3DfgBalanceTreesContext&) VL_MT_DISABLED; } // namespace V3DfgPasses diff --git a/src/Verilator.cpp b/src/Verilator.cpp index d6b58ea9c..92d3f53de 100644 --- a/src/Verilator.cpp +++ b/src/Verilator.cpp @@ -286,7 +286,7 @@ static void process() { if (v3Global.opt.fDfgPreInline()) { // Pre inline DFG optimization - V3DfgOptimizer::optimize(v3Global.rootp(), "pre inline"); + V3DfgOptimizer::optimize(v3Global.rootp(), "pre inline", /* lastInvocation: */ false); } if (!(v3Global.opt.serializeOnly() && !v3Global.opt.flatten())) { @@ -303,7 +303,7 @@ static void process() { if (v3Global.opt.fDfgPostInline()) { // Post inline DFG optimization - V3DfgOptimizer::optimize(v3Global.rootp(), "post inline"); + V3DfgOptimizer::optimize(v3Global.rootp(), "post inline", /* lastInvocation: */ true); } // --PRE-FLAT OPTIMIZATIONS------------------ diff --git a/test_regress/t/t_dfg_balance_cats.py b/test_regress/t/t_dfg_balance_cats.py new file mode 100755 index 000000000..0a4055967 --- /dev/null +++ b/test_regress/t/t_dfg_balance_cats.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 +# DESCRIPTION: Verilator: Verilog Test driver/expect definition +# +# Copyright 2024 by Wilson Snyder. This program is free software; you +# can redistribute it and/or modify it under the terms of either the GNU +# Lesser General Public License Version 3 or the Perl Artistic License +# Version 2.0. 
+# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 + +import vltest_bootstrap + +test.scenarios('vlt') + +test.compile(verilator_flags2=["--stats"]) + +test.file_grep(test.stats, + r' Optimizations, DFG pre inline BalanceTrees, concat trees balanced\s+(\d+)', 0) +test.file_grep(test.stats, + r' Optimizations, DFG post inline BalanceTrees, concat trees balanced\s+(\d+)', 1) + +test.passes() diff --git a/test_regress/t/t_dfg_balance_cats.v b/test_regress/t/t_dfg_balance_cats.v new file mode 100644 index 000000000..4562ca1e5 --- /dev/null +++ b/test_regress/t/t_dfg_balance_cats.v @@ -0,0 +1,35 @@ +// DESCRIPTION: Verilator: Verilog Test module +// +// This file ONLY is placed under the Creative Commons Public Domain, for +// any use, without warranty, 2024 by Wilson Snyder. +// SPDX-License-Identifier: CC0-1.0 + +// verilator lint_off UNOPTFLAT + +module t(i, o); + localparam N = 2000; // Deliberately not multiple of 32 + + input i; + wire [N-1:0] i; + + output o; + wire [N-1:0] o; + + for (genvar n = 0 ; n + 31 < N ; n += 32) begin + assign o[n+ 0 +: 1] = i[(N-1-n)- 0 -: 1]; + assign o[n+ 1 +: 1] = i[(N-1-n)- 1 -: 1]; + assign o[n+ 2 +: 2] = i[(N-1-n)- 2 -: 2]; + assign o[n+ 4 +: 4] = i[(N-1-n)- 4 -: 4]; + assign o[n+ 8 +: 8] = i[(N-1-n)- 8 -: 8]; + assign o[n+16 +: 8] = i[(N-1-n)-16 -: 8]; + assign o[n+24 +: 4] = i[(N-1-n)-24 -: 4]; + assign o[n+28 +: 2] = i[(N-1-n)-28 -: 2]; + assign o[n+30 +: 1] = i[(N-1-n)-30 -: 1]; + assign o[n+31 +: 1] = i[(N-1-n)-31 -: 1]; + end + + for (genvar n = N / 32 * 32; n < N ; ++n) begin + assign o[n] = i[N-1-n]; + end + +endmodule diff --git a/test_regress/t/t_opt_const_dfg.py b/test_regress/t/t_opt_const_dfg.py index eed838d28..e46719f23 100755 --- a/test_regress/t/t_opt_const_dfg.py +++ b/test_regress/t/t_opt_const_dfg.py @@ -17,6 +17,6 @@ test.compile(verilator_flags2=["-Wno-UNOPTTHREADS", "--stats", test.t_dir + "/t_ test.execute() if test.vlt: - test.file_grep(test.stats, r'Optimizations, Const bit op reduction\s+(\d+)', 
40) + test.file_grep(test.stats, r'Optimizations, Const bit op reduction\s+(\d+)', 39) test.passes()