diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e0b1792a6..9049fc215 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -224,6 +224,7 @@ set(COMMON_SOURCES V3Descope.cpp V3Dfg.cpp V3DfgAstToDfg.cpp + V3DfgBalanceTrees.cpp V3DfgCache.cpp V3DfgDecomposition.cpp V3DfgDfgToAst.cpp diff --git a/src/Makefile_obj.in b/src/Makefile_obj.in index d29baa840..0e972fc71 100644 --- a/src/Makefile_obj.in +++ b/src/Makefile_obj.in @@ -237,6 +237,7 @@ RAW_OBJS_PCH_ASTNOMT = \ V3Descope.o \ V3Dfg.o \ V3DfgAstToDfg.o \ + V3DfgBalanceTrees.o \ V3DfgCache.o \ V3DfgDecomposition.o \ V3DfgDfgToAst.o \ diff --git a/src/V3Dfg.h b/src/V3Dfg.h index 5fab278ee..8b0978b97 100644 --- a/src/V3Dfg.h +++ b/src/V3Dfg.h @@ -274,6 +274,9 @@ public: // Predicate: has 1 or more sinks bool hasSinks() const { return m_sinksp != nullptr; } + // Predicate: has precisely 1 sink + bool hasSingleSink() const { return m_sinksp && !m_sinksp->m_nextp; } + // Predicate: has 2 or more sinks bool hasMultipleSinks() const { return m_sinksp && m_sinksp->m_nextp; } diff --git a/src/V3DfgBalanceTrees.cpp b/src/V3DfgBalanceTrees.cpp new file mode 100644 index 000000000..6b5eca2d8 --- /dev/null +++ b/src/V3DfgBalanceTrees.cpp @@ -0,0 +1,197 @@ +// -*- mode: C++; c-file-style: "cc-mode" -*- +//************************************************************************* +// DESCRIPTION: Verilator: Balance associative op trees in DfgGraphs +// +// Code available from: https://verilator.org +// +//************************************************************************* +// +// Copyright 2003-2024 by Wilson Snyder. This program is free software; you +// can redistribute it and/or modify it under the terms of either the GNU +// Lesser General Public License Version 3 or the Perl Artistic License +// Version 2.0. 
+// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 +// +//************************************************************************* +// +// - Convert concatenation trees into balanced form +// +//************************************************************************* + +#include "V3PchAstNoMT.h" // VL_MT_DISABLED_CODE_UNIT + +#include "V3Dfg.h" +#include "V3DfgPasses.h" + +VL_DEFINE_DEBUG_FUNCTIONS; + +class DfgBalanceTrees final { + // We keep the expressions, together with their offsets within a concatenation tree + struct ConcatTerm final { + DfgVertex* vtxp = nullptr; + size_t offset = 0; + + ConcatTerm() = default; + ConcatTerm(DfgVertex* vtxp, size_t offset) + : vtxp{vtxp} + , offset{offset} {} + }; + + DfgGraph& m_dfg; // The graph being processed + V3DfgBalanceTreesContext& m_ctx; // The optimization context for stats + + // Is the given vertex the root of a tree (of potentially size 1), of the given type? + template + static bool isRoot(const DfgVertex& vtx) { + static_assert(std::is_base_of::value, + "'Vertex' must be a 'DfgVertexBinary'"); + if (!vtx.is()) return false; + // Has a single sink, and that sink is not another vertex of the same type + return vtx.hasSingleSink() && !vtx.findSink(); + } + + // Recursive implementation of 'gatherTerms' below. + template + static void gatherTermsImpl(DfgVertex* vtxp, std::vector& terms) { + // Base case: different type, or multiple sinks -> it's a term + if (!vtxp->is() || vtxp->hasMultipleSinks()) { + terms.emplace_back(vtxp); + return; + } + // Recursive case: gather sub terms, right to left + DfgVertexBinary* const binp = vtxp->as(); + gatherTermsImpl(binp->rhsp(), terms); + gatherTermsImpl(binp->lhsp(), terms); + } + + // Gather terms in the tree of given type, rooted at the given vertex. + // Results are right to left, that is, index 0 in the returned vector + // is the rightmost term, index size()-1 is the leftmost term. 
+ template + static std::vector gatherTerms(Vertex& root) { + static_assert(std::is_base_of::value, + "'Vertex' must be a 'DfgVertexBinary'"); + std::vector terms; + gatherTermsImpl(root.rhsp(), terms); + gatherTermsImpl(root.lhsp(), terms); + return terms; + } + + // Construct a balanced concatenation from the given terms, + // between indices begin (inclusive), and end (exclusive). + // Note terms[end].offset must be valid. terms[end].vtxp is + // never referenced. + DfgVertex* constructConcat(const std::vector& terms, const size_t begin, + const size_t end) { + UASSERT(end < terms.size(), "Invalid end"); + UASSERT(begin < end, "Invalid range"); + // Base case: just return the term + if (end == begin + 1) return terms[begin].vtxp; + + // Recursive case: + // Compute the mid-point, trying to create roughly equal width intermediates + const size_t width = terms[end].offset - terms[begin].offset; + const size_t midOffset = width / 2 + terms[begin].offset; + const auto beginIt = terms.begin() + begin; + const auto endIt = terms.begin() + end; + const auto midIt = std::lower_bound(beginIt + 1, endIt - 1, midOffset, // + [&](const ConcatTerm& term, size_t value) { // + return term.offset < value; + }); + const size_t mid = begin + std::distance(beginIt, midIt); + UASSERT(begin < mid && mid < end, "Must make some progress"); + // Construct the subtrees + DfgVertex* const rhsp = constructConcat(terms, begin, mid); + DfgVertex* const lhsp = constructConcat(terms, mid, end); + // Construct new node + AstNodeDType* const dtypep = DfgVertex::dtypeForWidth(lhsp->width() + rhsp->width()); + DfgConcat* const newp = new DfgConcat{m_dfg, lhsp->fileline(), dtypep}; + newp->rhsp(rhsp); + newp->lhsp(lhsp); + return newp; + } + + // Delete unused tree rooted at the given vertex + void deleteTree(DfgVertexBinary* const vtxp) { + UASSERT_OBJ(!vtxp->hasSinks(), vtxp, "Trying to remove used vertex"); + DfgVertexBinary* const lhsp = vtxp->lhsp()->cast(); + DfgVertexBinary* const rhsp = 
vtxp->rhsp()->cast(); + VL_DO_DANGLING(vtxp->unlinkDelete(m_dfg), vtxp); + if (lhsp && !lhsp->hasSinks()) deleteTree(lhsp); + if (rhsp && !rhsp->hasSinks()) deleteTree(rhsp); + } + + void balanceConcat(DfgConcat* const rootp) { + // Gather all input vertices of the tree + const std::vector vtxps = gatherTerms(*rootp); + // Don't bother with trivial trees + if (vtxps.size() <= 3) return; + + // Construct the terms vector that we are going to do processing on + std::vector terms(vtxps.size() + 1); + // These are redundant (constructor does the same), but here they are for clarity + terms[0].offset = 0; + terms[vtxps.size()].vtxp = nullptr; + for (size_t i = 0; i < vtxps.size(); ++i) { + terms[i].vtxp = vtxps[i]; + terms[i + 1].offset = terms[i].offset + vtxps[i]->width(); + } + + // Round 1: try to create terms ending on VL_EDATASIZE boundaries. + // This ensures we pack bits within a VL_EDATASIZE first if possible, + // and then hopefully we can just assemble VL_EDATASIZE words afterward. + std::vector terms2; + { + terms2.reserve(terms.size()); + + size_t begin = 0; // Start of current range considered + size_t end = 0; // End of current range considered + size_t offset = 0; // Offset of current range considered + + // Create a term from the current range + const auto makeTerm = [&]() { + DfgVertex* const vtxp = constructConcat(terms, begin, end); + terms2.emplace_back(vtxp, offset); + offset += vtxp->width(); + begin = end; + }; + + // Create all terms ending on a boundary. + while (++end < terms.size() - 1) { + if (terms[end].offset % VL_EDATASIZE == 0) makeTerm(); + } + // Final term. Loop condition above ensures this always exists, + // and might or might not be on a boundary. + makeTerm(); + // Sentinel term + terms2.emplace_back(nullptr, offset); + // should have ended up with the same number of bits at least... 
+ UASSERT(terms2.back().offset == terms.back().offset, "Inconsitent terms"); + } + + // Round 2: Combine the partial terms + rootp->replaceWith(constructConcat(terms2, 0, terms2.size() - 1)); + VL_DO_DANGLING(deleteTree(rootp), rootp); + + ++m_ctx.m_balancedConcats; + } + + DfgBalanceTrees(DfgGraph& dfg, V3DfgBalanceTreesContext& ctx) + : m_dfg{dfg} + , m_ctx{ctx} { + // Find all roots + std::vector rootps; + for (DfgVertex& vtx : dfg.opVertices()) { + if (isRoot(vtx)) rootps.emplace_back(vtx.as()); + } + // Balance them + for (DfgConcat* const rootp : rootps) balanceConcat(rootp); + } + +public: + static void apply(DfgGraph& dfg, V3DfgBalanceTreesContext& ctx) { DfgBalanceTrees{dfg, ctx}; } +}; + +void V3DfgPasses::balanceTrees(DfgGraph& dfg, V3DfgBalanceTreesContext& ctx) { + DfgBalanceTrees::apply(dfg, ctx); +} diff --git a/src/V3DfgOptimizer.cpp b/src/V3DfgOptimizer.cpp index d6c6f1f30..7297cdd85 100644 --- a/src/V3DfgOptimizer.cpp +++ b/src/V3DfgOptimizer.cpp @@ -236,7 +236,7 @@ void V3DfgOptimizer::extract(AstNetlist* netlistp) { V3Global::dumpCheckGlobalTree("dfg-extract", 0, dumpTreeEitherLevel() >= 3); } -void V3DfgOptimizer::optimize(AstNetlist* netlistp, const string& label) { +void V3DfgOptimizer::optimize(AstNetlist* netlistp, const string& label, bool lastInvocation) { UINFO(2, __FUNCTION__ << ": " << endl); // NODE STATE @@ -282,7 +282,7 @@ void V3DfgOptimizer::optimize(AstNetlist* netlistp, const string& label) { for (auto& component : acyclicComponents) { if (dumpDfgLevel() >= 7) component->dumpDotFilePrefixed(ctx.prefix() + "source"); // Optimize the component - V3DfgPasses::optimize(*component, ctx); + V3DfgPasses::optimize(*component, ctx, lastInvocation); // Add back under the main DFG (we will convert everything back in one go) dfg->addGraph(*component); } diff --git a/src/V3DfgOptimizer.h b/src/V3DfgOptimizer.h index 067b5e801..df67c3e53 100644 --- a/src/V3DfgOptimizer.h +++ b/src/V3DfgOptimizer.h @@ -29,7 +29,7 @@ namespace V3DfgOptimizer { 
void extract(AstNetlist*) VL_MT_DISABLED; // Optimize the design -void optimize(AstNetlist*, const string& label) VL_MT_DISABLED; +void optimize(AstNetlist*, const string& label, bool lastInvocation) VL_MT_DISABLED; } // namespace V3DfgOptimizer #endif // Guard diff --git a/src/V3DfgPasses.cpp b/src/V3DfgPasses.cpp index d67642e8c..5b3f04041 100644 --- a/src/V3DfgPasses.cpp +++ b/src/V3DfgPasses.cpp @@ -42,6 +42,11 @@ V3DfgEliminateVarsContext::~V3DfgEliminateVarsContext() { m_varsRemoved); } +V3DfgBalanceTreesContext::~V3DfgBalanceTreesContext() { + V3Stats::addStat("Optimizations, DFG " + m_label + " BalanceTrees, concat trees balanced", + m_balancedConcats); +} + static std::string getPrefix(const std::string& label) { if (label.empty()) return ""; std::string str = VString::removeWhitespace(label); @@ -332,7 +337,7 @@ void V3DfgPasses::eliminateVars(DfgGraph& dfg, V3DfgEliminateVarsContext& ctx) { for (AstVar* const varp : replacedVariables) varp->unlinkFrBack()->deleteTree(); } -void V3DfgPasses::optimize(DfgGraph& dfg, V3DfgOptimizationContext& ctx) { +void V3DfgPasses::optimize(DfgGraph& dfg, V3DfgOptimizationContext& ctx, bool lastInvocation) { // There is absolutely nothing useful we can do with a graph of size 2 or less if (dfg.size() <= 2) return; @@ -360,6 +365,10 @@ void V3DfgPasses::optimize(DfgGraph& dfg, V3DfgOptimizationContext& ctx) { } // Accumulate patterns for reporting if (v3Global.opt.stats()) ctx.m_patternStats.accumulate(dfg); + // The peephole pass converts all trees to right leaning, so only do this on the last DFG run. 
+ if (lastInvocation) { + apply(4, "balanceTrees", [&]() { balanceTrees(dfg, ctx.m_balanceTreesContext); }); + } apply(4, "regularize", [&]() { regularize(dfg, ctx.m_regularizeContext); }); if (dumpDfgLevel() >= 8) dfg.dumpDotAllVarConesPrefixed(ctx.prefix() + "optimized"); } diff --git a/src/V3DfgPasses.h b/src/V3DfgPasses.h index 2b1e08aa6..d893c84ce 100644 --- a/src/V3DfgPasses.h +++ b/src/V3DfgPasses.h @@ -68,6 +68,17 @@ public: ~V3DfgEliminateVarsContext() VL_MT_DISABLED; }; +class V3DfgBalanceTreesContext final { + const std::string m_label; // Label to apply to stats + +public: + VDouble0 m_balancedConcats; // Number of concat trees balanced + + explicit V3DfgBalanceTreesContext(const std::string& label) + : m_label{label} {} + ~V3DfgBalanceTreesContext() VL_MT_DISABLED; +}; + class V3DfgOptimizationContext final { const std::string m_label; // Label to add to stats, etc. const std::string m_prefix; // Prefix to add to file dumps (derived from label) @@ -92,6 +103,7 @@ public: V3DfgPeepholeContext m_peepholeContext{m_label}; V3DfgRegularizeContext m_regularizeContext{m_label}; V3DfgEliminateVarsContext m_eliminateVarsContext{m_label}; + V3DfgBalanceTreesContext m_balanceTreesContext{m_label}; V3DfgPatternStats m_patternStats; @@ -112,7 +124,7 @@ namespace V3DfgPasses { DfgGraph* astToDfg(AstModule&, V3DfgOptimizationContext&) VL_MT_DISABLED; // Optimize the given DfgGraph -void optimize(DfgGraph&, V3DfgOptimizationContext&) VL_MT_DISABLED; +void optimize(DfgGraph&, V3DfgOptimizationContext&, bool lastInvocation) VL_MT_DISABLED; // Convert DfgGraph back into Ast, and insert converted graph back into its parent module. // Returns the parent module. @@ -134,6 +146,8 @@ void regularize(DfgGraph&, V3DfgRegularizeContext&) VL_MT_DISABLED; void removeUnused(DfgGraph&) VL_MT_DISABLED; // Eliminate (remove or replace) redundant variables. Also removes resulting unused logic. 
void eliminateVars(DfgGraph&, V3DfgEliminateVarsContext&) VL_MT_DISABLED; +// Make computation trees balanced +void balanceTrees(DfgGraph&, V3DfgBalanceTreesContext&) VL_MT_DISABLED; } // namespace V3DfgPasses diff --git a/src/Verilator.cpp b/src/Verilator.cpp index d6b58ea9c..92d3f53de 100644 --- a/src/Verilator.cpp +++ b/src/Verilator.cpp @@ -286,7 +286,7 @@ static void process() { if (v3Global.opt.fDfgPreInline()) { // Pre inline DFG optimization - V3DfgOptimizer::optimize(v3Global.rootp(), "pre inline"); + V3DfgOptimizer::optimize(v3Global.rootp(), "pre inline", /* lastInvocation: */ false); } if (!(v3Global.opt.serializeOnly() && !v3Global.opt.flatten())) { @@ -303,7 +303,7 @@ static void process() { if (v3Global.opt.fDfgPostInline()) { // Post inline DFG optimization - V3DfgOptimizer::optimize(v3Global.rootp(), "post inline"); + V3DfgOptimizer::optimize(v3Global.rootp(), "post inline", /* lastInvocation: */ true); } // --PRE-FLAT OPTIMIZATIONS------------------ diff --git a/test_regress/t/t_dfg_balance_cats.py b/test_regress/t/t_dfg_balance_cats.py new file mode 100755 index 000000000..0a4055967 --- /dev/null +++ b/test_regress/t/t_dfg_balance_cats.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 +# DESCRIPTION: Verilator: Verilog Test driver/expect definition +# +# Copyright 2024 by Wilson Snyder. This program is free software; you +# can redistribute it and/or modify it under the terms of either the GNU +# Lesser General Public License Version 3 or the Perl Artistic License +# Version 2.0. 
+# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 + +import vltest_bootstrap + +test.scenarios('vlt') + +test.compile(verilator_flags2=["--stats"]) + +test.file_grep(test.stats, + r' Optimizations, DFG pre inline BalanceTrees, concat trees balanced\s+(\d+)', 0) +test.file_grep(test.stats, + r' Optimizations, DFG post inline BalanceTrees, concat trees balanced\s+(\d+)', 1) + +test.passes() diff --git a/test_regress/t/t_dfg_balance_cats.v b/test_regress/t/t_dfg_balance_cats.v new file mode 100644 index 000000000..4562ca1e5 --- /dev/null +++ b/test_regress/t/t_dfg_balance_cats.v @@ -0,0 +1,35 @@ +// DESCRIPTION: Verilator: Verilog Test module +// +// This file ONLY is placed under the Creative Commons Public Domain, for +// any use, without warranty, 2024 by Wilson Snyder. +// SPDX-License-Identifier: CC0-1.0 + +// verilator lint_off UNOPTFLAT + +module t(i, o); + localparam N = 2000; // Deliberately not a multiple of 32 + + input i; + wire [N-1:0] i; + + output o; + wire [N-1:0] o; + + for (genvar n = 0 ; n + 31 < N ; n += 32) begin + assign o[n+ 0 +: 1] = i[(N-1-n)- 0 -: 1]; + assign o[n+ 1 +: 1] = i[(N-1-n)- 1 -: 1]; + assign o[n+ 2 +: 2] = i[(N-1-n)- 2 -: 2]; + assign o[n+ 4 +: 4] = i[(N-1-n)- 4 -: 4]; + assign o[n+ 8 +: 8] = i[(N-1-n)- 8 -: 8]; + assign o[n+16 +: 8] = i[(N-1-n)-16 -: 8]; + assign o[n+24 +: 4] = i[(N-1-n)-24 -: 4]; + assign o[n+28 +: 2] = i[(N-1-n)-28 -: 2]; + assign o[n+30 +: 1] = i[(N-1-n)-30 -: 1]; + assign o[n+31 +: 1] = i[(N-1-n)-31 -: 1]; + end + + for (genvar n = N / 32 * 32; n < N ; ++n) begin + assign o[n] = i[N-1-n]; + end + +endmodule diff --git a/test_regress/t/t_opt_const_dfg.py b/test_regress/t/t_opt_const_dfg.py index eed838d28..e46719f23 100755 --- a/test_regress/t/t_opt_const_dfg.py +++ b/test_regress/t/t_opt_const_dfg.py @@ -17,6 +17,6 @@ test.compile(verilator_flags2=["-Wno-UNOPTTHREADS", "--stats", test.t_dir + "/t_ test.execute() if test.vlt: - test.file_grep(test.stats, r'Optimizations, Const bit op reduction\s+(\d+)', 
40) + test.file_grep(test.stats, r'Optimizations, Const bit op reduction\s+(\d+)', 39) test.passes()