Balance concatenations in DFG (#5598)

The DFG peephole pass converts all associative trees into right leaning,
which is good for simplifying pattern recognition, but can lead to an
excessive amount of wide intermediate results being constructed for
right leaning concatenations.

Add a new pass to balance concatenation trees by trying to:
- Create VL_EDATASIZE (32-bit) sub-terms, so words can then be packed
  easily afterwards
- Ensure the operands of a concat are roughly the same width
  within a concatenation tree. This does not yield the shortest tree,
  but it ensures it has many sub-nodes that are small enough to fit into
  machine registers.

This can eliminate a lot of wide intermediate results, which would need
temporaries, and also increases ILP within sub-expressions (assuming the
C compiler can't figure that out itself).

This is over 2x run-time speedup on the high_perf configuration of
VeeR EH2 (which you could arguably also get with -fno-dfg, but oh well).
This commit is contained in:
Geza Lore 2024-11-09 18:14:19 +00:00 committed by GitHub
parent 4969125e5a
commit f073b278f9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 289 additions and 8 deletions

View File

@ -224,6 +224,7 @@ set(COMMON_SOURCES
V3Descope.cpp
V3Dfg.cpp
V3DfgAstToDfg.cpp
V3DfgBalanceTrees.cpp
V3DfgCache.cpp
V3DfgDecomposition.cpp
V3DfgDfgToAst.cpp

View File

@ -237,6 +237,7 @@ RAW_OBJS_PCH_ASTNOMT = \
V3Descope.o \
V3Dfg.o \
V3DfgAstToDfg.o \
V3DfgBalanceTrees.o \
V3DfgCache.o \
V3DfgDecomposition.o \
V3DfgDfgToAst.o \

View File

@ -274,6 +274,9 @@ public:
// Predicate: has 1 or more sinks, that is, the head of the sink list is non-null
bool hasSinks() const { return m_sinksp != nullptr; }
// Predicate: has exactly 1 sink, that is, the sink list has a head but no second entry
bool hasSingleSink() const { return m_sinksp != nullptr && m_sinksp->m_nextp == nullptr; }
// Predicate: has at least 2 sinks, that is, the sink list has both a head and a second entry
bool hasMultipleSinks() const { return m_sinksp != nullptr && m_sinksp->m_nextp != nullptr; }

197
src/V3DfgBalanceTrees.cpp Normal file
View File

@ -0,0 +1,197 @@
// -*- mode: C++; c-file-style: "cc-mode" -*-
//*************************************************************************
// DESCRIPTION: Verilator: Balance associative op trees in DfgGraphs
//
// Code available from: https://verilator.org
//
//*************************************************************************
//
// Copyright 2003-2024 by Wilson Snyder. This program is free software; you
// can redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License
// Version 2.0.
// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
//
//*************************************************************************
//
// - Convert concatenation trees into balanced form
//
//*************************************************************************
#include "V3PchAstNoMT.h" // VL_MT_DISABLED_CODE_UNIT
#include "V3Dfg.h"
#include "V3DfgPasses.h"
VL_DEFINE_DEBUG_FUNCTIONS;
class DfgBalanceTrees final {
// We keep the expressions, together with their offsets within a concatenation tree
struct ConcatTerm final {
DfgVertex* vtxp = nullptr;
size_t offset = 0;
ConcatTerm() = default;
ConcatTerm(DfgVertex* vtxp, size_t offset)
: vtxp{vtxp}
, offset{offset} {}
};
DfgGraph& m_dfg; // The graph being processed
V3DfgBalanceTreesContext& m_ctx; // The optimization context for stats
// Is the given vertex the root of a tree (of potentially size 1), of the given type?
template <typename Vertex>
static bool isRoot(const DfgVertex& vtx) {
static_assert(std::is_base_of<DfgVertexBinary, Vertex>::value,
"'Vertex' must be a 'DfgVertexBinary'");
if (!vtx.is<Vertex>()) return false;
// Has a single sink, and that sink is not another vertex of the same type
return vtx.hasSingleSink() && !vtx.findSink<Vertex>();
}
// Recursive implementation of 'gatherTerms' below.
template <typename Vertex>
static void gatherTermsImpl(DfgVertex* vtxp, std::vector<DfgVertex*>& terms) {
// Base case: different type, or multiple sinks -> it's a term
if (!vtxp->is<Vertex>() || vtxp->hasMultipleSinks()) {
terms.emplace_back(vtxp);
return;
}
// Recursive case: gather sub terms, right to right
DfgVertexBinary* const binp = vtxp->as<Vertex>();
gatherTermsImpl<Vertex>(binp->rhsp(), terms);
gatherTermsImpl<Vertex>(binp->lhsp(), terms);
}
// Gather terms in the tree of given type, rooted at the given vertex.
// Results are right to left, that is, index 0 in the returned vector
// is the rightmost term, index size()-1 is the leftmost term.
template <typename Vertex>
static std::vector<DfgVertex*> gatherTerms(Vertex& root) {
static_assert(std::is_base_of<DfgVertexBinary, Vertex>::value,
"'Vertex' must be a 'DfgVertexBinary'");
std::vector<DfgVertex*> terms;
gatherTermsImpl<Vertex>(root.rhsp(), terms);
gatherTermsImpl<Vertex>(root.lhsp(), terms);
return terms;
}
// Construct a balanced concatenation from the given terms,
// between indices begin (inclusive), and end (exclusive).
// Note term[end].offset must be valid. term[end].vtxp is
// never referenced.
DfgVertex* constructConcat(const std::vector<ConcatTerm>& terms, const size_t begin,
const size_t end) {
UASSERT(end < terms.size(), "Invalid end");
UASSERT(begin < end, "Invalid range");
// Base case: just return the term
if (end == begin + 1) return terms[begin].vtxp;
// Recursive case:
// Compute the mid-point, trying to create roughly equal width intermediates
const size_t width = terms[end].offset - terms[begin].offset;
const size_t midOffset = width / 2 + terms[begin].offset;
const auto beginIt = terms.begin() + begin;
const auto endIt = terms.begin() + end;
const auto midIt = std::lower_bound(beginIt + 1, endIt - 1, midOffset, //
[&](const ConcatTerm& term, size_t value) { //
return term.offset < value;
});
const size_t mid = begin + std::distance(beginIt, midIt);
UASSERT(begin < mid && mid < end, "Must make some progress");
// Construct the subtrees
DfgVertex* const rhsp = constructConcat(terms, begin, mid);
DfgVertex* const lhsp = constructConcat(terms, mid, end);
// Construct new node
AstNodeDType* const dtypep = DfgVertex::dtypeForWidth(lhsp->width() + rhsp->width());
DfgConcat* const newp = new DfgConcat{m_dfg, lhsp->fileline(), dtypep};
newp->rhsp(rhsp);
newp->lhsp(lhsp);
return newp;
}
// Delete unused tree rooted at the given vertex
void deleteTree(DfgVertexBinary* const vtxp) {
UASSERT_OBJ(!vtxp->hasSinks(), vtxp, "Trying to remove used vertex");
DfgVertexBinary* const lhsp = vtxp->lhsp()->cast<DfgVertexBinary>();
DfgVertexBinary* const rhsp = vtxp->rhsp()->cast<DfgVertexBinary>();
VL_DO_DANGLING(vtxp->unlinkDelete(m_dfg), vtxp);
if (lhsp && !lhsp->hasSinks()) deleteTree(lhsp);
if (rhsp && !rhsp->hasSinks()) deleteTree(rhsp);
}
void balanceConcat(DfgConcat* const rootp) {
// Gather all input vertices of the tree
const std::vector<DfgVertex*> vtxps = gatherTerms<DfgConcat>(*rootp);
// Don't bother with trivial trees
if (vtxps.size() <= 3) return;
// Construct the terms Vector that we are going to do processing on
std::vector<ConcatTerm> terms(vtxps.size() + 1);
// These are redundant (constructor does the same), but here they are for clarity
terms[0].offset = 0;
terms[vtxps.size()].vtxp = nullptr;
for (size_t i = 0; i < vtxps.size(); ++i) {
terms[i].vtxp = vtxps[i];
terms[i + 1].offset = terms[i].offset + vtxps[i]->width();
}
// Round 1: try to create terms ending on VL_EDATASIZE boundaries.
// This ensures we pack bits within a VL_EDATASIZE first is possible,
// and then hopefully we can just assemble VL_EDATASIZE words afterward.
std::vector<ConcatTerm> terms2;
{
terms2.reserve(terms.size());
size_t begin = 0; // Start of current range considered
size_t end = 0; // End of current range considered
size_t offset = 0; // Offset of current range considered
// Create a term from the current range
const auto makeTerm = [&]() {
DfgVertex* const vtxp = constructConcat(terms, begin, end);
terms2.emplace_back(vtxp, offset);
offset += vtxp->width();
begin = end;
};
// Create all terms ending on a boundary.
while (++end < terms.size() - 1) {
if (terms[end].offset % VL_EDATASIZE == 0) makeTerm();
}
// Final term. Loop condition above ensures this always exists,
// and might or might not be on a boundary.
makeTerm();
// Sentinel term
terms2.emplace_back(nullptr, offset);
// should have ended up with the same number of bits at least...
UASSERT(terms2.back().offset == terms.back().offset, "Inconsitent terms");
}
// Round 2: Combine the partial terms
rootp->replaceWith(constructConcat(terms2, 0, terms2.size() - 1));
VL_DO_DANGLING(deleteTree(rootp), rootp);
++m_ctx.m_balancedConcats;
}
DfgBalanceTrees(DfgGraph& dfg, V3DfgBalanceTreesContext& ctx)
: m_dfg{dfg}
, m_ctx{ctx} {
// Find all roots
std::vector<DfgConcat*> rootps;
for (DfgVertex& vtx : dfg.opVertices()) {
if (isRoot<DfgConcat>(vtx)) rootps.emplace_back(vtx.as<DfgConcat>());
}
// Balance them
for (DfgConcat* const rootp : rootps) balanceConcat(rootp);
}
public:
static void apply(DfgGraph& dfg, V3DfgBalanceTreesContext& ctx) { DfgBalanceTrees{dfg, ctx}; }
};
// Pass entry point: balance the concatenation trees in 'dfg' by running
// DfgBalanceTrees over it. The number of trees balanced is accumulated in 'ctx'.
void V3DfgPasses::balanceTrees(DfgGraph& dfg, V3DfgBalanceTreesContext& ctx) {
DfgBalanceTrees::apply(dfg, ctx);
}

View File

@ -236,7 +236,7 @@ void V3DfgOptimizer::extract(AstNetlist* netlistp) {
V3Global::dumpCheckGlobalTree("dfg-extract", 0, dumpTreeEitherLevel() >= 3);
}
void V3DfgOptimizer::optimize(AstNetlist* netlistp, const string& label) {
void V3DfgOptimizer::optimize(AstNetlist* netlistp, const string& label, bool lastInvocation) {
UINFO(2, __FUNCTION__ << ": " << endl);
// NODE STATE
@ -282,7 +282,7 @@ void V3DfgOptimizer::optimize(AstNetlist* netlistp, const string& label) {
for (auto& component : acyclicComponents) {
if (dumpDfgLevel() >= 7) component->dumpDotFilePrefixed(ctx.prefix() + "source");
// Optimize the component
V3DfgPasses::optimize(*component, ctx);
V3DfgPasses::optimize(*component, ctx, lastInvocation);
// Add back under the main DFG (we will convert everything back in one go)
dfg->addGraph(*component);
}

View File

@ -29,7 +29,7 @@ namespace V3DfgOptimizer {
void extract(AstNetlist*) VL_MT_DISABLED;
// Optimize the design
void optimize(AstNetlist*, const string& label) VL_MT_DISABLED;
void optimize(AstNetlist*, const string& label, bool lastInvocation) VL_MT_DISABLED;
} // namespace V3DfgOptimizer
#endif // Guard

View File

@ -42,6 +42,11 @@ V3DfgEliminateVarsContext::~V3DfgEliminateVarsContext() {
m_varsRemoved);
}
V3DfgBalanceTreesContext::~V3DfgBalanceTreesContext() {
    // Report the number of concatenation trees balanced, under this context's label
    const std::string statName
        = "Optimizations, DFG " + m_label + " BalanceTrees, concat trees balanced";
    V3Stats::addStat(statName, m_balancedConcats);
}
static std::string getPrefix(const std::string& label) {
if (label.empty()) return "";
std::string str = VString::removeWhitespace(label);
@ -332,7 +337,7 @@ void V3DfgPasses::eliminateVars(DfgGraph& dfg, V3DfgEliminateVarsContext& ctx) {
for (AstVar* const varp : replacedVariables) varp->unlinkFrBack()->deleteTree();
}
void V3DfgPasses::optimize(DfgGraph& dfg, V3DfgOptimizationContext& ctx) {
void V3DfgPasses::optimize(DfgGraph& dfg, V3DfgOptimizationContext& ctx, bool lastInvocation) {
// There is absolutely nothing useful we can do with a graph of size 2 or less
if (dfg.size() <= 2) return;
@ -360,6 +365,10 @@ void V3DfgPasses::optimize(DfgGraph& dfg, V3DfgOptimizationContext& ctx) {
}
// Accumulate patterns for reporting
if (v3Global.opt.stats()) ctx.m_patternStats.accumulate(dfg);
// The peephole pass converts all trees to right leaning, so only do this on the last DFG run.
if (lastInvocation) {
apply(4, "balanceTrees", [&]() { balanceTrees(dfg, ctx.m_balanceTreesContext); });
}
apply(4, "regularize", [&]() { regularize(dfg, ctx.m_regularizeContext); });
if (dumpDfgLevel() >= 8) dfg.dumpDotAllVarConesPrefixed(ctx.prefix() + "optimized");
}

View File

@ -68,6 +68,17 @@ public:
~V3DfgEliminateVarsContext() VL_MT_DISABLED;
};
// Statistics gathered by the DFG tree balancing pass. The destructor (defined
// elsewhere) reports them to V3Stats under the label given at construction.
class V3DfgBalanceTreesContext final {
const std::string m_label; // Label to apply to stats
public:
VDouble0 m_balancedConcats; // Number of concatenation trees balanced
explicit V3DfgBalanceTreesContext(const std::string& label)
: m_label{label} {}
~V3DfgBalanceTreesContext() VL_MT_DISABLED;
};
class V3DfgOptimizationContext final {
const std::string m_label; // Label to add to stats, etc.
const std::string m_prefix; // Prefix to add to file dumps (derived from label)
@ -92,6 +103,7 @@ public:
V3DfgPeepholeContext m_peepholeContext{m_label};
V3DfgRegularizeContext m_regularizeContext{m_label};
V3DfgEliminateVarsContext m_eliminateVarsContext{m_label};
V3DfgBalanceTreesContext m_balanceTreesContext{m_label};
V3DfgPatternStats m_patternStats;
@ -112,7 +124,7 @@ namespace V3DfgPasses {
DfgGraph* astToDfg(AstModule&, V3DfgOptimizationContext&) VL_MT_DISABLED;
// Optimize the given DfgGraph
void optimize(DfgGraph&, V3DfgOptimizationContext&) VL_MT_DISABLED;
void optimize(DfgGraph&, V3DfgOptimizationContext&, bool lastInvocation) VL_MT_DISABLED;
// Convert DfgGraph back into Ast, and insert converted graph back into its parent module.
// Returns the parent module.
@ -134,6 +146,8 @@ void regularize(DfgGraph&, V3DfgRegularizeContext&) VL_MT_DISABLED;
void removeUnused(DfgGraph&) VL_MT_DISABLED;
// Eliminate (remove or replace) redundant variables. Also removes resulting unused logic.
void eliminateVars(DfgGraph&, V3DfgEliminateVarsContext&) VL_MT_DISABLED;
// Make computation trees balanced
void balanceTrees(DfgGraph&, V3DfgBalanceTreesContext&) VL_MT_DISABLED;
} // namespace V3DfgPasses

View File

@ -286,7 +286,7 @@ static void process() {
if (v3Global.opt.fDfgPreInline()) {
// Pre inline DFG optimization
V3DfgOptimizer::optimize(v3Global.rootp(), "pre inline");
V3DfgOptimizer::optimize(v3Global.rootp(), "pre inline", /* lastInvocation: */ false);
}
if (!(v3Global.opt.serializeOnly() && !v3Global.opt.flatten())) {
@ -303,7 +303,7 @@ static void process() {
if (v3Global.opt.fDfgPostInline()) {
// Post inline DFG optimization
V3DfgOptimizer::optimize(v3Global.rootp(), "post inline");
V3DfgOptimizer::optimize(v3Global.rootp(), "post inline", /* lastInvocation: */ true);
}
// --PRE-FLAT OPTIMIZATIONS------------------

View File

@ -0,0 +1,21 @@
#!/usr/bin/env python3
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2024 by Wilson Snyder. This program is free software; you
# can redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
import vltest_bootstrap
# Restrict this test to the 'vlt' scenario
test.scenarios('vlt')
# Compile with --stats, the checks below inspect the generated statistics file
test.compile(verilator_flags2=["--stats"])
# The 'pre inline' DFG run is expected to report 0 balanced concatenation trees
test.file_grep(test.stats,
r' Optimizations, DFG pre inline BalanceTrees, concat trees balanced\s+(\d+)', 0)
# The 'post inline' DFG run is expected to report exactly 1 balanced concatenation tree
test.file_grep(test.stats,
r' Optimizations, DFG post inline BalanceTrees, concat trees balanced\s+(\d+)', 1)
test.passes()

View File

@ -0,0 +1,35 @@
// DESCRIPTION: Verilator: Verilog Test module
//
// This file ONLY is placed under the Creative Commons Public Domain, for
// any use, without warranty, 2024 by Wilson Snyder.
// SPDX-License-Identifier: CC0-1.0
// verilator lint_off UNOPTFLAT
// Drives the N-bit output 'o' from the N-bit input 'i' using many narrow
// partial assignments.
// NOTE(review): presumably these get combined into a single wide concatenation
// that the DFG tree balancing pass then restructures - confirm against the stats check.
module t(i, o);
localparam N = 2000; // Deliberately not multiple of 32
input i;
wire [N-1:0] i;
output o;
wire [N-1:0] o;
// For every complete 32-bit word of 'o' (starting at bit n): drive it from
// 10 separate slices, of widths 1, 1, 2, 4, 8, 8, 4, 2, 1, 1 (32 bits total),
// taken from 'i' starting at the opposite end (bit N-1-n, going down)
for (genvar n = 0 ; n + 31 < N ; n += 32) begin
assign o[n+ 0 +: 1] = i[(N-1-n)- 0 -: 1];
assign o[n+ 1 +: 1] = i[(N-1-n)- 1 -: 1];
assign o[n+ 2 +: 2] = i[(N-1-n)- 2 -: 2];
assign o[n+ 4 +: 4] = i[(N-1-n)- 4 -: 4];
assign o[n+ 8 +: 8] = i[(N-1-n)- 8 -: 8];
assign o[n+16 +: 8] = i[(N-1-n)-16 -: 8];
assign o[n+24 +: 4] = i[(N-1-n)-24 -: 4];
assign o[n+28 +: 2] = i[(N-1-n)-28 -: 2];
assign o[n+30 +: 1] = i[(N-1-n)-30 -: 1];
assign o[n+31 +: 1] = i[(N-1-n)-31 -: 1];
end
// The remaining bits, above the last complete 32-bit word, are driven one bit
// at a time from the mirrored bit position of 'i'
for (genvar n = N / 32 * 32; n < N ; ++n) begin
assign o[n] = i[N-1-n];
end
endmodule

View File

@ -17,6 +17,6 @@ test.compile(verilator_flags2=["-Wno-UNOPTTHREADS", "--stats", test.t_dir + "/t_
test.execute()
if test.vlt:
test.file_grep(test.stats, r'Optimizations, Const bit op reduction\s+(\d+)', 40)
test.file_grep(test.stats, r'Optimizations, Const bit op reduction\s+(\d+)', 39)
test.passes()