mirror of
https://github.com/verilator/verilator.git
synced 2024-12-29 10:47:34 +00:00
Balance concatenations in DFG (#5598)
The DFG peephole pass converts all associative trees into right-leaning form, which is good for simplifying pattern recognition, but can lead to an excessive number of wide intermediate results being constructed for right-leaning concatenations. Add a new pass to balance concatenation trees by trying to: (1) create VL_EDATASIZE (32-bit) sub-terms, so words can then be packed easily afterwards; and (2) ensure the operands of a concat are roughly the same width within a concatenation tree. This does not yield the shortest tree, but it ensures the tree has many sub-nodes that are small enough to fit into machine registers. This can eliminate a lot of wide intermediate results, which would need temporaries, and also increases ILP within sub-expressions (assuming the C compiler can't figure that out itself). This gives over a 2x run-time speedup on the high_perf configuration of VeeR EH2 (which you could arguably also get with -fno-dfg, but oh well).
This commit is contained in:
parent
4969125e5a
commit
f073b278f9
@ -224,6 +224,7 @@ set(COMMON_SOURCES
|
||||
V3Descope.cpp
|
||||
V3Dfg.cpp
|
||||
V3DfgAstToDfg.cpp
|
||||
V3DfgBalanceTrees.cpp
|
||||
V3DfgCache.cpp
|
||||
V3DfgDecomposition.cpp
|
||||
V3DfgDfgToAst.cpp
|
||||
|
@ -237,6 +237,7 @@ RAW_OBJS_PCH_ASTNOMT = \
|
||||
V3Descope.o \
|
||||
V3Dfg.o \
|
||||
V3DfgAstToDfg.o \
|
||||
V3DfgBalanceTrees.o \
|
||||
V3DfgCache.o \
|
||||
V3DfgDecomposition.o \
|
||||
V3DfgDfgToAst.o \
|
||||
|
@ -274,6 +274,9 @@ public:
|
||||
// Predicate: has 1 or more sinks, i.e. 'm_sinksp' is non-null
|
||||
bool hasSinks() const { return m_sinksp != nullptr; }
|
||||
|
||||
// Predicate: has precisely 1 sink, i.e. 'm_sinksp' is set and has no 'm_nextp' successor
|
||||
bool hasSingleSink() const { return m_sinksp && !m_sinksp->m_nextp; }
|
||||
|
||||
// Predicate: has 2 or more sinks, i.e. 'm_sinksp' is set and has an 'm_nextp' successor
|
||||
bool hasMultipleSinks() const { return m_sinksp && m_sinksp->m_nextp; }
|
||||
|
||||
|
197
src/V3DfgBalanceTrees.cpp
Normal file
197
src/V3DfgBalanceTrees.cpp
Normal file
@ -0,0 +1,197 @@
|
||||
// -*- mode: C++; c-file-style: "cc-mode" -*-
|
||||
//*************************************************************************
|
||||
// DESCRIPTION: Verilator: Balance associative op trees in DfgGraphs
|
||||
//
|
||||
// Code available from: https://verilator.org
|
||||
//
|
||||
//*************************************************************************
|
||||
//
|
||||
// Copyright 2003-2024 by Wilson Snyder. This program is free software; you
|
||||
// can redistribute it and/or modify it under the terms of either the GNU
|
||||
// Lesser General Public License Version 3 or the Perl Artistic License
|
||||
// Version 2.0.
|
||||
// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
|
||||
//
|
||||
//*************************************************************************
|
||||
//
|
||||
// - Convert concatenation trees into balanced form
|
||||
//
|
||||
//*************************************************************************
|
||||
|
||||
#include "V3PchAstNoMT.h" // VL_MT_DISABLED_CODE_UNIT
|
||||
|
||||
#include "V3Dfg.h"
|
||||
#include "V3DfgPasses.h"
|
||||
|
||||
VL_DEFINE_DEBUG_FUNCTIONS;
|
||||
|
||||
class DfgBalanceTrees final {
|
||||
// We keep the expressions, together with their offsets within a concatenation tree
|
||||
struct ConcatTerm final {
|
||||
DfgVertex* vtxp = nullptr;
|
||||
size_t offset = 0;
|
||||
|
||||
ConcatTerm() = default;
|
||||
ConcatTerm(DfgVertex* vtxp, size_t offset)
|
||||
: vtxp{vtxp}
|
||||
, offset{offset} {}
|
||||
};
|
||||
|
||||
DfgGraph& m_dfg; // The graph being processed
|
||||
V3DfgBalanceTreesContext& m_ctx; // The optimization context for stats
|
||||
|
||||
// Is the given vertex the root of a tree (of potentially size 1), of the given type?
|
||||
template <typename Vertex>
|
||||
static bool isRoot(const DfgVertex& vtx) {
|
||||
static_assert(std::is_base_of<DfgVertexBinary, Vertex>::value,
|
||||
"'Vertex' must be a 'DfgVertexBinary'");
|
||||
if (!vtx.is<Vertex>()) return false;
|
||||
// Has a single sink, and that sink is not another vertex of the same type
|
||||
return vtx.hasSingleSink() && !vtx.findSink<Vertex>();
|
||||
}
|
||||
|
||||
// Recursive implementation of 'gatherTerms' below.
|
||||
template <typename Vertex>
|
||||
static void gatherTermsImpl(DfgVertex* vtxp, std::vector<DfgVertex*>& terms) {
|
||||
// Base case: different type, or multiple sinks -> it's a term
|
||||
if (!vtxp->is<Vertex>() || vtxp->hasMultipleSinks()) {
|
||||
terms.emplace_back(vtxp);
|
||||
return;
|
||||
}
|
||||
// Recursive case: gather sub terms, right to right
|
||||
DfgVertexBinary* const binp = vtxp->as<Vertex>();
|
||||
gatherTermsImpl<Vertex>(binp->rhsp(), terms);
|
||||
gatherTermsImpl<Vertex>(binp->lhsp(), terms);
|
||||
}
|
||||
|
||||
// Gather terms in the tree of given type, rooted at the given vertex.
|
||||
// Results are right to left, that is, index 0 in the returned vector
|
||||
// is the rightmost term, index size()-1 is the leftmost term.
|
||||
template <typename Vertex>
|
||||
static std::vector<DfgVertex*> gatherTerms(Vertex& root) {
|
||||
static_assert(std::is_base_of<DfgVertexBinary, Vertex>::value,
|
||||
"'Vertex' must be a 'DfgVertexBinary'");
|
||||
std::vector<DfgVertex*> terms;
|
||||
gatherTermsImpl<Vertex>(root.rhsp(), terms);
|
||||
gatherTermsImpl<Vertex>(root.lhsp(), terms);
|
||||
return terms;
|
||||
}
|
||||
|
||||
// Construct a balanced concatenation from the given terms,
|
||||
// between indices begin (inclusive), and end (exclusive).
|
||||
// Note term[end].offset must be valid. term[end].vtxp is
|
||||
// never referenced.
|
||||
DfgVertex* constructConcat(const std::vector<ConcatTerm>& terms, const size_t begin,
|
||||
const size_t end) {
|
||||
UASSERT(end < terms.size(), "Invalid end");
|
||||
UASSERT(begin < end, "Invalid range");
|
||||
// Base case: just return the term
|
||||
if (end == begin + 1) return terms[begin].vtxp;
|
||||
|
||||
// Recursive case:
|
||||
// Compute the mid-point, trying to create roughly equal width intermediates
|
||||
const size_t width = terms[end].offset - terms[begin].offset;
|
||||
const size_t midOffset = width / 2 + terms[begin].offset;
|
||||
const auto beginIt = terms.begin() + begin;
|
||||
const auto endIt = terms.begin() + end;
|
||||
const auto midIt = std::lower_bound(beginIt + 1, endIt - 1, midOffset, //
|
||||
[&](const ConcatTerm& term, size_t value) { //
|
||||
return term.offset < value;
|
||||
});
|
||||
const size_t mid = begin + std::distance(beginIt, midIt);
|
||||
UASSERT(begin < mid && mid < end, "Must make some progress");
|
||||
// Construct the subtrees
|
||||
DfgVertex* const rhsp = constructConcat(terms, begin, mid);
|
||||
DfgVertex* const lhsp = constructConcat(terms, mid, end);
|
||||
// Construct new node
|
||||
AstNodeDType* const dtypep = DfgVertex::dtypeForWidth(lhsp->width() + rhsp->width());
|
||||
DfgConcat* const newp = new DfgConcat{m_dfg, lhsp->fileline(), dtypep};
|
||||
newp->rhsp(rhsp);
|
||||
newp->lhsp(lhsp);
|
||||
return newp;
|
||||
}
|
||||
|
||||
// Delete unused tree rooted at the given vertex
|
||||
void deleteTree(DfgVertexBinary* const vtxp) {
|
||||
UASSERT_OBJ(!vtxp->hasSinks(), vtxp, "Trying to remove used vertex");
|
||||
DfgVertexBinary* const lhsp = vtxp->lhsp()->cast<DfgVertexBinary>();
|
||||
DfgVertexBinary* const rhsp = vtxp->rhsp()->cast<DfgVertexBinary>();
|
||||
VL_DO_DANGLING(vtxp->unlinkDelete(m_dfg), vtxp);
|
||||
if (lhsp && !lhsp->hasSinks()) deleteTree(lhsp);
|
||||
if (rhsp && !rhsp->hasSinks()) deleteTree(rhsp);
|
||||
}
|
||||
|
||||
void balanceConcat(DfgConcat* const rootp) {
|
||||
// Gather all input vertices of the tree
|
||||
const std::vector<DfgVertex*> vtxps = gatherTerms<DfgConcat>(*rootp);
|
||||
// Don't bother with trivial trees
|
||||
if (vtxps.size() <= 3) return;
|
||||
|
||||
// Construct the terms Vector that we are going to do processing on
|
||||
std::vector<ConcatTerm> terms(vtxps.size() + 1);
|
||||
// These are redundant (constructor does the same), but here they are for clarity
|
||||
terms[0].offset = 0;
|
||||
terms[vtxps.size()].vtxp = nullptr;
|
||||
for (size_t i = 0; i < vtxps.size(); ++i) {
|
||||
terms[i].vtxp = vtxps[i];
|
||||
terms[i + 1].offset = terms[i].offset + vtxps[i]->width();
|
||||
}
|
||||
|
||||
// Round 1: try to create terms ending on VL_EDATASIZE boundaries.
|
||||
// This ensures we pack bits within a VL_EDATASIZE first is possible,
|
||||
// and then hopefully we can just assemble VL_EDATASIZE words afterward.
|
||||
std::vector<ConcatTerm> terms2;
|
||||
{
|
||||
terms2.reserve(terms.size());
|
||||
|
||||
size_t begin = 0; // Start of current range considered
|
||||
size_t end = 0; // End of current range considered
|
||||
size_t offset = 0; // Offset of current range considered
|
||||
|
||||
// Create a term from the current range
|
||||
const auto makeTerm = [&]() {
|
||||
DfgVertex* const vtxp = constructConcat(terms, begin, end);
|
||||
terms2.emplace_back(vtxp, offset);
|
||||
offset += vtxp->width();
|
||||
begin = end;
|
||||
};
|
||||
|
||||
// Create all terms ending on a boundary.
|
||||
while (++end < terms.size() - 1) {
|
||||
if (terms[end].offset % VL_EDATASIZE == 0) makeTerm();
|
||||
}
|
||||
// Final term. Loop condition above ensures this always exists,
|
||||
// and might or might not be on a boundary.
|
||||
makeTerm();
|
||||
// Sentinel term
|
||||
terms2.emplace_back(nullptr, offset);
|
||||
// should have ended up with the same number of bits at least...
|
||||
UASSERT(terms2.back().offset == terms.back().offset, "Inconsitent terms");
|
||||
}
|
||||
|
||||
// Round 2: Combine the partial terms
|
||||
rootp->replaceWith(constructConcat(terms2, 0, terms2.size() - 1));
|
||||
VL_DO_DANGLING(deleteTree(rootp), rootp);
|
||||
|
||||
++m_ctx.m_balancedConcats;
|
||||
}
|
||||
|
||||
DfgBalanceTrees(DfgGraph& dfg, V3DfgBalanceTreesContext& ctx)
|
||||
: m_dfg{dfg}
|
||||
, m_ctx{ctx} {
|
||||
// Find all roots
|
||||
std::vector<DfgConcat*> rootps;
|
||||
for (DfgVertex& vtx : dfg.opVertices()) {
|
||||
if (isRoot<DfgConcat>(vtx)) rootps.emplace_back(vtx.as<DfgConcat>());
|
||||
}
|
||||
// Balance them
|
||||
for (DfgConcat* const rootp : rootps) balanceConcat(rootp);
|
||||
}
|
||||
|
||||
public:
|
||||
static void apply(DfgGraph& dfg, V3DfgBalanceTreesContext& ctx) { DfgBalanceTrees{dfg, ctx}; }
|
||||
};
|
||||
|
||||
// Pass entry point: forwards to DfgBalanceTrees::apply on the given graph and stats context
void V3DfgPasses::balanceTrees(DfgGraph& dfg, V3DfgBalanceTreesContext& ctx) {
|
||||
DfgBalanceTrees::apply(dfg, ctx);
|
||||
}
|
@ -236,7 +236,7 @@ void V3DfgOptimizer::extract(AstNetlist* netlistp) {
|
||||
V3Global::dumpCheckGlobalTree("dfg-extract", 0, dumpTreeEitherLevel() >= 3);
|
||||
}
|
||||
|
||||
void V3DfgOptimizer::optimize(AstNetlist* netlistp, const string& label) {
|
||||
void V3DfgOptimizer::optimize(AstNetlist* netlistp, const string& label, bool lastInvocation) {
|
||||
UINFO(2, __FUNCTION__ << ": " << endl);
|
||||
|
||||
// NODE STATE
|
||||
@ -282,7 +282,7 @@ void V3DfgOptimizer::optimize(AstNetlist* netlistp, const string& label) {
|
||||
for (auto& component : acyclicComponents) {
|
||||
if (dumpDfgLevel() >= 7) component->dumpDotFilePrefixed(ctx.prefix() + "source");
|
||||
// Optimize the component
|
||||
V3DfgPasses::optimize(*component, ctx);
|
||||
V3DfgPasses::optimize(*component, ctx, lastInvocation);
|
||||
// Add back under the main DFG (we will convert everything back in one go)
|
||||
dfg->addGraph(*component);
|
||||
}
|
||||
|
@ -29,7 +29,7 @@ namespace V3DfgOptimizer {
|
||||
void extract(AstNetlist*) VL_MT_DISABLED;
|
||||
|
||||
// Optimize the design
|
||||
void optimize(AstNetlist*, const string& label) VL_MT_DISABLED;
|
||||
void optimize(AstNetlist*, const string& label, bool lastInvocation) VL_MT_DISABLED;
|
||||
} // namespace V3DfgOptimizer
|
||||
|
||||
#endif // Guard
|
||||
|
@ -42,6 +42,11 @@ V3DfgEliminateVarsContext::~V3DfgEliminateVarsContext() {
|
||||
m_varsRemoved);
|
||||
}
|
||||
|
||||
// On destruction, report the number of balanced concatenation trees as a
// statistic whose name includes this context's label
V3DfgBalanceTreesContext::~V3DfgBalanceTreesContext() {
|
||||
V3Stats::addStat("Optimizations, DFG " + m_label + " BalanceTrees, concat trees balanced",
|
||||
m_balancedConcats);
|
||||
}
|
||||
|
||||
static std::string getPrefix(const std::string& label) {
|
||||
if (label.empty()) return "";
|
||||
std::string str = VString::removeWhitespace(label);
|
||||
@ -332,7 +337,7 @@ void V3DfgPasses::eliminateVars(DfgGraph& dfg, V3DfgEliminateVarsContext& ctx) {
|
||||
for (AstVar* const varp : replacedVariables) varp->unlinkFrBack()->deleteTree();
|
||||
}
|
||||
|
||||
void V3DfgPasses::optimize(DfgGraph& dfg, V3DfgOptimizationContext& ctx) {
|
||||
void V3DfgPasses::optimize(DfgGraph& dfg, V3DfgOptimizationContext& ctx, bool lastInvocation) {
|
||||
// There is absolutely nothing useful we can do with a graph of size 2 or less
|
||||
if (dfg.size() <= 2) return;
|
||||
|
||||
@ -360,6 +365,10 @@ void V3DfgPasses::optimize(DfgGraph& dfg, V3DfgOptimizationContext& ctx) {
|
||||
}
|
||||
// Accumulate patterns for reporting
|
||||
if (v3Global.opt.stats()) ctx.m_patternStats.accumulate(dfg);
|
||||
// The peephole pass converts all trees to right leaning, so only do this on the last DFG run.
|
||||
if (lastInvocation) {
|
||||
apply(4, "balanceTrees", [&]() { balanceTrees(dfg, ctx.m_balanceTreesContext); });
|
||||
}
|
||||
apply(4, "regularize", [&]() { regularize(dfg, ctx.m_regularizeContext); });
|
||||
if (dumpDfgLevel() >= 8) dfg.dumpDotAllVarConesPrefixed(ctx.prefix() + "optimized");
|
||||
}
|
||||
|
@ -68,6 +68,17 @@ public:
|
||||
~V3DfgEliminateVarsContext() VL_MT_DISABLED;
|
||||
};
|
||||
|
||||
// Statistics context of the tree balancing pass. The counter is
// reported via V3Stats by the destructor, under the given label.
class V3DfgBalanceTreesContext final {
|
||||
const std::string m_label; // Label to apply to stats
|
||||
|
||||
public:
|
||||
VDouble0 m_balancedConcats; // Number of concatenation trees balanced
|
||||
|
||||
explicit V3DfgBalanceTreesContext(const std::string& label)
|
||||
: m_label{label} {}
|
||||
~V3DfgBalanceTreesContext() VL_MT_DISABLED;
|
||||
};
|
||||
|
||||
class V3DfgOptimizationContext final {
|
||||
const std::string m_label; // Label to add to stats, etc.
|
||||
const std::string m_prefix; // Prefix to add to file dumps (derived from label)
|
||||
@ -92,6 +103,7 @@ public:
|
||||
V3DfgPeepholeContext m_peepholeContext{m_label};
|
||||
V3DfgRegularizeContext m_regularizeContext{m_label};
|
||||
V3DfgEliminateVarsContext m_eliminateVarsContext{m_label};
|
||||
V3DfgBalanceTreesContext m_balanceTreesContext{m_label};
|
||||
|
||||
V3DfgPatternStats m_patternStats;
|
||||
|
||||
@ -112,7 +124,7 @@ namespace V3DfgPasses {
|
||||
DfgGraph* astToDfg(AstModule&, V3DfgOptimizationContext&) VL_MT_DISABLED;
|
||||
|
||||
// Optimize the given DfgGraph
|
||||
void optimize(DfgGraph&, V3DfgOptimizationContext&) VL_MT_DISABLED;
|
||||
void optimize(DfgGraph&, V3DfgOptimizationContext&, bool lastInvocation) VL_MT_DISABLED;
|
||||
|
||||
// Convert DfgGraph back into Ast, and insert converted graph back into its parent module.
|
||||
// Returns the parent module.
|
||||
@ -134,6 +146,8 @@ void regularize(DfgGraph&, V3DfgRegularizeContext&) VL_MT_DISABLED;
|
||||
void removeUnused(DfgGraph&) VL_MT_DISABLED;
|
||||
// Eliminate (remove or replace) redundant variables. Also removes resulting unused logic.
|
||||
void eliminateVars(DfgGraph&, V3DfgEliminateVarsContext&) VL_MT_DISABLED;
|
||||
// Make computation trees balanced
|
||||
void balanceTrees(DfgGraph&, V3DfgBalanceTreesContext&) VL_MT_DISABLED;
|
||||
|
||||
} // namespace V3DfgPasses
|
||||
|
||||
|
@ -286,7 +286,7 @@ static void process() {
|
||||
|
||||
if (v3Global.opt.fDfgPreInline()) {
|
||||
// Pre inline DFG optimization
|
||||
V3DfgOptimizer::optimize(v3Global.rootp(), "pre inline");
|
||||
V3DfgOptimizer::optimize(v3Global.rootp(), "pre inline", /* lastInvocation: */ false);
|
||||
}
|
||||
|
||||
if (!(v3Global.opt.serializeOnly() && !v3Global.opt.flatten())) {
|
||||
@ -303,7 +303,7 @@ static void process() {
|
||||
|
||||
if (v3Global.opt.fDfgPostInline()) {
|
||||
// Post inline DFG optimization
|
||||
V3DfgOptimizer::optimize(v3Global.rootp(), "post inline");
|
||||
V3DfgOptimizer::optimize(v3Global.rootp(), "post inline", /* lastInvocation: */ true);
|
||||
}
|
||||
|
||||
// --PRE-FLAT OPTIMIZATIONS------------------
|
||||
|
21
test_regress/t/t_dfg_balance_cats.py
Executable file
21
test_regress/t/t_dfg_balance_cats.py
Executable file
@ -0,0 +1,21 @@
|
||||
#!/usr/bin/env python3
|
||||
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
|
||||
#
|
||||
# Copyright 2024 by Wilson Snyder. This program is free software; you
|
||||
# can redistribute it and/or modify it under the terms of either the GNU
|
||||
# Lesser General Public License Version 3 or the Perl Artistic License
|
||||
# Version 2.0.
|
||||
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
|
||||
|
||||
import vltest_bootstrap
|
||||
|
||||
# Restrict this test to the 'vlt' scenario
test.scenarios('vlt')
|
||||
|
||||
# Compile with --stats so the statistics file checked below is produced
test.compile(verilator_flags2=["--stats"])
|
||||
|
||||
# Balancing only runs on the last (post inline) DFG invocation, so the
# pre inline counter is checked against 0 and the post inline one against 1
test.file_grep(test.stats,
|
||||
r' Optimizations, DFG pre inline BalanceTrees, concat trees balanced\s+(\d+)', 0)
|
||||
test.file_grep(test.stats,
|
||||
r' Optimizations, DFG post inline BalanceTrees, concat trees balanced\s+(\d+)', 1)
|
||||
|
||||
test.passes()
|
35
test_regress/t/t_dfg_balance_cats.v
Normal file
35
test_regress/t/t_dfg_balance_cats.v
Normal file
@ -0,0 +1,35 @@
|
||||
// DESCRIPTION: Verilator: Verilog Test module
|
||||
//
|
||||
// This file ONLY is placed under the Creative Commons Public Domain, for
|
||||
// any use, without warranty, 2024 by Wilson Snyder.
|
||||
// SPDX-License-Identifier: CC0-1.0
|
||||
|
||||
// verilator lint_off UNOPTFLAT
|
||||
|
||||
// Drives every bit of 'o' from 'i' through many narrow part-select
// assignments, giving the DFG pass a wide concatenation to balance.
module t(i, o);
|
||||
localparam N = 2000; // Deliberately not multiple of 32
|
||||
|
||||
input i;
|
||||
wire [N-1:0] i;
|
||||
|
||||
output o;
|
||||
wire [N-1:0] o;
|
||||
|
||||
// Each full 32-bit chunk of 'o' is driven by ten slices of widths
// 1, 1, 2, 4, 8, 8, 4, 2, 1, 1, taken from the opposite end of 'i'
for (genvar n = 0 ; n + 31 < N ; n += 32) begin
|
||||
assign o[n+ 0 +: 1] = i[(N-1-n)- 0 -: 1];
|
||||
assign o[n+ 1 +: 1] = i[(N-1-n)- 1 -: 1];
|
||||
assign o[n+ 2 +: 2] = i[(N-1-n)- 2 -: 2];
|
||||
assign o[n+ 4 +: 4] = i[(N-1-n)- 4 -: 4];
|
||||
assign o[n+ 8 +: 8] = i[(N-1-n)- 8 -: 8];
|
||||
assign o[n+16 +: 8] = i[(N-1-n)-16 -: 8];
|
||||
assign o[n+24 +: 4] = i[(N-1-n)-24 -: 4];
|
||||
assign o[n+28 +: 2] = i[(N-1-n)-28 -: 2];
|
||||
assign o[n+30 +: 1] = i[(N-1-n)-30 -: 1];
|
||||
assign o[n+31 +: 1] = i[(N-1-n)-31 -: 1];
|
||||
end
|
||||
|
||||
// The remaining bits above the last full 32-bit chunk are driven one bit at a time
for (genvar n = N / 32 * 32; n < N ; ++n) begin
|
||||
assign o[n] = i[N-1-n];
|
||||
end
|
||||
|
||||
endmodule
|
@ -17,6 +17,6 @@ test.compile(verilator_flags2=["-Wno-UNOPTTHREADS", "--stats", test.t_dir + "/t_
|
||||
test.execute()
|
||||
|
||||
if test.vlt:
|
||||
test.file_grep(test.stats, r'Optimizations, Const bit op reduction\s+(\d+)', 40)
|
||||
test.file_grep(test.stats, r'Optimizations, Const bit op reduction\s+(\d+)', 39)
|
||||
|
||||
test.passes()
|
||||
|
Loading…
Reference in New Issue
Block a user