From 77ef2cd487369128521caa432e05e7a361b4d3bb Mon Sep 17 00:00:00 2001 From: Geza Lore Date: Sun, 10 Nov 2024 15:51:59 +0000 Subject: [PATCH] Split up assignments to wides with Concat on the RHS (#5599) Add a new pass to split up (recursively): foo = {l, r}; into the following, with the right indices, iff the concatenation straddles a wide word boundary. foo[_:_] = r; foo[_:_] = l; This eliminates more wide temporaries. Another 23% speedup on VeeR EH2 high_perf. Also brings the predicted stack size from 8M to 40k. --- docs/guide/exe_verilator.rst | 4 + src/CMakeLists.txt | 2 + src/Makefile_obj.in | 1 + src/V3FuncOpt.cpp | 182 ++++++++++++++++++++ src/V3FuncOpt.h | 32 ++++ src/V3Options.cpp | 4 + src/V3Options.h | 3 + src/Verilator.cpp | 4 + test_regress/t/t_dfg_balance_cats.py | 3 + test_regress/t/t_dfg_balance_cats_nofunc.py | 26 +++ 10 files changed, 261 insertions(+) create mode 100644 src/V3FuncOpt.cpp create mode 100644 src/V3FuncOpt.h create mode 100755 test_regress/t/t_dfg_balance_cats_nofunc.py diff --git a/docs/guide/exe_verilator.rst b/docs/guide/exe_verilator.rst index 0ffe5afb3..e70f70b71 100644 --- a/docs/guide/exe_verilator.rst +++ b/docs/guide/exe_verilator.rst @@ -589,6 +589,10 @@ Summary: .. option:: -fno-expand +.. option:: -fno-func-opt + +.. option:: -fno-func-opt-split-cat + .. option:: -fno-gate .. 
option:: -fno-inline diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9049fc215..9b1aac1d0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -95,6 +95,7 @@ set(HEADERS V3Force.h V3Fork.h V3FunctionTraits.h + V3FuncOpt.h V3Gate.h V3Global.h V3Graph.h @@ -255,6 +256,7 @@ set(COMMON_SOURCES V3FileLine.cpp V3Force.cpp V3Fork.cpp + V3FuncOpt.cpp V3Gate.cpp V3Global.cpp V3Graph.cpp diff --git a/src/Makefile_obj.in b/src/Makefile_obj.in index 0e972fc71..0945e4690 100644 --- a/src/Makefile_obj.in +++ b/src/Makefile_obj.in @@ -204,6 +204,7 @@ RAW_OBJS_PCH_ASTMT = \ V3EmitCPch.o \ V3EmitV.o \ V3File.o \ + V3FuncOpt.o \ V3Global.o \ V3Hasher.o \ V3Number.o \ diff --git a/src/V3FuncOpt.cpp b/src/V3FuncOpt.cpp new file mode 100644 index 000000000..f71621a24 --- /dev/null +++ b/src/V3FuncOpt.cpp @@ -0,0 +1,182 @@ +// -*- mode: C++; c-file-style: "cc-mode" -*- +//************************************************************************* +// DESCRIPTION: Verilator: Generic optimizations on a per function basis +// +// Code available from: https://verilator.org +// +//************************************************************************* +// +// Copyright 2003-2024 by Wilson Snyder. This program is free software; you +// can redistribute it and/or modify it under the terms of either the GNU +// Lesser General Public License Version 3 or the Perl Artistic License +// Version 2.0. 
+// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 +// +//************************************************************************* +// +// - Split assignments to wide locations with Concat on the RHS +// at word boundaries: +// foo = {l, r}; +// becomes (recursively): +// foo[_:_] = r; +// foo[_:_] = l; +// +//************************************************************************* + +#include "V3PchAstMT.h" + +#include "V3FuncOpt.h" + +#include "V3Global.h" +#include "V3Stats.h" +#include "V3ThreadPool.h" + +VL_DEFINE_DEBUG_FUNCTIONS; + +class FuncOptVisitor final : public VNVisitor { + // NODE STATE + // AstNodeAssign::user1() -> bool. Already checked, safe to split. Omit expensive check. + + // STATE - Statistic tracking + VDouble0 m_concatSplits; // Number of splits in assignments with Concat on RHS + + // True for e.g.: foo = foo >> 1; or foo[foo[0]] = ...; + static bool readsLhs(AstNodeAssign* nodep) { + // It is expected that the number of vars written on the LHS is very small (should be 1). 
+ std::unordered_set<const AstVar*> lhsWrVarps; + std::unordered_set<const AstVar*> lhsRdVarps; + nodep->lhsp()->foreach([&](const AstVarRef* refp) { + if (refp->access().isWriteOrRW()) lhsWrVarps.emplace(refp->varp()); + if (refp->access().isReadOrRW()) lhsRdVarps.emplace(refp->varp()); + }); + + // Common case of 1 variable on the LHS - special handling for speed + if (lhsWrVarps.size() == 1) { + const AstVar* const lhsWrVarp = *lhsWrVarps.begin(); + // Check Rhs doesn't read the written var + const bool rhsReadsWritten = nodep->rhsp()->exists([=](const AstVarRef* refp) { // + return refp->varp() == lhsWrVarp; + }); + if (rhsReadsWritten) return true; + // Check Lhs doesn't read the written var + return lhsRdVarps.count(lhsWrVarp); + } + + // Generic case of multiple vars written on LHS + // TODO: this might be impossible due to earlier transforms, not sure + // Check Rhs doesn't read the written vars + const bool rhsReadsWritten = nodep->rhsp()->exists([&](const AstVarRef* refp) { // + return lhsWrVarps.count(refp->varp()); + }); + if (rhsReadsWritten) return true; + // Check Lhs doesn't read the written vars + for (const AstVar* const lhsWrVarp : lhsWrVarps) { + if (lhsRdVarps.count(lhsWrVarp)) return true; + } + return false; + } + + // METHODS + // Split wide assignments with a wide concatenation on the RHS. + // Returns true if 'nodep' was deleted + bool splitConcat(AstNodeAssign* nodep) { + UINFO(9, "splitConcat " << nodep << "\n"); + // Only care about concatenations on the right + AstConcat* const rhsp = VN_CAST(nodep->rhsp(), Concat); + if (!rhsp) return false; + // Will need the LHS + AstNodeExpr* lhsp = nodep->lhsp(); + UASSERT_OBJ(lhsp->width() == rhsp->width(), nodep, "Inconsistent assignment"); + // Only consider pure assignments. Nodes inserted below are safe. 
+ if (!nodep->user1() && (!lhsp->isPure() || !rhsp->isPure())) return false; + // Check for a Sel on the LHS if present, and skip over it + uint32_t lsb = 0; + if (AstSel* const selp = VN_CAST(lhsp, Sel)) { + if (AstConst* const lsbp = VN_CAST(selp->lsbp(), Const)) { + lhsp = selp->fromp(); + lsb = lsbp->toUInt(); + } else { + // Don't optimize if it's a variable select + return false; + } + } + // No need to split assignments targeting storage smaller than a machine register + if (lhsp->width() <= VL_QUADSIZE) return false; + + // If it's a concat straddling a word boundary, try to split it. + // The next visit on the new nodes will split it recursively. + // Otherwise, keep the original assignment. + const int lsbWord = lsb / VL_EDATASIZE; + const int msbWord = (lsb + rhsp->width() - 1) / VL_EDATASIZE; + if (lsbWord == msbWord) return false; + + // If the RHS reads the LHS, we can't actually do this. Nodes inserted below are safe. + if (!nodep->user1() && readsLhs(nodep)) return false; + + // Ok, actually split it now + UINFO(5, "splitConcat optimizing " << nodep << "\n"); + ++m_concatSplits; + // The 2 parts and their offsets + AstNodeExpr* const rrp = rhsp->rhsp()->unlinkFrBack(); + AstNodeExpr* const rlp = rhsp->lhsp()->unlinkFrBack(); + const int rLsb = lsb; + const int lLsb = lsb + rrp->width(); + // Insert the 2 assignments right after the original. They will be visited next. + AstAssign* const arp = new AstAssign{ + nodep->fileline(), + new AstSel{lhsp->fileline(), lhsp->cloneTreePure(false), rLsb, rrp->width()}, rrp}; + AstAssign* const alp = new AstAssign{ + nodep->fileline(), + new AstSel{lhsp->fileline(), lhsp->unlinkFrBack(), lLsb, rlp->width()}, rlp}; + nodep->addNextHere(arp); + arp->addNextHere(alp); + // Safe to split these. 
+ arp->user1(true); + alp->user1(true); + // Nuke what is left + VL_DO_DANGLING(pushDeletep(nodep->unlinkFrBack()), nodep); + return true; + } + + // VISIT + void visit(AstNodeAssign* nodep) override { + // TODO: Only thing remaining inside functions should be AstAssign (that is, an actual + // assignment statement), but we still use AstAssignW, AstAssignDly, and all, fix. + if (v3Global.opt.fFuncSplitCat()) { + if (splitConcat(nodep)) return; // Must return here, in case more code is added below + } + } + + void visit(AstNodeExpr*) override {} // No need to descend further (Ignore AstExprStmt...) + + void visit(AstNode* nodep) override { iterateChildren(nodep); } + + // CONSTRUCTORS + explicit FuncOptVisitor(AstCFunc* funcp) { iterateChildren(funcp); } + ~FuncOptVisitor() override { + V3Stats::addStatSum("Optimizations, FuncOpt concat splits", m_concatSplits); + } + +public: + static void apply(AstCFunc* funcp) { FuncOptVisitor{funcp}; } +}; + +//###################################################################### + +void V3FuncOpt::funcOptAll(AstNetlist* nodep) { + UINFO(2, __FUNCTION__ << ": " << endl); + { + const VNUser1InUse user1InUse; + V3ThreadScope threadScope; + for (AstNodeModule *modp = nodep->modulesp(), *nextModp; modp; modp = nextModp) { + nextModp = VN_AS(modp->nextp(), NodeModule); + for (AstNode *nodep = modp->stmtsp(), *nextNodep; nodep; nodep = nextNodep) { + nextNodep = nodep->nextp(); + if (AstCFunc* const cfuncp = VN_CAST(nodep, CFunc)) { + threadScope.enqueue([cfuncp]() { FuncOptVisitor::apply(cfuncp); }); + } + } + } + } + V3Global::dumpCheckGlobalTree("funcopt", 0, dumpTreeEitherLevel() >= 3); +} diff --git a/src/V3FuncOpt.h b/src/V3FuncOpt.h new file mode 100644 index 000000000..d6c1de2d3 --- /dev/null +++ b/src/V3FuncOpt.h @@ -0,0 +1,32 @@ +// -*- mode: C++; c-file-style: "cc-mode" -*- +//************************************************************************* +// DESCRIPTION: Verilator: Generic optimizations on a per function basis 
+// +// Code available from: https://verilator.org +// +//************************************************************************* +// +// Copyright 2003-2024 by Wilson Snyder. This program is free software; you +// can redistribute it and/or modify it under the terms of either the GNU +// Lesser General Public License Version 3 or the Perl Artistic License +// Version 2.0. +// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 +// +//************************************************************************* + +#ifndef VERILATOR_V3FUNCOPT_H_ +#define VERILATOR_V3FUNCOPT_H_ + +#include "config_build.h" +#include "verilatedos.h" + +class AstNetlist; + +//============================================================================ + +class V3FuncOpt final { +public: + static void funcOptAll(AstNetlist* nodep); +}; + +#endif // Guard diff --git a/src/V3Options.cpp b/src/V3Options.cpp index ae32fdf3d..11a154a43 100644 --- a/src/V3Options.cpp +++ b/src/V3Options.cpp @@ -1303,6 +1303,10 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, DECL_OPTION("-fdead-assigns", FOnOff, &m_fDeadAssigns); DECL_OPTION("-fdead-cells", FOnOff, &m_fDeadCells); DECL_OPTION("-fexpand", FOnOff, &m_fExpand); + DECL_OPTION("-ffunc-opt", CbFOnOff, [this](bool flag) { // + m_fFuncSplitCat = flag; + }); + DECL_OPTION("-ffunc-opt-split-cat", FOnOff, &m_fFuncSplitCat); DECL_OPTION("-fgate", FOnOff, &m_fGate); DECL_OPTION("-finline", FOnOff, &m_fInline); DECL_OPTION("-flife", FOnOff, &m_fLife); diff --git a/src/V3Options.h b/src/V3Options.h index 2ac99bf04..5eaa0aebd 100644 --- a/src/V3Options.h +++ b/src/V3Options.h @@ -384,6 +384,7 @@ private: bool m_fDeadAssigns; // main switch: -fno-dead-assigns: remove dead assigns bool m_fDeadCells; // main switch: -fno-dead-cells: remove dead cells bool m_fExpand; // main switch: -fno-expand: expansion of C macros + bool m_fFuncSplitCat = true; // main switch: -fno-func-opt-split-cat: split wide assignments with Concat on the RHS bool m_fGate; // main 
switch: -fno-gate: gate wire elimination bool m_fInline; // main switch: -fno-inline: module inlining bool m_fLife; // main switch: -fno-life: variable lifetime @@ -674,6 +675,8 @@ public: bool fDeadAssigns() const { return m_fDeadAssigns; } bool fDeadCells() const { return m_fDeadCells; } bool fExpand() const { return m_fExpand; } + bool fFuncSplitCat() const { return m_fFuncSplitCat; } + bool fFunc() const { return fFuncSplitCat(); } bool fGate() const { return m_fGate; } bool fInline() const { return m_fInline; } bool fLife() const { return m_fLife; } diff --git a/src/Verilator.cpp b/src/Verilator.cpp index 92d3f53de..1c4d58cfa 100644 --- a/src/Verilator.cpp +++ b/src/Verilator.cpp @@ -53,6 +53,7 @@ #include "V3File.h" #include "V3Force.h" #include "V3Fork.h" +#include "V3FuncOpt.h" #include "V3Gate.h" #include "V3Global.h" #include "V3Graph.h" @@ -497,6 +498,9 @@ static void process() { // --GENERATION------------------ if (!v3Global.opt.serializeOnly()) { + // Generic optimizations on a per-function basis + if (v3Global.opt.fFunc()) V3FuncOpt::funcOptAll(v3Global.rootp()); + // Remove unused vars V3Const::constifyAll(v3Global.rootp()); V3Dead::deadifyAll(v3Global.rootp()); diff --git a/test_regress/t/t_dfg_balance_cats.py b/test_regress/t/t_dfg_balance_cats.py index 0a4055967..93de94adf 100755 --- a/test_regress/t/t_dfg_balance_cats.py +++ b/test_regress/t/t_dfg_balance_cats.py @@ -17,5 +17,8 @@ test.file_grep(test.stats, r' Optimizations, DFG pre inline BalanceTrees, concat trees balanced\s+(\d+)', 0) test.file_grep(test.stats, r' Optimizations, DFG post inline BalanceTrees, concat trees balanced\s+(\d+)', 1) +test.file_grep(test.stats, r'Optimizations, DFG pre inline Dfg2Ast, result equations\s+(\d+)', 1) +test.file_grep(test.stats, r'Optimizations, DFG post inline Dfg2Ast, result equations\s+(\d+)', 1) +test.file_grep(test.stats, r'Optimizations, FuncOpt concat splits\s+(\d+)', 62) test.passes() diff --git a/test_regress/t/t_dfg_balance_cats_nofunc.py 
b/test_regress/t/t_dfg_balance_cats_nofunc.py new file mode 100755 index 000000000..d57622f3a --- /dev/null +++ b/test_regress/t/t_dfg_balance_cats_nofunc.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +# DESCRIPTION: Verilator: Verilog Test driver/expect definition +# +# Copyright 2024 by Wilson Snyder. This program is free software; you +# can redistribute it and/or modify it under the terms of either the GNU +# Lesser General Public License Version 3 or the Perl Artistic License +# Version 2.0. +# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 + +import vltest_bootstrap + +test.scenarios('vlt') + +test.top_filename = "t/t_dfg_balance_cats.v" + +test.compile(verilator_flags2=["--stats", "-fno-func-opt"]) + +test.file_grep(test.stats, + r' Optimizations, DFG pre inline BalanceTrees, concat trees balanced\s+(\d+)', 0) +test.file_grep(test.stats, + r' Optimizations, DFG post inline BalanceTrees, concat trees balanced\s+(\d+)', 1) +test.file_grep(test.stats, r'Optimizations, DFG pre inline Dfg2Ast, result equations\s+(\d+)', 1) +test.file_grep(test.stats, r'Optimizations, DFG post inline Dfg2Ast, result equations\s+(\d+)', 1) +test.file_grep_not(test.stats, r'Optimizations, FuncOpt concat splits') + +test.passes()