diff --git a/Changes b/Changes
index 763307976..699de9efb 100644
--- a/Changes
+++ b/Changes
@@ -23,6 +23,13 @@ The contributors that suggested a given feature are shown in []. Thanks!
 
 ****  Fix queues and dynamic array wide ops. (#2352) [Vassilis Papaefstathiou]
 
+****  --output-split is now on by default. VM_PARALLEL_BUILDS is set by
+      default iff --output-split caused an actual file split to occur.
+      --output-split-cfuncs and --output-split-ctrace now default to the
+      value of --output-split. These changes should improve build times of
+      medium and large designs with default options. User makefiles may
+      require changes.
+
 
 * Verilator 4.034 2020-05-03
 
diff --git a/bin/verilator b/bin/verilator
index 2931f781a..8189e8c82 100755
--- a/bin/verilator
+++ b/bin/verilator
@@ -1122,17 +1122,15 @@ developers.
 
 Enables splitting the output .cpp files into multiple outputs.  When a C++
 file exceeds the specified number of operations, a new file will be created
-at the next function boundary.  In addition, any infrequently executed
-"cold" routines will be placed into __Slow files.  This accelerates
-compilation by as optimization can be disabled on the routines in __Slow,
-and the remaining files can be compiled on parallel machines.  Using
---output-split should have only a trivial impact on performance.  On one
-design --output-split 20000 resulted in splitting into approximately
-one-minute-compile chunks.
+at the next function boundary.  In addition, if the total output code size
+exceeds the specified value, VM_PARALLEL_BUILDS will be set to 1 by default
+in the generated make files, making parallel compilation possible. Using
+--output-split should have only a trivial impact on model performance, but
+can greatly improve C++ compilation speed. The use of I<ccache> (set for you
+if present at configure time) is also more effective with this option.
 
-Typically when using this, make with VM_PARALLEL_BUILDS=1 (set for you if
-using the default makefiles), and use I<ccache> (set for you if present at
-configure time).
+This option is on by default with a value of 20000. To disable, pass with a
+value of 0.
 
 =item --output-split-cfuncs I<statements>
 
@@ -1144,10 +1142,14 @@ worse with decreasing split values.  Note that this option is stronger than
 --output-split in the sense that --output-split will not split inside a
 function.
 
+Defaults to the value of --output-split, unless explicitly specified.
+
 =item --output-split-ctrace I<statements>
 
-Enables splitting trace functions in the output .cpp files into
-multiple functions.  Defaults to same setting as --output-split-cfuncs.
+Similar to --output-split-cfuncs, enables splitting trace functions in the
+output .cpp files into multiple functions.
+
+Defaults to the value of --output-split, unless explicitly specified.
 
 =item -P
 
@@ -2085,16 +2087,17 @@ line to make:
 
     make OPT_FAST="-Os -march=native -fno-stack-protector" -f Vour.mk Vour__ALL.a
 
-OPT_FAST specifies optimizations for those programs that are part of the
-fast path, mostly code that is executed every cycle.  OPT_SLOW specifies
-optimizations for slow-path files (plus tracing), which execute only
-rarely, yet take a long time to compile with optimization on.  OPT
-specifies overall optimization and affects all compiles, including those
-OPT_FAST and OPT_SLOW control.  For best results, use OPT="-Os
--march=native", and link with "-static".  Nearly the same results can be
-had with much better compile times with OPT_FAST="-O1 -fstrict-aliasing".
-Higher optimization such as "-O2" or "-O3" may help, but gcc compile times
-may be excessive under O3 on even medium sized designs.
+OPT_FAST specifies optimizations for those parts of the program that are on the
+fast path. This is mostly code that is executed every cycle.  OPT_SLOW
+specifies optimizations for slow-path files, which execute only rarely, yet
+take a long time to compile with optimization on.  OPT_SLOW is ignored if
+VM_PARALLEL_BUILDS is not 1, in which case all code is compiled with OPT_FAST.
+See also the C<--output-split> option.  OPT specifies overall optimization and
+affects all compiles, including those OPT_FAST and OPT_SLOW control.  For best
+results, use OPT="-Os -march=native", and link with "-static".  Nearly the same
+results can be had with much better compile times with OPT_FAST="-O1
+-fstrict-aliasing".  Higher optimization such as "-O2" or "-O3" may help, but
+gcc compile times may be excessive under O3 on even medium sized designs.
 
 Unfortunately, using the optimizer with SystemC files can result in
 compiles taking several minutes.  (The SystemC libraries have many little
@@ -5215,7 +5218,12 @@ test_regress/t/t_extend_class files show an example of how to do this.
 =item How do I get faster build times?
 
 When running make pass the make variable VM_PARALLEL_BUILDS=1 so that
-builds occur in parallel.
+builds occur in parallel. Note this is now set by default if the output
+code size exceeds the value of --output-split.
+
+Verilator emits any infrequently executed "cold" routines into separate
+__Slow.cpp files. This can accelerate compilation as optimization can be
+disabled on these routines. See the OPT_FAST and OPT_SLOW make variables.
 
 Use a recent compiler.  Newer compilers tend do be faster, with the
 now relatively old GCC 3.0 to 3.3 being horrible.
diff --git a/src/V3Options.cpp b/src/V3Options.cpp
index 547fa8df9..1cdd942db 100644
--- a/src/V3Options.cpp
+++ b/src/V3Options.cpp
@@ -640,6 +640,10 @@ void V3Options::notify() {
     // --trace-threads implies --threads 1 unless explicitly specified
     if (traceThreads() && !threads()) m_threads = 1;
 
+    // Default split limits if not specified
+    if (m_outputSplitCFuncs < 0) m_outputSplitCFuncs = m_outputSplit;
+    if (m_outputSplitCTrace < 0) m_outputSplitCTrace = m_outputSplit;
+
     if (v3Global.opt.main() && v3Global.opt.systemC()) {
         cmdfl->v3error("--main not usable with SystemC. Suggest see examples for sc_main().");
     }
@@ -1058,13 +1062,15 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
             } else if (!strcmp(sw, "-output-split-cfuncs") && (i + 1) < argc) {
                 shift;
                 m_outputSplitCFuncs = atoi(argv[i]);
-                if (m_outputSplitCFuncs
-                    && (!m_outputSplitCTrace || m_outputSplitCTrace > m_outputSplitCFuncs)) {
-                    m_outputSplitCTrace = m_outputSplitCFuncs;
+                if (m_outputSplitCFuncs < 0) {
+                    fl->v3error("--output-split-cfuncs must be >= 0: " << argv[i]);
                 }
-            } else if (!strcmp(sw, "-output-split-ctrace")) {  // Undocumented optimization tweak
+            } else if (!strcmp(sw, "-output-split-ctrace")) {
                 shift;
                 m_outputSplitCTrace = atoi(argv[i]);
+                if (m_outputSplitCTrace < 0) {
+                    fl->v3error("--output-split-ctrace must be >= 0: " << argv[i]);
+                }
             } else if (!strcmp(sw, "-protect-lib") && (i + 1) < argc) {
                 shift;
                 m_protectLib = argv[i];
@@ -1605,9 +1611,9 @@ V3Options::V3Options() {
     m_inlineMult = 2000;
     m_maxNumWidth = 65536;
     m_moduleRecursion = 100;
-    m_outputSplit = 0;
-    m_outputSplitCFuncs = 0;
-    m_outputSplitCTrace = 0;
+    m_outputSplit = 20000;
+    m_outputSplitCFuncs = -1;
+    m_outputSplitCTrace = -1;
     m_traceDepth = 0;
     m_traceMaxArray = 32;
     m_traceMaxWidth = 256;
diff --git a/test_regress/t/t_protect_ids_key.out b/test_regress/t/t_protect_ids_key.out
index 036238fb8..cfb58b841 100644
--- a/test_regress/t/t_protect_ids_key.out
+++ b/test_regress/t/t_protect_ids_key.out
@@ -22,6 +22,7 @@
   <map from="PSEGxK" to="__Vscope_t__secret_inst"/>
   <map from="PS25fg" to="__Vtask_dpix_a_task__1__i"/>
   <map from="PSHuZZ" to="_change_request"/>
+  <map from="PS75kd" to="_change_request_1"/>
   <map from="PSyTg5" to="_ctor_var_reset"/>
   <map from="PS8lsQ" to="_eval"/>
   <map from="PSKZ7c" to="_eval_debug_assertions"/>
diff --git a/test_regress/t/t_unopt_converge_initial_run_bad.pl b/test_regress/t/t_unopt_converge_initial_run_bad.pl
index 490ce1a5d..8825ddf1e 100755
--- a/test_regress/t/t_unopt_converge_initial_run_bad.pl
+++ b/test_regress/t/t_unopt_converge_initial_run_bad.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_unopt_converge_initial.v");
 
 compile(
-    v_flags2 => ['+define+ALLOW_UNOPT'],
+    v_flags2 => ['+define+ALLOW_UNOPT --output-split 0'],
     );
 
 execute(
diff --git a/test_regress/t/t_unopt_converge_print_bad.pl b/test_regress/t/t_unopt_converge_print_bad.pl
index eee9ef712..c20b4c98f 100755
--- a/test_regress/t/t_unopt_converge_print_bad.pl
+++ b/test_regress/t/t_unopt_converge_print_bad.pl
@@ -14,7 +14,7 @@ top_filename("t/t_unopt_converge.v");
 #$Self->{verilated_debug} = 1;
 
 compile(
-    v_flags2 => ['+define+ALLOW_UNOPT'],
+    v_flags2 => ['+define+ALLOW_UNOPT --output-split 0'],
     make_flags => 'CPPFLAGS_ADD=-DVL_DEBUG',
     );
 
diff --git a/test_regress/t/t_unopt_converge_run_bad.pl b/test_regress/t/t_unopt_converge_run_bad.pl
index d700863a7..ce55a1d72 100755
--- a/test_regress/t/t_unopt_converge_run_bad.pl
+++ b/test_regress/t/t_unopt_converge_run_bad.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_unopt_converge.v");
 
 compile(
-    v_flags2 => ['+define+ALLOW_UNOPT'],
+    v_flags2 => ['+define+ALLOW_UNOPT --output-split 0'],
     );
 
 execute(
diff --git a/test_regress/t/t_verilated_debug.out b/test_regress/t/t_verilated_debug.out
index f1bcca561..f4bc23936 100644
--- a/test_regress/t/t_verilated_debug.out
+++ b/test_regress/t/t_verilated_debug.out
@@ -11,9 +11,11 @@ internalsDump:
 -V{t#,#}+    Vt_verilated_debug::_eval_settle
 -V{t#,#}+    Vt_verilated_debug::_eval
 -V{t#,#}+    Vt_verilated_debug::_change_request
+-V{t#,#}+    Vt_verilated_debug::_change_request_1
 -V{t#,#}+ Clock loop
 -V{t#,#}+    Vt_verilated_debug::_eval
 -V{t#,#}+    Vt_verilated_debug::_change_request
+-V{t#,#}+    Vt_verilated_debug::_change_request_1
 -V{t#,#}+++++TOP Evaluate Vt_verilated_debug::eval
 -V{t#,#}+    Vt_verilated_debug::_eval_debug_assertions
 -V{t#,#}+ Clock loop
@@ -21,4 +23,5 @@ internalsDump:
 -V{t#,#}+    Vt_verilated_debug::_sequent__TOP__1
 *-* All Finished *-*
 -V{t#,#}+    Vt_verilated_debug::_change_request
+-V{t#,#}+    Vt_verilated_debug::_change_request_1
 -V{t#,#}+    Vt_verilated_debug::final