Improve run-time profiling

The --prof-threads option has been split into two independent options:
1. --prof-exec, for collecting verilator_gantt and other execution
related profiling data, and
2. --prof-pgo, for collecting data needed for PGO

The implementation of execution profiling has been extracted from
VlThreadPool into a separate class, VlExecutionProfiler. This means
--prof-exec can now be used for single-threaded models (though it does not
yet measure much there). For consistency, VerilatedProfiler is renamed
VlPgoProfiler. Both VlExecutionProfiler and VlPgoProfiler live in
verilated_profiler.{h/cpp}, but can be used completely independently.
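
To illustrate the independence, here is a minimal sketch of the VlPgoProfiler
interface (assumptions: compiled with VL_PROFILER defined; the counter index,
mtask name, model name, and filename are illustrative, not what generated code
uses):

#include "verilated_profiler.h"

// Hypothetical usage sketch; not the code Verilator emits.
static VlPgoProfiler<1> s_pgoProfiler;  // one counter slot

void pgoSetup() { s_pgoProfiler.addCounter(0, "mtask123"); }

void pgoTimeMtask() {
    s_pgoProfiler.startCounter(0);  // subtracts the current CPU tick count
    // ... mtask body would run here ...
    s_pgoProfiler.stopCounter(0);   // adds the current tick count, accumulating elapsed ticks
}

void pgoFinish() { s_pgoProfiler.write("Vtop", "profile.vlt"); }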

Also reworked the execution profile format so it now emits only events,
without holding onto any temporaries. This is in preparation for future
optimizations that would be hindered by introducing function locals via
AstText.
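
In the new scheme, generated code simply appends self-contained records via
the VL_EXEC_TRACE_ADD_RECORD macro; a sketch of the pattern (the mtask id and
prediction values are illustrative, and the exact emitted text may differ):

// Each call appends one trivially destructible record to the thread-local trace;
// nothing is held in a function local between the begin and end events.
VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskBegin(7, /* predictStart */ 120);
// ... mtask 7 body ...
VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskEnd(7, /* predictCost */ 95);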

Also removed the Barrier event. Clearing the profile buffers is not
notably more expensive as the profiling records are trivially
destructible.
Author: Geza Lore, 2022-03-25 19:46:50 +00:00
Parent: c7440b250f
Commit: b1b5b5dfe2
40 changed files with 900 additions and 712 deletions


@ -13,6 +13,7 @@ Verilator 4.221 devel
**Minor:**
* Split --prof-threads into --prof-exec and --prof-pgo (#3365). [Geza Lore, Shunyao CAD]
Verilator 4.220 2022-03-12


@ -370,7 +370,8 @@ detailed descriptions of these arguments.
--prefix <topname> Name of top level class
--prof-c Compile C++ code with profiling
--prof-cfuncs Name functions for profiling
--prof-threads Enable generating gantt chart data for threads
--prof-exec Enable generating execution profile for gantt chart
--prof-pgo Enable generating profiling data for PGO
--protect-key <key> Key for symbol protection
--protect-ids Hash identifier names for obscurity
--protect-lib <name> Create a DPI protected library
@ -445,10 +446,10 @@ description of these arguments.
+verilator+error+limit+<value> Set error limit
+verilator+help Display help
+verilator+noassert Disable assert checking
+verilator+prof+threads+file+<filename> Set profile filename
+verilator+prof+threads+start+<value> Set profile starting point
+verilator+prof+threads+window+<value> Set profile duration
+verilator+prof+vlt+file+<filename> Set profile guided filename
+verilator+prof+exec+file+<filename> Set execution profile filename
+verilator+prof+exec+start+<value> Set execution profile starting point
+verilator+prof+exec+window+<value> Set execution profile duration
+verilator+prof+vlt+file+<filename> Set PGO profile filename
+verilator+rand+reset+<value> Set random reset technique
+verilator+seed+<value> Set random seed
+verilator+V Verbose version and config


@ -9,7 +9,7 @@ import re
import statistics
# from pprint import pprint
Threads = collections.defaultdict(lambda: {})
Threads = collections.defaultdict(lambda: collections.defaultdict(lambda: {}))
Mtasks = collections.defaultdict(lambda: {})
Evals = collections.defaultdict(lambda: {})
EvalLoops = collections.defaultdict(lambda: {})
@ -30,12 +30,12 @@ def process(filename):
def read_data(filename):
with open(filename) as fh:
re_prof = re.compile(
r'^VLPROF mtask\s(\d+)\sstart\s(\d+)\selapsed\s(\d+)\spredict_start\s(\d+)\spredict_cost\s(\d+)\scpu\s(\d+)\son thread (\d+)'
)
re_eval = re.compile(r'^VLPROF eval\sstart\s(\d+)\selapsed\s(\d+)')
re_loop = re.compile(
r'^VLPROF eval_loop\sstart\s(\d+)\selapsed\s(\d+)')
re_thread = re.compile(r'^VLPROFTHREAD (\d+)$')
re_record = re.compile(r'^VLPROFEXEC (\S+) (\d+)(.*)$')
re_payload_mtaskBegin = re.compile(
r'id (\d+) predictStart (\d+) cpu (\d+)')
re_payload_mtaskEnd = re.compile(r'id (\d+) predictCost (\d+)')
re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.]*)\s*')
re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.]*)\s*$')
re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
@ -43,46 +43,59 @@ def read_data(filename):
re_proc_cpu = re.compile(r'VLPROFPROC processor\s*:\s*(\d+)\s*$')
re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$')
cpu = None
thread = None
lastEvalBeginTick = None
lastEvalLoopBeginTick = None
for line in fh:
if re_prof.match(line):
match = re_prof.match(line)
mtask = int(match.group(1))
start = int(match.group(2))
elapsed_time = int(match.group(3))
end = start + elapsed_time
predict_start = int(match.group(4))
predict_cost = int(match.group(5))
cpu = int(match.group(6))
thread = int(match.group(7))
if start not in Threads[thread]:
Threads[thread][start] = {}
Threads[thread][start]['mtask'] = mtask
Threads[thread][start]['end'] = end
Threads[thread][start]['cpu'] = cpu
Threads[thread][start]['predict_start'] = predict_start
Threads[thread][start]['predict_cost'] = predict_cost
if 'elapsed' not in Mtasks[mtask]:
Mtasks[mtask] = {'end': 0, 'elapsed': 0}
Mtasks[mtask]['thread'] = thread
Mtasks[mtask]['elapsed'] += elapsed_time
Mtasks[mtask]['predict_start'] = predict_start
Mtasks[mtask]['predict_cost'] = predict_cost
Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], end)
elif re_eval.match(line):
match = re_eval.match(line)
start = int(match.group(1))
elapsed_time = int(match.group(2))
Evals[start]['start'] = start
Evals[start]['end'] = start + elapsed_time
elif re_loop.match(line):
match = re_loop.match(line)
start = int(match.group(1))
elapsed_time = int(match.group(2))
EvalLoops[start]['start'] = start
EvalLoops[start]['end'] = start + elapsed_time
elif re.match(r'^VLPROFTHREAD', line):
recordMatch = re_record.match(line)
if recordMatch:
kind, tick, payload = recordMatch.groups()
tick = int(tick)
payload = payload.strip()
if kind == "EVAL_BEGIN":
Evals[tick]['start'] = tick
lastEvalBeginTick = tick
elif kind == "EVAL_END":
Evals[lastEvalBeginTick]['end'] = tick
lastEvalBeginTick = None
elif kind == "EVAL_LOOP_BEGIN":
EvalLoops[tick]['start'] = tick
lastEvalLoopBeginTick = tick
elif kind == "EVAL_LOOP_END":
EvalLoops[lastEvalLoopBeginTick]['end'] = tick
lastEvalLoopBeginTick = None
elif kind == "MTASK_BEGIN":
mtask, predict_start, ecpu = re_payload_mtaskBegin.match(
payload).groups()
mtask = int(mtask)
predict_start = int(predict_start)
ecpu = int(ecpu)
Threads[thread][tick]['mtask'] = mtask
Threads[thread][tick]['predict_start'] = predict_start
Threads[thread][tick]['cpu'] = ecpu
if 'elapsed' not in Mtasks[mtask]:
Mtasks[mtask] = {'end': 0, 'elapsed': 0}
Mtasks[mtask]['begin'] = tick
Mtasks[mtask]['thread'] = thread
Mtasks[mtask]['predict_start'] = predict_start
elif kind == "MTASK_END":
mtask, predict_cost = re_payload_mtaskEnd.match(
payload).groups()
mtask = int(mtask)
predict_cost = int(predict_cost)
begin = Mtasks[mtask]['begin']
Threads[thread][begin]['end'] = tick
Threads[thread][begin]['predict_cost'] = predict_cost
Mtasks[mtask]['elapsed'] += tick - begin
Mtasks[mtask]['predict_cost'] = predict_cost
Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], tick)
elif Args.debug:
print("-Unknown execution trace record: %s" % line)
elif re_thread.match(line):
thread = int(re_thread.match(line).group(1))
elif re.match(r'^VLPROF(THREAD|VERSION)', line):
pass
elif re_arg1.match(line):
match = re_arg1.match(line)
@ -131,11 +144,12 @@ def report():
plus = "+" if re.match(r'^\+', arg) else " "
print(" %s%s%s" % (arg, plus, Global['args'][arg]))
nthreads = len(Threads)
nthreads = int(Global['stats']['threads'])
Global['cpus'] = {}
for thread in Threads:
# Make potentially multiple characters per column
for start in Threads[thread]:
if not Threads[thread][start]: continue
cpu = Threads[thread][start]['cpu']
elapsed = Threads[thread][start]['end'] - start
if cpu not in Global['cpus']:
@ -169,74 +183,79 @@ def report():
print("\nAnalysis:")
print(" Total threads = %d" % nthreads)
print(" Total mtasks = %d" % len(Mtasks))
ncpus = len(Global['cpus'])
ncpus = max(len(Global['cpus']), 1)
print(" Total cpus used = %d" % ncpus)
print(" Total yields = %d" % int(Global['stats']['yields']))
print(" Total yields = %d" %
int(Global['stats'].get('yields', 0)))
print(" Total evals = %d" % len(Evals))
print(" Total eval loops = %d" % len(EvalLoops))
print(" Total eval time = %d rdtsc ticks" %
Global['measured_last_end'])
print(" Longest mtask time = %d rdtsc ticks" % long_mtask_time)
print(" All-thread mtask time = %d rdtsc ticks" %
measured_mt_mtask_time)
long_efficiency = long_mtask_time / (Global.get('measured_last_end', 1)
or 1)
print(" Longest-thread efficiency = %0.1f%%" % (long_efficiency * 100.0))
mt_efficiency = measured_mt_mtask_time / (
Global.get('measured_last_end', 1) * nthreads or 1)
print(" All-thread efficiency = %0.1f%%" % (mt_efficiency * 100.0))
print(" All-thread speedup = %0.1f" % (mt_efficiency * nthreads))
if Global['rdtsc_cycle_time'] > 0:
ut = measured_mt_mtask_time / Global['rdtsc_cycle_time']
print("tot_mtask_cpu=" + measured_mt_mtask_time + " cyc=" +
Global['rdtsc_cycle_time'] + " ut=" + ut)
if Mtasks:
print(" Total eval time = %d rdtsc ticks" %
Global['measured_last_end'])
print(" Longest mtask time = %d rdtsc ticks" % long_mtask_time)
print(" All-thread mtask time = %d rdtsc ticks" %
measured_mt_mtask_time)
long_efficiency = long_mtask_time / (Global.get(
'measured_last_end', 1) or 1)
print(" Longest-thread efficiency = %0.1f%%" %
(long_efficiency * 100.0))
mt_efficiency = measured_mt_mtask_time / (
Global.get('measured_last_end', 1) * nthreads or 1)
print(" All-thread efficiency = %0.1f%%" %
(mt_efficiency * 100.0))
print(" All-thread speedup = %0.1f" %
(mt_efficiency * nthreads))
if Global['rdtsc_cycle_time'] > 0:
ut = measured_mt_mtask_time / Global['rdtsc_cycle_time']
print("tot_mtask_cpu=" + measured_mt_mtask_time + " cyc=" +
Global['rdtsc_cycle_time'] + " ut=" + ut)
predict_mt_efficiency = predict_mt_mtask_time / (
Global.get('predict_last_end', 1) * nthreads or 1)
print("\nPrediction (what Verilator used for scheduling):")
print(" All-thread efficiency = %0.1f%%" %
(predict_mt_efficiency * 100.0))
print(" All-thread speedup = %0.1f" %
(predict_mt_efficiency * nthreads))
predict_mt_efficiency = predict_mt_mtask_time / (
Global.get('predict_last_end', 1) * nthreads or 1)
print("\nPrediction (what Verilator used for scheduling):")
print(" All-thread efficiency = %0.1f%%" %
(predict_mt_efficiency * 100.0))
print(" All-thread speedup = %0.1f" %
(predict_mt_efficiency * nthreads))
p2e_ratios = []
min_p2e = 1000000
min_mtask = None
max_p2e = -1000000
max_mtask = None
p2e_ratios = []
min_p2e = 1000000
min_mtask = None
max_p2e = -1000000
max_mtask = None
for mtask in sorted(Mtasks.keys()):
if Mtasks[mtask]['elapsed'] > 0:
if Mtasks[mtask]['predict_cost'] == 0:
Mtasks[mtask]['predict_cost'] = 1 # don't log(0) below
p2e_ratio = math.log(Mtasks[mtask]['predict_cost'] /
Mtasks[mtask]['elapsed'])
p2e_ratios.append(p2e_ratio)
for mtask in sorted(Mtasks.keys()):
if Mtasks[mtask]['elapsed'] > 0:
if Mtasks[mtask]['predict_cost'] == 0:
Mtasks[mtask]['predict_cost'] = 1 # don't log(0) below
p2e_ratio = math.log(Mtasks[mtask]['predict_cost'] /
Mtasks[mtask]['elapsed'])
p2e_ratios.append(p2e_ratio)
if p2e_ratio > max_p2e:
max_p2e = p2e_ratio
max_mtask = mtask
if p2e_ratio < min_p2e:
min_p2e = p2e_ratio
min_mtask = mtask
if p2e_ratio > max_p2e:
max_p2e = p2e_ratio
max_mtask = mtask
if p2e_ratio < min_p2e:
min_p2e = p2e_ratio
min_mtask = mtask
print("\nStatistics:")
print(" min log(p2e) = %0.3f" % min_p2e, end="")
print(" from mtask %d (predict %d," %
(min_mtask, Mtasks[min_mtask]['predict_cost']),
end="")
print(" elapsed %d)" % Mtasks[min_mtask]['elapsed'])
print(" max log(p2e) = %0.3f" % max_p2e, end="")
print(" from mtask %d (predict %d," %
(max_mtask, Mtasks[max_mtask]['predict_cost']),
end="")
print(" elapsed %d)" % Mtasks[max_mtask]['elapsed'])
print("\nMTask statistics:")
print(" min log(p2e) = %0.3f" % min_p2e, end="")
print(" from mtask %d (predict %d," %
(min_mtask, Mtasks[min_mtask]['predict_cost']),
end="")
print(" elapsed %d)" % Mtasks[min_mtask]['elapsed'])
print(" max log(p2e) = %0.3f" % max_p2e, end="")
print(" from mtask %d (predict %d," %
(max_mtask, Mtasks[max_mtask]['predict_cost']),
end="")
print(" elapsed %d)" % Mtasks[max_mtask]['elapsed'])
stddev = statistics.pstdev(p2e_ratios)
mean = statistics.mean(p2e_ratios)
print(" mean = %0.3f" % mean)
print(" stddev = %0.3f" % stddev)
print(" e ^ stddev = %0.3f" % math.exp(stddev))
stddev = statistics.pstdev(p2e_ratios)
mean = statistics.mean(p2e_ratios)
print(" mean = %0.3f" % mean)
print(" stddev = %0.3f" % stddev)
print(" e ^ stddev = %0.3f" % math.exp(stddev))
report_cpus()
@ -375,44 +394,45 @@ def write_vcd(filename):
vcd['values'][eval_start][elcode] = n
vcd['values'][eval_end][elcode] = None
# Predicted graph
for eval_start in EvalLoops:
eval_end = EvalLoops[eval_start]['end']
# Compute scale so predicted graph is of same width as eval
measured_scaling = (eval_end -
eval_start) / Global['predict_last_end']
# Predict mtasks that fill the time the eval occupied
for mtask in Mtasks:
thread = Mtasks[mtask]['thread']
pred_scaled_start = eval_start + int(
Mtasks[mtask]['predict_start'] * measured_scaling)
pred_scaled_end = eval_start + int(
(Mtasks[mtask]['predict_start'] +
Mtasks[mtask]['predict_cost']) * measured_scaling)
if pred_scaled_start == pred_scaled_end:
continue
if Mtasks:
# Predicted graph
for eval_start in EvalLoops:
eval_end = EvalLoops[eval_start]['end']
# Compute scale so predicted graph is of same width as eval
measured_scaling = (eval_end -
eval_start) / Global['predict_last_end']
# Predict mtasks that fill the time the eval occupied
for mtask in Mtasks:
thread = Mtasks[mtask]['thread']
pred_scaled_start = eval_start + int(
Mtasks[mtask]['predict_start'] * measured_scaling)
pred_scaled_end = eval_start + int(
(Mtasks[mtask]['predict_start'] +
Mtasks[mtask]['predict_cost']) * measured_scaling)
if pred_scaled_start == pred_scaled_end:
continue
sig = "predicted_thread%d_mtask" % thread
if sig not in vcd['sigs']['predicted_threads']:
vcd['sigs']['predicted_threads'][sig] = code
code += 1
mcode = vcd['sigs']['predicted_threads'][sig]
sig = "predicted_thread%d_mtask" % thread
if sig not in vcd['sigs']['predicted_threads']:
vcd['sigs']['predicted_threads'][sig] = code
code += 1
mcode = vcd['sigs']['predicted_threads'][sig]
vcd['values'][pred_scaled_start][mcode] = mtask
vcd['values'][pred_scaled_end][mcode] = None
vcd['values'][pred_scaled_start][mcode] = mtask
vcd['values'][pred_scaled_end][mcode] = None
parallelism['predicted'][pred_scaled_start] += 1
parallelism['predicted'][pred_scaled_end] -= 1
parallelism['predicted'][pred_scaled_start] += 1
parallelism['predicted'][pred_scaled_end] -= 1
# Parallelism graph
for measpred in ('measured', 'predicted'):
vcd['sigs']['Stats']["%s_parallelism" % measpred] = code
pcode = code
code += 1
value = 0
for time in sorted(parallelism[measpred].keys()):
value += parallelism[measpred][time]
vcd['values'][time][pcode] = value
# Parallelism graph
for measpred in ('measured', 'predicted'):
vcd['sigs']['Stats']["%s_parallelism" % measpred] = code
pcode = code
code += 1
value = 0
for time in sorted(parallelism[measpred].keys()):
value += parallelism[measpred][time]
vcd['values'][time][pcode] = value
# Create output file
fh.write("$version Generated by verilator_gantt $end\n")
@ -476,10 +496,10 @@ parser.add_argument('--no-vcd',
action='store_true')
parser.add_argument('--vcd',
help='filename for vcd output',
default='profile_threads.vcd')
default='profile_exec.vcd')
parser.add_argument('filename',
help='input profile_threads.dat filename to process',
default='profile_threads.dat')
help='input profile_exec.dat filename to process',
default='profile_exec.dat')
Args = parser.parse_args()


@ -19,3 +19,14 @@ Verilated_heavy.h
Option `--cdc`
The experimental `--cdc` option is believed to be generally unused and is
planned for removal no sooner than January 2023.
Option `--prof-threads`
The `--prof-threads` option has been superseded by the `--prof-exec` and
`--prof-pgo` options and is planned for removal no sooner than April 2023.
Verilated model options `+verilator+prof+threads+*`
The `+verilator+prof+threads+start`, `+verilator+prof+threads+window` and
`+verilator+prof+threads+file` options have been superseded by the
`+verilator+prof+exec+start`, `+verilator+prof+exec+window` and
`+verilator+prof+exec+file` options respectively and are planned for removal
no sooner than April 2023.


@ -38,33 +38,45 @@ Summary:
Display help and exit.
.. option:: +verilator+prof+threads+file+<filename>
.. option:: +verilator+prof+exec+file+<filename>
When a model was Verilated using :vlopt:`--prof-threads`, sets the
When a model was Verilated using :vlopt:`--prof-exec`, sets the
simulation runtime filename to dump to. Defaults to
:file:`profile_threads.dat`.
:file:`profile_exec.dat`.
.. option:: +verilator+prof+threads+start+<value>
.. option:: +verilator+prof+exec+start+<value>
When a model was Verilated using :vlopt:`--prof-threads`, the simulation
When a model was Verilated using :vlopt:`--prof-exec`, the simulation
runtime will wait until $time is at this value (expressed in units of
the time precision), then start the profiling warmup, then
capturing. Generally this should be set to some time that is well within
the normal operation of the simulation, i.e. outside of reset. If 0, the
dump is disabled. Defaults to 1.
.. option:: +verilator+prof+threads+window+<value>
.. option:: +verilator+prof+exec+window+<value>
When a model was Verilated using :vlopt:`--prof-threads`, after $time
reaches :vlopt:`+verilator+prof+threads+start+\<value\>`, Verilator will
When a model was Verilated using :vlopt:`--prof-exec`, after $time
reaches :vlopt:`+verilator+prof+exec+start+\<value\>`, Verilator will
warm up the profiling for this number of eval() calls, then will capture
the profiling of this number of eval() calls. Defaults to 2, which
makes sense for a single-clock-domain module where it's typical to want
to capture one posedge eval() and one negedge eval().
.. option:: +verilator+prof+threads+file+<filename>
Deprecated. Alias for :vlopt:`+verilator+prof+exec+file+\<filename\>`
.. option:: +verilator+prof+threads+start+<value>
Deprecated. Alias for :vlopt:`+verilator+prof+exec+start+\<value\>`
.. option:: +verilator+prof+threads+window+<value>
Deprecated. Alias for :vlopt:`+verilator+prof+exec+window+\<value\>`
.. option:: +verilator+prof+vlt+file+<filename>
When a model was Verilated using :vlopt:`--prof-threads`, sets the
When a model was Verilated using :vlopt:`--prof-pgo`, sets the
profile-guided optimization data runtime filename to dump to. Defaults
to :file:`profile.vlt`.


@ -845,10 +845,19 @@ Summary:
Using :vlopt:`--prof-cfuncs` also enables :vlopt:`--prof-c`.
.. option:: --prof-exec
Enable collection of an execution trace that can be converted into a gantt
chart with verilator_gantt. See :ref:`Execution Profiling`.
.. option:: --prof-pgo
Enable collection of profiling data for profile-guided verilation. Currently
this is only useful with :vlopt:`--threads`. See :ref:`Thread PGO`.
.. option:: --prof-threads
Enable gantt chart data collection for threaded builds. See :ref:`Thread
Profiling` and :ref:`Thread PGO`.
Deprecated. Same as --prof-exec and --prof-pgo together.
.. option:: --protect-key <key>


@ -72,7 +72,7 @@ verilator_gantt Arguments
.. option:: <filename>
The filename to read data from, defaults to "profile_threads.dat".
The filename to read data from, defaults to "profile_exec.dat".
.. option:: --help


@ -155,13 +155,13 @@ The Verilated executable may produce the following:
* - gmon.out
- GCC/clang code profiler output, often fed into :command:`verilator_profcfunc`
* - profile.vlt
- -profile data file for :ref:`Thread PGO`
* - profile_threads.dat
- -profile-threads data file for :command:`verilator_gantt`
- --prof-pgo data file for :ref:`Thread PGO`
* - profile_exec.dat
- --prof-exec data file for :command:`verilator_gantt`
Verilator_gantt may produce the following:
.. list-table::
* - profile_threads.vcd
* - profile_exec.vcd
- Gantt report waveform output


@ -279,26 +279,25 @@ To use profiling:
is being spent.
.. _Thread Profiling:
.. _Execution Profiling:
Thread Profiling
================
Execution Profiling
===================
When using multithreaded mode (:vlopt:`--threads`), it is useful to see
statistics and visualize how well the multiple CPUs are being utilized.
For performance optimization, it is useful to see statistics and visualize how
execution time is distributed in a verilated model.
With the :vlopt:`--prof-threads` option, Verilator will:
With the :vlopt:`--prof-exec` option, Verilator will:
* Add code to the Verilated model to record the start and end time of each
macro-task across a number of calls to eval. (What is a macro-task? See
the Verilator internals document (:file:`docs/internals.rst` in the
distribution.)
* Add code to the Verilated model to record execution flow.
* Add code to save profiling data in non-human-friendly form to the file
specified with :vlopt:`+verilator+prof+threads+file+\<filename\>`.
specified with :vlopt:`+verilator+prof+exec+file+\<filename\>`.
* Add code to save profiling data for thread profile-guided
optimization. See :ref:`Thread PGO`.
* In multi-threaded models, add code to record the start and end time of each
macro-task across a number of calls to eval. (What is a macro-task? See the
Verilator internals document (:file:`docs/internals.rst` in the
distribution.)
The :command:`verilator_gantt` program may then be run to transform the
saved profiling file into a nicer visual format and produce some related
@ -406,8 +405,8 @@ others as they prove beneficial.
Thread Profile-Guided Optimization
----------------------------------
Verilator supports thread profile-guided optimization (Thread PGO) to
improve multithreaded performance.
Verilator supports profile-guided optimization (verilation) of multi-threaded
models (Thread PGO) to improve performance.
When using multithreading, Verilator computes how long macro tasks take and
tries to balance those across threads. (What is a macro-task? See the
@ -417,13 +416,14 @@ balanced, leading to decreased performance. Thread PGO allows collecting
profiling data to replace the estimates and better optimize these
decisions.
To use Thread PGO, Verilate the model with the :vlopt:`--prof-threads`
option.
To use Thread PGO, Verilate the model with the :vlopt:`--prof-pgo` option. This
will add code to the Verilated model to save profiling data for profile-guided
optimization.
Run the model executable. When the executable exits, it will create a
profile.vlt file.
Rerun Verilator, optionally omitting the :vlopt:`--prof-threads` option,
Rerun Verilator, optionally omitting the :vlopt:`--prof-pgo` option,
and adding the profile.vlt generated earlier to the command line.
Note there is no Verilator equivalent to GCC's -fprofile-use. Verilator's


@ -265,7 +265,7 @@ This will limit memory to socket 0, and threads to cores 0, 1, 2, 3,
(presumably on socket 0) optimizing performance. Of course this must be
adjusted if you want another simulator using e.g. socket 1, or if you
Verilated with a different number of threads. To see what CPUs are
actually used, use :vlopt:`--prof-threads`.
actually used, use :vlopt:`--prof-exec`.
Multithreaded Verilog and Library Support


@ -301,7 +301,7 @@ prerequisites on other threads have finished.
The synchronization cost is cheap if the prereqs are done. If they're not,
fragmentation (idle CPU cores waiting) is possible. This is the major
source of overhead in this approach. The ``--prof-threads`` switch and the
source of overhead in this approach. The ``--prof-exec`` switch and the
``verilator_gantt`` script can visualize the time lost to such
fragmentation.


@ -2280,7 +2280,7 @@ VerilatedContext::VerilatedContext()
: m_impdatap{new VerilatedContextImpData} {
Verilated::lastContextp(this);
Verilated::threadContextp(this);
m_ns.m_profThreadsFilename = "profile_threads.dat";
m_ns.m_profExecFilename = "profile_exec.dat";
m_ns.m_profVltFilename = "profile.vlt";
m_fdps.resize(31);
std::fill(m_fdps.begin(), m_fdps.end(), static_cast<FILE*>(nullptr));
@ -2348,21 +2348,21 @@ void VerilatedContext::gotFinish(bool flag) VL_MT_SAFE {
const VerilatedLockGuard lock{m_mutex};
m_s.m_gotFinish = flag;
}
void VerilatedContext::profThreadsStart(vluint64_t flag) VL_MT_SAFE {
void VerilatedContext::profExecStart(vluint64_t flag) VL_MT_SAFE {
const VerilatedLockGuard lock{m_mutex};
m_ns.m_profThreadsStart = flag;
m_ns.m_profExecStart = flag;
}
void VerilatedContext::profThreadsWindow(vluint64_t flag) VL_MT_SAFE {
void VerilatedContext::profExecWindow(vluint64_t flag) VL_MT_SAFE {
const VerilatedLockGuard lock{m_mutex};
m_ns.m_profThreadsWindow = flag;
m_ns.m_profExecWindow = flag;
}
void VerilatedContext::profThreadsFilename(const std::string& flag) VL_MT_SAFE {
void VerilatedContext::profExecFilename(const std::string& flag) VL_MT_SAFE {
const VerilatedLockGuard lock{m_mutex};
m_ns.m_profThreadsFilename = flag;
m_ns.m_profExecFilename = flag;
}
std::string VerilatedContext::profThreadsFilename() const VL_MT_SAFE {
std::string VerilatedContext::profExecFilename() const VL_MT_SAFE {
const VerilatedLockGuard lock{m_mutex};
return m_ns.m_profThreadsFilename;
return m_ns.m_profExecFilename;
}
void VerilatedContext::profVltFilename(const std::string& flag) VL_MT_SAFE {
const VerilatedLockGuard lock{m_mutex};
@ -2524,12 +2524,15 @@ void VerilatedContextImp::commandArgVl(const std::string& arg) {
"Exiting due to command line argument (not an error)");
} else if (arg == "+verilator+noassert") {
assertOn(false);
} else if (commandArgVlUint64(arg, "+verilator+prof+threads+start+", u64)) {
profThreadsStart(u64);
} else if (commandArgVlUint64(arg, "+verilator+prof+threads+window+", u64, 1)) {
profThreadsWindow(u64);
} else if (commandArgVlString(arg, "+verilator+prof+threads+file+", str)) {
profThreadsFilename(str);
} else if (commandArgVlUint64(arg, "+verilator+prof+exec+start+", u64)
|| commandArgVlUint64(arg, "+verilator+prof+threads+start+", u64)) {
profExecStart(u64);
} else if (commandArgVlUint64(arg, "+verilator+prof+exec+window+", u64, 1)
|| commandArgVlUint64(arg, "+verilator+prof+threads+window+", u64, 1)) {
profExecWindow(u64);
} else if (commandArgVlString(arg, "+verilator+prof+exec+file+", str)
|| commandArgVlString(arg, "+verilator+prof+threads+file+", str)) {
profExecFilename(str);
} else if (commandArgVlString(arg, "+verilator+prof+vlt+file+", str)) {
profVltFilename(str);
} else if (commandArgVlUint64(arg, "+verilator+rand+reset+", u64, 0, 2)) {


@ -344,10 +344,10 @@ protected:
struct NonSerialized { // Non-serialized information
// These are reloaded from on command-line settings, so do not need to persist
// Fast path
vluint64_t m_profThreadsStart = 1; // +prof+threads starting time
vluint32_t m_profThreadsWindow = 2; // +prof+threads window size
vluint64_t m_profExecStart = 1; // +prof+exec+start time
vluint32_t m_profExecWindow = 2; // +prof+exec+window size
// Slow path
std::string m_profThreadsFilename; // +prof+threads filename
std::string m_profExecFilename; // +prof+exec+file filename
std::string m_profVltFilename; // +prof+vlt filename
} m_ns;
@ -518,13 +518,13 @@ public: // But for internal use only
std::string dumpfile() const VL_MT_SAFE_EXCLUDES(m_timeDumpMutex);
std::string dumpfileCheck() const VL_MT_SAFE_EXCLUDES(m_timeDumpMutex);
// Internal: --prof-threads related settings
void profThreadsStart(vluint64_t flag) VL_MT_SAFE;
vluint64_t profThreadsStart() const VL_MT_SAFE { return m_ns.m_profThreadsStart; }
void profThreadsWindow(vluint64_t flag) VL_MT_SAFE;
vluint32_t profThreadsWindow() const VL_MT_SAFE { return m_ns.m_profThreadsWindow; }
void profThreadsFilename(const std::string& flag) VL_MT_SAFE;
std::string profThreadsFilename() const VL_MT_SAFE;
// Internal: --prof-exec related settings
void profExecStart(vluint64_t flag) VL_MT_SAFE;
vluint64_t profExecStart() const VL_MT_SAFE { return m_ns.m_profExecStart; }
void profExecWindow(vluint64_t flag) VL_MT_SAFE;
vluint32_t profExecWindow() const VL_MT_SAFE { return m_ns.m_profExecWindow; }
void profExecFilename(const std::string& flag) VL_MT_SAFE;
std::string profExecFilename() const VL_MT_SAFE;
void profVltFilename(const std::string& flag) VL_MT_SAFE;
std::string profVltFilename() const VL_MT_SAFE;


@ -112,15 +112,6 @@ extern WDataOutP VL_RAND_RESET_W(int obits, WDataOutP outwp);
/// Zero reset a signal (slow - else use VL_ZERO_W)
extern WDataOutP VL_ZERO_RESET_W(int obits, WDataOutP outwp);
#if VL_THREADED
/// Return high-precision counter for profiling, or 0x0 if not available
inline QData VL_RDTSC_Q() {
vluint64_t val;
VL_RDTSC(val);
return val;
}
#endif
extern void VL_PRINTTIMESCALE(const char* namep, const char* timeunitp,
const VerilatedContext* contextp) VL_MT_SAFE;


@ -0,0 +1,191 @@
// -*- mode: C++; c-file-style: "cc-mode" -*-
//=============================================================================
//
// Code available from: https://verilator.org
//
// Copyright 2012-2022 by Wilson Snyder. This program is free software; you can
// redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License
// Version 2.0.
// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
//
//=============================================================================
///
/// \file
/// \brief Verilated run-time profiling implementation code
///
//=============================================================================
#include "verilatedos.h"
#include "verilated_profiler.h"
#if VL_THREADED
#include "verilated_threads.h"
#endif
#include <fstream>
#include <string>
//=============================================================================
// Globals
// Internal note: Globals may multi-construct, see verilated.cpp top.
VL_THREAD_LOCAL VlExecutionProfiler::ExecutionTrace VlExecutionProfiler::t_trace;
constexpr const char* const VlExecutionRecord::s_ascii[];
//=============================================================================
// VlPgoProfiler implementation
vluint16_t VlExecutionRecord::getcpu() {
#if defined(__linux)
return sched_getcpu(); // TODO: this is a system call. Not exactly cheap.
#elif defined(__APPLE__) && !defined(__arm64__)
vluint32_t info[4];
__cpuid_count(1, 0, info[0], info[1], info[2], info[3]);
// info[1] is EBX, bits 24-31 are APIC ID
if ((info[3] & (1 << 9)) == 0) {
return -1; // no APIC on chip
} else {
return (unsigned)info[1] >> 24;
}
#elif defined(_WIN32)
return GetCurrentProcessorNumber();
#else
return 0;
#endif
}
//=============================================================================
// VlExecutionProfiler implementation
template <size_t N> size_t roundUptoMultipleOf(size_t value) {
static_assert((N & (N - 1)) == 0, "'N' must be a power of 2");
size_t mask = N - 1;
return (value + mask) & ~mask;
}
VlExecutionProfiler::VlExecutionProfiler() {
// Setup profiling on main thread
setupThread(0);
}
void VlExecutionProfiler::configure(const VerilatedContext& context) {
if (VL_UNLIKELY(m_enabled)) {
--m_windowCount;
if (VL_UNLIKELY(m_windowCount == context.profExecWindow())) {
VL_DEBUG_IF(VL_DBG_MSGF("+ profile start collection\n"););
clear(); // Clear the profile after the cache warm-up cycles.
m_tickBegin = VL_CPU_TICK();
} else if (VL_UNLIKELY(m_windowCount == 0)) {
const vluint64_t tickEnd = VL_CPU_TICK();
VL_DEBUG_IF(VL_DBG_MSGF("+ profile end\n"););
const std::string& fileName = context.profExecFilename();
dump(fileName.c_str(), tickEnd);
m_enabled = false;
}
return;
}
const vluint64_t startReq = context.profExecStart() + 1; // + 1, so we can start at time 0
if (VL_UNLIKELY(m_lastStartReq < startReq && VL_TIME_Q() >= context.profExecStart())) {
VL_DEBUG_IF(VL_DBG_MSGF("+ profile start warmup\n"););
VL_DEBUG_IF(assert(m_windowCount == 0););
m_enabled = true;
m_windowCount = context.profExecWindow() * 2;
m_lastStartReq = startReq;
}
}
void VlExecutionProfiler::setupThread(uint32_t threadId) {
// Reserve some space in the thread-local profiling buffer, in order to try to avoid malloc
// while profiling.
t_trace.reserve(RESERVED_TRACE_CAPACITY);
// Register thread-local buffer in list of all buffers
{
const VerilatedLockGuard lock{m_mutex};
bool exists = !m_traceps.emplace(threadId, &t_trace).second;
assert(!exists);
}
}
void VlExecutionProfiler::clear() VL_MT_SAFE_EXCLUDES(m_mutex) {
const VerilatedLockGuard lock{m_mutex};
for (const auto& pair : m_traceps) {
ExecutionTrace* const tracep = pair.second;
const size_t reserve = roundUptoMultipleOf<RESERVED_TRACE_CAPACITY>(tracep->size());
tracep->clear();
tracep->reserve(reserve);
}
}
void VlExecutionProfiler::dump(const char* filenamep, vluint64_t tickEnd)
VL_MT_SAFE_EXCLUDES(m_mutex) {
const VerilatedLockGuard lock{m_mutex};
VL_DEBUG_IF(VL_DBG_MSGF("+prof+exec writing to '%s'\n", filenamep););
FILE* const fp = std::fopen(filenamep, "w");
if (VL_UNLIKELY(!fp)) { VL_FATAL_MT(filenamep, 0, "", "+prof+exec+file file not writable"); }
// TODO Perhaps merge with verilated_coverage output format, so can
// have a common merging and reporting tool, etc.
fprintf(fp, "VLPROFVERSION 2.0 # Verilator execution profile version 2.0\n");
fprintf(fp, "VLPROF arg +verilator+prof+exec+start+%" PRIu64 "\n",
Verilated::threadContextp()->profExecStart());
fprintf(fp, "VLPROF arg +verilator+prof+exec+window+%u\n",
Verilated::threadContextp()->profExecWindow());
const unsigned threads = static_cast<unsigned>(m_traceps.size());
fprintf(fp, "VLPROF stat threads %u\n", threads);
#ifdef VL_THREADED
fprintf(fp, "VLPROF stat yields %" PRIu64 "\n", VlMTaskVertex::yields());
#endif
// Copy /proc/cpuinfo into this output so verilator_gantt can be run on
// a different machine
{
const std::unique_ptr<std::ifstream> ifp{new std::ifstream("/proc/cpuinfo")};
if (!ifp->fail()) {
std::string line;
while (std::getline(*ifp, line)) { fprintf(fp, "VLPROFPROC %s\n", line.c_str()); }
}
}
for (const auto& pair : m_traceps) {
const uint32_t threadId = pair.first;
ExecutionTrace* const tracep = pair.second;
fprintf(fp, "VLPROFTHREAD %" PRIu32 "\n", threadId);
for (const VlExecutionRecord& er : *tracep) {
const char* const name = VlExecutionRecord::s_ascii[static_cast<uint8_t>(er.m_type)];
const vluint64_t time = er.m_tick - m_tickBegin;
fprintf(fp, "VLPROFEXEC %s %" PRIu64, name, time);
switch (er.m_type) {
case VlExecutionRecord::Type::EVAL_BEGIN:
case VlExecutionRecord::Type::EVAL_END:
case VlExecutionRecord::Type::EVAL_LOOP_BEGIN:
case VlExecutionRecord::Type::EVAL_LOOP_END:
// No payload
fprintf(fp, "\n");
break;
case VlExecutionRecord::Type::MTASK_BEGIN: {
const auto& payload = er.m_payload.mtaskBegin;
fprintf(fp, " id %u predictStart %u cpu %u\n", payload.m_id,
payload.m_predictStart, payload.m_cpu);
break;
}
case VlExecutionRecord::Type::MTASK_END: {
const auto& payload = er.m_payload.mtaskEnd;
fprintf(fp, " id %u predictCost %u\n", payload.m_id, payload.m_predictCost);
break;
}
default: abort(); // LCOV_EXCL_LINE
}
}
}
fprintf(fp, "VLPROF stat ticks %" PRIu64 "\n", tickEnd - m_tickBegin);
std::fclose(fp);
}
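
For reference, a dump produced by the code above looks roughly like the
following (the VLPROFPROC copy of /proc/cpuinfo is omitted, and all tick, id,
and prediction values are purely illustrative):

VLPROFVERSION 2.0 # Verilator execution profile version 2.0
VLPROF arg +verilator+prof+exec+start+1
VLPROF arg +verilator+prof+exec+window+2
VLPROF stat threads 2
VLPROF stat yields 0
VLPROFTHREAD 0
VLPROFEXEC EVAL_BEGIN 120
VLPROFEXEC EVAL_LOOP_BEGIN 150
VLPROFEXEC MTASK_BEGIN 180 id 7 predictStart 120 cpu 3
VLPROFEXEC MTASK_END 260 id 7 predictCost 95
VLPROFEXEC EVAL_LOOP_END 380
VLPROFEXEC EVAL_END 400
VLPROFTHREAD 1
VLPROFEXEC MTASK_BEGIN 190 id 8 predictStart 130 cpu 5
VLPROFEXEC MTASK_END 300 id 8 predictCost 110
VLPROF stat ticks 123456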


@ -12,7 +12,7 @@
//=============================================================================
///
/// \file
/// \brief Verilated general profiling header
/// \brief Verilated run-time profiling header
///
/// This file is not part of the Verilated public-facing API.
/// It is only for internal use by Verilated library routines.
@ -23,58 +23,204 @@
#define VERILATOR_VERILATED_PROFILER_H_
#include "verilatedos.h"
#include "verilated.h" // for VerilatedMutex and clang annotations
#include <deque>
#ifndef VL_PROFILER
#error "verilated_profiler.h/cpp expects VL_PROFILER (from --prof-{exec, pgo}"
#endif
#include "verilated.h"
#include <array>
#include <atomic>
#include <cassert>
#include <string>
#include <type_traits>
#include <vector>
class VlExecutionProfiler;
//=============================================================================
// Macros to simplify generated code
#define VL_EXEC_TRACE_ADD_RECORD(vlSymsp) \
if (VL_UNLIKELY((vlSymsp)->__Vm_executionProfiler.enabled())) \
(vlSymsp)->__Vm_executionProfiler.addRecord()
//=============================================================================
// Return high-precision counter for profiling, or 0x0 if not available
VL_ATTR_ALWINLINE
inline QData VL_CPU_TICK() {
vluint64_t val;
VL_GET_CPU_TICK(val);
return val;
}
//=============================================================================
// Private class used by VlExecutionProfiler
#define _VL_FOREACH_APPLY(macro, arg) macro(arg, #arg)
// clang-format off
#define FOREACH_VlExecutionRecord_TYPE(macro) \
_VL_FOREACH_APPLY(macro, EVAL_BEGIN) \
_VL_FOREACH_APPLY(macro, EVAL_END) \
_VL_FOREACH_APPLY(macro, EVAL_LOOP_BEGIN) \
_VL_FOREACH_APPLY(macro, EVAL_LOOP_END) \
_VL_FOREACH_APPLY(macro, MTASK_BEGIN) \
_VL_FOREACH_APPLY(macro, MTASK_END)
// clang-format on
class VlExecutionRecord final {
friend class VlExecutionProfiler;
// TYPES
enum class Type : uint8_t {
#define VL_FOREACH_MACRO(id, name) id,
FOREACH_VlExecutionRecord_TYPE(VL_FOREACH_MACRO)
#undef VL_FOREACH_MACRO
};
static constexpr const char* const s_ascii[] = {
#define VL_FOREACH_MACRO(id, name) name,
FOREACH_VlExecutionRecord_TYPE(VL_FOREACH_MACRO)
#undef VL_FOREACH_MACRO
};
union Payload {
struct {
vluint32_t m_id; // MTask id
vluint32_t m_predictStart; // Time scheduler predicted would start
vluint32_t m_cpu; // Executing CPU id
} mtaskBegin;
struct {
vluint32_t m_id; // MTask id
vluint32_t m_predictCost; // How long scheduler predicted would take
} mtaskEnd;
};
// STATE
// Layout below allows efficient packing.
const vluint64_t m_tick = VL_CPU_TICK(); // Tick at construction
Payload m_payload; // The record payload
Type m_type; // The record type
static_assert(alignof(vluint64_t) >= alignof(Payload), "Padding not allowed");
static_assert(alignof(Payload) >= alignof(Type), "Padding not allowed");
static vluint16_t getcpu(); // Return currently executing CPU id
// Profile record, private class used only by this header
class VerilatedProfilerRec final {
const std::string m_name; // Hashed name of mtask/etc
const size_t m_counterNumber = 0; // Which counter has data
public:
// CONSTRUCTOR
VlExecutionRecord() = default;
// METHODS
VerilatedProfilerRec(size_t counterNumber, const std::string& name)
: m_name{name}
, m_counterNumber{counterNumber} {}
VerilatedProfilerRec() = default;
size_t counterNumber() const { return m_counterNumber; }
std::string name() const { return m_name; }
void evalBegin() { m_type = Type::EVAL_BEGIN; }
void evalEnd() { m_type = Type::EVAL_END; }
void evalLoopBegin() { m_type = Type::EVAL_LOOP_BEGIN; }
void evalLoopEnd() { m_type = Type::EVAL_LOOP_END; }
void mtaskBegin(vluint32_t id, vluint32_t predictStart) {
m_payload.mtaskBegin.m_id = id;
m_payload.mtaskBegin.m_predictStart = predictStart;
m_payload.mtaskBegin.m_cpu = getcpu();
m_type = Type::MTASK_BEGIN;
}
void mtaskEnd(vluint32_t id, vluint32_t predictCost) {
m_payload.mtaskEnd.m_id = id;
m_payload.mtaskEnd.m_predictCost = predictCost;
m_type = Type::MTASK_END;
}
};
// Create some number of bucketed profilers
template <std::size_t T_Entries> class VerilatedProfiler final {
// Counters are stored packed, all together, versus in VerilatedProfilerRec to
// reduce cache effects
std::array<vluint64_t, T_Entries> m_counters{}; // Time spent on this record
std::deque<VerilatedProfilerRec> m_records; // Record information
static_assert(std::is_trivially_destructible<VlExecutionRecord>::value,
"VlExecutionRecord should be trivially destructible for fast buffer clearing");
//=============================================================================
// VlExecutionProfiler is for collecting profiling data about model execution
class VlExecutionProfiler final {
// CONSTANTS
// In order to try to avoid dynamic memory allocations during the actual profiling phase,
// trace buffers are pre-allocated to be able to hold [a multiple] of this many records.
static constexpr size_t RESERVED_TRACE_CAPACITY = 4096;
// TYPES
// Execution traces are recorded into thread local vectors. We can append records of profiling
// events to this vector with very low overhead, and then dump them out later. This prevents
// the overhead of printf/malloc/IO from corrupting the profiling data. It's super cheap to
// append a VlProfileRec struct on the end of a pre-allocated vector; this is the only cost we
// pay in real-time during a profiling cycle. Internal note: Globals may multi-construct, see
// verilated.cpp top.
using ExecutionTrace = std::vector<VlExecutionRecord>;
// STATE
static VL_THREAD_LOCAL ExecutionTrace t_trace; // thread-local trace buffers
VerilatedMutex m_mutex;
// Map from thread id to &t_trace of given thread
std::map<uint32_t, ExecutionTrace*> m_traceps VL_GUARDED_BY(m_mutex);
bool m_enabled = false; // Is profiling currently enabled
vluint64_t m_tickBegin = 0; // Sample time (rdtsc() on x86) at beginning of collection
vluint64_t m_lastStartReq = 0; // Last requested profiling start (in simulation time)
vluint32_t m_windowCount = 0; // Track our position in the cache warmup and profile window
public:
// CONSTRUCTOR
VlExecutionProfiler();
// METHODS
// Is profiling enabled
inline bool enabled() const { return m_enabled; }
// Append a trace record to the trace buffer of the current thread
inline VlExecutionRecord& addRecord() {
t_trace.emplace_back();
return t_trace.back();
}
// Configure profiler (called in beginning of 'eval')
void configure(const VerilatedContext&);
// Setup profiling on a particular thread;
void setupThread(uint32_t threadId);
// Clear all profiling data
void clear() VL_MT_SAFE_EXCLUDES(m_mutex);
// Write profiling data into file
void dump(const char* filenamep, vluint64_t tickEnd) VL_MT_SAFE_EXCLUDES(m_mutex);
};
//=============================================================================
// VlPgoProfiler is for collecting profiling data for PGO
template <std::size_t T_Entries> class VlPgoProfiler final {
// TYPES
struct Record final {
const std::string m_name; // Hashed name of mtask/etc
const size_t m_counterNumber = 0; // Which counter has data
};
// Counters are stored packed, all together to reduce cache effects
std::array<vluint64_t, T_Entries> m_counters; // Time spent on this record
std::vector<Record> m_records; // Record information
public:
// METHODS
VerilatedProfiler() = default;
~VerilatedProfiler() = default;
VlPgoProfiler() = default;
~VlPgoProfiler() = default;
void write(const char* modelp, const std::string& filename) VL_MT_SAFE;
void addCounter(size_t counter, const std::string& name) {
VL_DEBUG_IF(assert(counter < T_Entries););
m_records.emplace_back(VerilatedProfilerRec{counter, name});
m_records.emplace_back(Record{name, counter});
}
void startCounter(size_t counter) {
vluint64_t val;
VL_RDTSC(val);
// -= so when we add end time in stopCounter, we already subtracted
// out, without needing to hold another temporary
m_counters[counter] -= val;
}
void stopCounter(size_t counter) {
vluint64_t val;
VL_RDTSC(val);
m_counters[counter] += val;
// -= so when we add end time in stopCounter, the net effect is adding the difference,
// without needing to hold onto a temporary
m_counters[counter] -= VL_CPU_TICK();
}
void stopCounter(size_t counter) { m_counters[counter] += VL_CPU_TICK(); }
};
template <std::size_t T_Entries>
void VerilatedProfiler<T_Entries>::write(const char* modelp,
const std::string& filename) VL_MT_SAFE {
void VlPgoProfiler<T_Entries>::write(const char* modelp, const std::string& filename) VL_MT_SAFE {
static VerilatedMutex s_mutex;
const VerilatedLockGuard lock{s_mutex};
@ -88,14 +234,9 @@ void VerilatedProfiler<T_Entries>::write(const char* modelp,
VL_DEBUG_IF(VL_DBG_MSGF("+prof+vlt+file writing to '%s'\n", filename.c_str()););
FILE* fp = nullptr;
if (!s_firstCall) fp = std::fopen(filename.c_str(), "a");
if (VL_UNLIKELY(!fp))
fp = std::fopen(filename.c_str(), "w"); // firstCall, or doesn't exist yet
FILE* const fp = std::fopen(filename.c_str(), s_firstCall ? "w" : "a");
if (VL_UNLIKELY(!fp)) {
VL_FATAL_MT(filename.c_str(), 0, "", "+prof+vlt+file file not writable");
// cppcheck-suppress resourceLeak // bug, doesn't realize fp is nullptr
return; // LCOV_EXCL_LINE
}
s_firstCall = false;
@ -104,10 +245,9 @@ void VerilatedProfiler<T_Entries>::write(const char* modelp,
fprintf(fp, "// Verilated model profile-guided optimization data dump file\n");
fprintf(fp, "`verilator_config\n");
for (const auto& it : m_records) {
const std::string& name = it.name();
for (const Record& rec : m_records) {
fprintf(fp, "profile_data -model \"%s\" -mtask \"%s\" -cost 64'd%" PRIu64 "\n", modelp,
name.c_str(), m_counters[it.counterNumber()]);
rec.m_name.c_str(), m_counters[rec.m_counterNumber]);
}
std::fclose(fp);


@ -24,8 +24,11 @@
#include "verilatedos.h"
#include "verilated_threads.h"
#ifdef VL_PROFILER
#include "verilated_profiler.h"
#endif
#include <cstdio>
#include <fstream>
#include <memory>
#include <string>
@ -36,8 +39,6 @@
std::atomic<vluint64_t> VlMTaskVertex::s_yields;
VL_THREAD_LOCAL VlThreadPool::ProfileTrace* VlThreadPool::t_profilep = nullptr;
//=============================================================================
// VlMTaskVertex
@ -50,12 +51,11 @@ VlMTaskVertex::VlMTaskVertex(vluint32_t upstreamDepCount)
//=============================================================================
// VlWorkerThread
VlWorkerThread::VlWorkerThread(VlThreadPool* poolp, VerilatedContext* contextp, bool profiling)
VlWorkerThread::VlWorkerThread(uint32_t threadId, VerilatedContext* contextp,
VlExecutionProfiler* profilerp)
: m_ready_size{0}
, m_poolp{poolp}
, m_profiling{profiling} // Must init this last -- after setting up fields that it might read:
, m_exiting{false}
, m_cthread{startWorker, this}
, m_cthread{startWorker, this, threadId, profilerp}
, m_contextp{contextp} {}
VlWorkerThread::~VlWorkerThread() {
@ -66,8 +66,6 @@ VlWorkerThread::~VlWorkerThread() {
}
void VlWorkerThread::workerLoop() {
if (VL_UNLIKELY(m_profiling)) m_poolp->setupProfilingClientThread();
ExecRec work;
work.m_fnp = nullptr;
@ -82,143 +80,42 @@ void VlWorkerThread::workerLoop() {
work.m_fnp = nullptr;
}
}
if (VL_UNLIKELY(m_profiling)) m_poolp->tearDownProfilingClientThread();
}
void VlWorkerThread::startWorker(VlWorkerThread* workerp) {
void VlWorkerThread::startWorker(VlWorkerThread* workerp, uint32_t threadId,
VlExecutionProfiler* profilerp) {
Verilated::threadContextp(workerp->m_contextp);
#ifdef VL_PROFILER
// Note: setupThread is not defined without VL_PROFILER, hence the #ifdef. Still, we might
// not be profiling execution (e.g.: PGO only), so profilerp might still be nullptr.
if (profilerp) profilerp->setupThread(threadId);
#endif
workerp->workerLoop();
}
//=============================================================================
// VlThreadPool
VlThreadPool::VlThreadPool(VerilatedContext* contextp, int nThreads, bool profiling)
: m_profiling{profiling} {
VlThreadPool::VlThreadPool(VerilatedContext* contextp, int nThreads,
VlExecutionProfiler* profiler) {
// --threads N passes nThreads=N-1, as the "main" thread counts as 1
++nThreads;
const unsigned cpus = std::thread::hardware_concurrency();
if (cpus < nThreads + 1) {
if (cpus < nThreads) {
static int warnedOnce = 0;
if (!warnedOnce++) {
VL_PRINTF_MT("%%Warning: System has %u CPUs but model Verilated with"
" --threads %d; may run slow.\n",
cpus, nThreads + 1);
cpus, nThreads);
}
}
// Create'em
for (int i = 0; i < nThreads; ++i) {
m_workers.push_back(new VlWorkerThread{this, contextp, profiling});
// Create worker threads
for (uint32_t threadId = 1; threadId < nThreads; ++threadId) {
m_workers.push_back(new VlWorkerThread{threadId, contextp, profiler});
}
// Set up a profile buffer for the current thread too -- on the
// assumption that it's the same thread that calls eval and may be
// donated to run mtasks during the eval.
if (VL_UNLIKELY(m_profiling)) setupProfilingClientThread();
}
VlThreadPool::~VlThreadPool() {
// Each ~WorkerThread will wait for its thread to exit.
for (auto& i : m_workers) delete i;
if (VL_UNLIKELY(m_profiling)) tearDownProfilingClientThread();
}
void VlThreadPool::tearDownProfilingClientThread() {
assert(t_profilep);
delete t_profilep;
t_profilep = nullptr;
}
void VlThreadPool::setupProfilingClientThread() VL_MT_SAFE_EXCLUDES(m_mutex) {
assert(!t_profilep);
t_profilep = new ProfileTrace;
// Reserve some space in the thread-local profiling buffer;
// try not to malloc while collecting profiling.
t_profilep->reserve(4096);
{
const VerilatedLockGuard lock{m_mutex};
m_allProfiles.insert(t_profilep);
}
}
void VlThreadPool::profileAppendAll(const VlProfileRec& rec) VL_MT_SAFE_EXCLUDES(m_mutex) {
const VerilatedLockGuard lock{m_mutex};
for (const auto& profilep : m_allProfiles) {
// Every thread's profile trace gets a copy of rec.
profilep->emplace_back(rec);
}
}
void VlThreadPool::profileDump(const char* filenamep, vluint64_t tickStart, vluint64_t tickEnd)
VL_MT_SAFE_EXCLUDES(m_mutex) {
const VerilatedLockGuard lock{m_mutex};
VL_DEBUG_IF(VL_DBG_MSGF("+prof+threads writing to '%s'\n", filenamep););
FILE* const fp = std::fopen(filenamep, "w");
if (VL_UNLIKELY(!fp)) {
VL_FATAL_MT(filenamep, 0, "", "+prof+threads+file file not writable");
// cppcheck-suppress resourceLeak // bug, doesn't realize fp is nullptr
return; // LCOV_EXCL_LINE
}
// TODO Perhaps merge with verilated_coverage output format, so can
// have a common merging and reporting tool, etc.
fprintf(fp, "VLPROFTHREAD 1.1 # Verilator thread profile dump version 1.1\n");
fprintf(fp, "VLPROF arg --threads %" PRIu64 "\n", vluint64_t(m_workers.size() + 1));
fprintf(fp, "VLPROF arg +verilator+prof+threads+start+%" PRIu64 "\n",
Verilated::threadContextp()->profThreadsStart());
fprintf(fp, "VLPROF arg +verilator+prof+threads+window+%u\n",
Verilated::threadContextp()->profThreadsWindow());
fprintf(fp, "VLPROF stat yields %" PRIu64 "\n", VlMTaskVertex::yields());
// Copy /proc/cpuinfo into this output so verilator_gantt can be run on
// a different machine
{
const std::unique_ptr<std::ifstream> ifp{new std::ifstream("/proc/cpuinfo")};
if (!ifp->fail()) {
std::string line;
while (std::getline(*ifp, line)) { fprintf(fp, "VLPROFPROC %s\n", line.c_str()); }
}
}
vluint32_t thread_id = 0;
for (const auto& pi : m_allProfiles) {
++thread_id;
bool printing = false; // False while in warmup phase
for (const auto& ei : *pi) {
switch (ei.m_type) {
case VlProfileRec::TYPE_BARRIER: //
printing = true;
break;
case VlProfileRec::TYPE_EVAL:
if (!printing) break;
fprintf(fp,
"VLPROF eval start %" PRIu64 " elapsed %" PRIu64 " cpu %u on thread %u\n",
ei.m_startTime - tickStart, (ei.m_endTime - ei.m_startTime), ei.m_cpu,
thread_id);
break;
case VlProfileRec::TYPE_EVAL_LOOP:
if (!printing) break;
fprintf(fp,
"VLPROF eval_loop start %" PRIu64 " elapsed %" PRIu64
" cpu %u on thread %u\n",
ei.m_startTime - tickStart, (ei.m_endTime - ei.m_startTime), ei.m_cpu,
thread_id);
break;
case VlProfileRec::TYPE_MTASK_RUN:
if (!printing) break;
fprintf(fp,
"VLPROF mtask %d"
" start %" PRIu64 " elapsed %" PRIu64
" predict_start %u predict_cost %u cpu %u on thread %u\n",
ei.m_mtaskId, ei.m_startTime - tickStart, (ei.m_endTime - ei.m_startTime),
ei.m_predictStart, ei.m_predictCost, ei.m_cpu, thread_id);
break;
default: assert(false); break; // LCOV_EXCL_LINE
}
}
}
fprintf(fp, "VLPROF stat ticks %" PRIu64 "\n", tickEnd - tickStart);
std::fclose(fp);
}


@ -35,8 +35,10 @@
#error "verilated_threads.h/cpp expected VL_THREADED (from verilator --threads)"
#endif
#include <atomic>
#include <condition_variable>
#include <set>
#include <thread>
#include <vector>
// clang-format off
@ -127,64 +129,7 @@ public:
}
};
// Profiling support
class VlProfileRec final {
protected:
friend class VlThreadPool;
enum VlProfileE { TYPE_MTASK_RUN, TYPE_EVAL, TYPE_EVAL_LOOP, TYPE_BARRIER };
// Layout below allows efficient packing.
// Leave endTime first, so no math needed to calculate address in endRecord
vluint64_t m_endTime = 0; // Tick at end of execution
vluint64_t m_startTime = 0; // Tick at start of execution
vluint32_t m_mtaskId = 0; // Mtask we're logging
vluint32_t m_predictStart = 0; // Time scheduler predicted would start
vluint32_t m_predictCost = 0; // How long scheduler predicted would take
VlProfileE m_type = TYPE_BARRIER; // Record type
unsigned m_cpu; // Execution CPU number (at start anyways)
public:
class Barrier {};
VlProfileRec() = default;
explicit VlProfileRec(Barrier) { m_cpu = getcpu(); }
void startEval(vluint64_t time) {
m_type = VlProfileRec::TYPE_EVAL;
m_startTime = time;
m_cpu = getcpu();
}
void startEvalLoop(vluint64_t time) {
m_type = VlProfileRec::TYPE_EVAL_LOOP;
m_startTime = time;
m_cpu = getcpu();
}
void startRecord(vluint64_t time, vluint32_t mtask, vluint32_t predictStart,
vluint32_t predictCost) {
m_type = VlProfileRec::TYPE_MTASK_RUN;
m_mtaskId = mtask;
m_predictStart = predictStart;
m_predictCost = predictCost;
m_startTime = time;
m_cpu = getcpu();
}
void endRecord(vluint64_t time) { m_endTime = time; }
static int getcpu() { // Return current executing CPU
#if defined(__linux)
return sched_getcpu();
#elif defined(__APPLE__) && !defined(__arm64__)
vluint32_t info[4];
__cpuid_count(1, 0, info[0], info[1], info[2], info[3]);
// info[1] is EBX, bits 24-31 are APIC ID
if ((info[3] & (1 << 9)) == 0) {
return -1; // no APIC on chip
} else {
return (unsigned)info[1] >> 24;
}
#elif defined(_WIN32)
return GetCurrentProcessorNumber();
#else
return 0;
#endif
}
};
class VlExecutionProfiler;
class VlThreadPool;
class VlWorkerThread final {
@ -217,9 +162,6 @@ private:
// Store the size atomically, so we can spin wait
std::atomic<size_t> m_ready_size;
VlThreadPool* const m_poolp; // Our associated thread pool
const bool m_profiling; // Is profiling enabled?
std::atomic<bool> m_exiting; // Worker thread should exit
std::thread m_cthread; // Underlying C++ thread record
VerilatedContext* const m_contextp; // Context for spawned thread
@ -228,7 +170,8 @@ private:
public:
// CONSTRUCTORS
explicit VlWorkerThread(VlThreadPool* poolp, VerilatedContext* contextp, bool profiling);
explicit VlWorkerThread(uint32_t threadId, VerilatedContext* contextp,
VlExecutionProfiler* profilerp);
~VlWorkerThread();
// METHODS
@ -265,34 +208,20 @@ public:
if (notify) m_cv.notify_one();
}
void workerLoop();
static void startWorker(VlWorkerThread* workerp);
static void startWorker(VlWorkerThread* workerp, uint32_t threadId,
VlExecutionProfiler* profilerp);
};
class VlThreadPool final {
// TYPES
using ProfileTrace = std::vector<VlProfileRec>;
// MEMBERS
std::vector<VlWorkerThread*> m_workers; // our workers
const bool m_profiling; // is profiling enabled?
// Support profiling -- we can append records of profiling events
// to this vector with very low overhead, and then dump them out
// later. This prevents the overhead of printf/malloc/IO from
// corrupting the profiling data. It's super cheap to append
// a VlProfileRec struct on the end of a pre-allocated vector;
// this is the only cost we pay in real-time during a profiling cycle.
// Internal note: Globals may multi-construct, see verilated.cpp top.
static VL_THREAD_LOCAL ProfileTrace* t_profilep;
std::set<ProfileTrace*> m_allProfiles VL_GUARDED_BY(m_mutex);
VerilatedMutex m_mutex;
public:
// CONSTRUCTORS
// Construct a thread pool with 'nThreads' dedicated threads. The thread
// pool will create these threads and make them available to execute tasks
// via this->workerp(index)->addTask(...)
VlThreadPool(VerilatedContext* contextp, int nThreads, bool profiling);
VlThreadPool(VerilatedContext* contextp, int nThreads, VlExecutionProfiler* profilerp);
~VlThreadPool();
// METHODS
@ -302,17 +231,6 @@ public:
assert(index < m_workers.size());
return m_workers[index];
}
inline VlProfileRec* profileAppend() {
t_profilep->emplace_back();
return &(t_profilep->back());
}
void profileAppendAll(const VlProfileRec& rec) VL_MT_SAFE_EXCLUDES(m_mutex);
void profileDump(const char* filenamep, vluint64_t tickStart, vluint64_t tickEnd)
VL_MT_SAFE_EXCLUDES(m_mutex);
// In profiling mode, each executing thread must call
// this once to setup profiling state:
void setupProfilingClientThread() VL_MT_SAFE_EXCLUDES(m_mutex);
void tearDownProfilingClientThread();
private:
VL_UNCOPYABLE(VlThreadPool);


@ -438,7 +438,7 @@ using ssize_t = uint32_t; ///< signed size_t; returned from read()
#if defined(__i386__) || defined(__x86_64__)
// The vluint64_t argument is loaded with a high-performance counter for profiling
// or 0x0 if not implemented on this platform
#define VL_RDTSC(val) \
#define VL_GET_CPU_TICK(val) \
{ \
vluint32_t hi, lo; \
asm volatile("rdtsc" : "=a"(lo), "=d"(hi)); \
@ -446,14 +446,14 @@ using ssize_t = uint32_t; ///< signed size_t; returned from read()
}
#elif defined(__aarch64__)
// 1 GHz virtual system timer on SBSA level 5 compliant systems, else often 100 MHz
# define VL_RDTSC(val) \
# define VL_GET_CPU_TICK(val) \
{ \
asm volatile("isb" : : : "memory"); \
asm volatile("mrs %[rt],CNTVCT_EL0" : [rt] "=r"(val)); \
}
#else
// We just silently ignore unknown OSes, as this only leads to missing statistics
# define VL_RDTSC(val) (val) = 0;
# define VL_GET_CPU_TICK(val) (val) = 0;
#endif
//=========================================================================
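
For orientation, the renamed macro writes the current CPU tick counter into its lvalue argument (or 0 where the platform is not supported), so an interval is measured by sampling it twice. A minimal sketch, assuming only what the macro definition above shows:

#include "verilatedos.h"

// Sketch only: measure a region in raw CPU ticks via the renamed macro.
// On platforms where the macro is stubbed out, both samples are 0.
static vluint64_t exampleElapsedTicks() {
    vluint64_t tickBegin, tickEnd;
    VL_GET_CPU_TICK(tickBegin);
    // ... region being measured ...
    VL_GET_CPU_TICK(tickEnd);
    return tickEnd - tickBegin;
}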

View File

@ -63,7 +63,7 @@ class EmitCGatherDependencies final : VNVisitor {
UASSERT_OBJ(selfPointer.find("vlSymsp") != string::npos, nodep,
"Unknown self pointer: '" << selfPointer << "'");
// Dereferencing vlSymsp, so we need its definition...
m_dependencies.insert(EmitCBaseVisitor::symClassName());
addSymsDependency();
}
}
@ -117,9 +117,7 @@ class EmitCGatherDependencies final : VNVisitor {
iterateChildrenConst(nodep);
}
virtual void visit(AstNodeSimpleText* nodep) override {
if (nodep->text().find("vlSymsp") != string::npos) {
m_dependencies.insert(EmitCBaseVisitor::symClassName());
}
if (nodep->text().find("vlSymsp") != string::npos) addSymsDependency();
iterateChildrenConst(nodep);
}
virtual void visit(AstNode* nodep) override { iterateChildrenConst(nodep); }

View File

@ -178,6 +178,9 @@ class CMakeEmitter final {
if (v3Global.opt.mtasks()) {
global.emplace_back("${VERILATOR_ROOT}/include/verilated_threads.cpp");
}
if (v3Global.opt.usesProfiler()) {
global.emplace_back("${VERILATOR_ROOT}/include/verilated_profiler.cpp");
}
if (!v3Global.opt.libCreate().empty()) {
global.emplace_back(v3Global.opt.makeDir() + "/" + v3Global.opt.libCreate() + ".cpp");
}

View File

@ -330,21 +330,14 @@ class EmitCModel final : public EmitCFunc {
if (initial)
puts(topModNameProtected + "__" + protect("_eval_settle") + "(&(vlSymsp->TOP));\n");
const string recName = "__Vprfloop";
if (v3Global.opt.profThreads() && !initial) {
puts("VlProfileRec* " + recName + " = nullptr;\n");
// Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
puts("if (VL_UNLIKELY(vlSymsp->__Vm_profile_cycle_start)) {\n");
// Eval start
puts(/**/ recName + " = vlSymsp->__Vm_threadPoolp->profileAppend();\n");
puts(/**/ recName + "->startEvalLoop(VL_RDTSC_Q());\n");
puts("}\n");
if (v3Global.opt.profExec() && !initial) {
puts("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalLoopBegin();\n");
}
puts(topModNameProtected + "__" + protect("_eval") + "(&(vlSymsp->TOP));\n");
if (v3Global.opt.profThreads() && !initial) {
puts("if (VL_UNLIKELY(" + recName + ")) " + recName + "->endRecord(VL_RDTSC_Q());\n");
if (v3Global.opt.profExec() && !initial) {
puts("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalLoopEnd();\n");
}
if (v3Global.rootp()->changeRequest()) {
@ -434,61 +427,9 @@ class EmitCModel final : public EmitCFunc {
puts("Verilated::mtaskId(" + cvtToStr(mtaskId) + ");\n");
}
if (v3Global.opt.profThreads()) {
puts("if (VL_UNLIKELY((vlSymsp->_vm_contextp__->profThreadsStart() != "
"vlSymsp->__Vm_profile_time_finished)\n");
puts(" && (VL_TIME_Q() > vlSymsp->_vm_contextp__->profThreadsStart())\n");
puts(" && (vlSymsp->_vm_contextp__->profThreadsWindow() >= 1))) {\n");
// Within a profile (either starting, middle, or end)
puts(/**/ "if (vlSymsp->__Vm_profile_window_ct == 0) {\n"); // Opening file?
puts(/**/ "VL_DEBUG_IF(VL_DBG_MSGF(\"+ profile start warmup\\n\"););\n");
// Start profile on this cycle. We'll capture a window worth, then
// only analyze the next window worth. The idea is that the first window
// capture will hit some cache-cold stuff (eg printf) but it'll be warm
// by the time we hit the second window, we hope.
puts(/****/ "vlSymsp->__Vm_profile_cycle_start = VL_RDTSC_Q();\n");
// "* 2" as first half is warmup, second half is collection
puts(/****/ "vlSymsp->__Vm_profile_window_ct"
" = vlSymsp->_vm_contextp__->profThreadsWindow()"
" * 2 + 1;\n");
puts(/**/ "}\n");
puts(/**/ "--(vlSymsp->__Vm_profile_window_ct);\n");
puts(/**/ "if (vlSymsp->__Vm_profile_window_ct"
" == vlSymsp->_vm_contextp__->profThreadsWindow()) {\n");
// This barrier record in every threads' profile demarcates the
// cache-warm-up cycles before the barrier from the actual profile
// cycles afterward.
puts(/****/ "vlSymsp->__Vm_threadPoolp->profileAppendAll(");
puts(/****/ "VlProfileRec{VlProfileRec::Barrier{}});\n");
puts(/****/ "vlSymsp->__Vm_profile_cycle_start = VL_RDTSC_Q();\n");
puts(/**/ "}\n");
// Ending trace file?
puts(/**/ "else if (vlSymsp->__Vm_profile_window_ct == 0) {\n");
puts(/****/ "vluint64_t tick_end = VL_RDTSC_Q();\n");
puts(/****/ "VL_DEBUG_IF(VL_DBG_MSGF(\"+ profile end\\n\"););\n");
puts(/****/ "vlSymsp->__Vm_threadPoolp->profileDump("
"vlSymsp->_vm_contextp__->profThreadsFilename().c_str(), "
"vlSymsp->__Vm_profile_cycle_start, "
"tick_end);\n");
// This turns off the test to enter the profiling code, but still
// allows the user to collect another profile by changing
// profThreadsStart
puts(/****/ "vlSymsp->__Vm_profile_time_finished = "
"vlSymsp->_vm_contextp__->profThreadsStart();\n");
puts(/****/ "vlSymsp->__Vm_profile_cycle_start = 0;\n");
puts(/**/ "}\n");
puts("}\n");
}
const string recName = "__Vprfeval";
if (v3Global.opt.profThreads()) {
puts("VlProfileRec* " + recName + " = nullptr;\n");
// Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
puts("if (VL_UNLIKELY(vlSymsp->__Vm_profile_cycle_start)) {\n");
// Eval start
puts(/**/ recName + " = vlSymsp->__Vm_threadPoolp->profileAppend();\n");
puts(/**/ recName + "->startEval(VL_RDTSC_Q());\n");
puts("}\n");
if (v3Global.opt.profExec()) {
puts("vlSymsp->__Vm_executionProfiler.configure(*(vlSymsp->_vm_contextp__));\n");
puts("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalBegin();\n");
}
emitSettleLoop(modp, /* initial: */ false);
@ -499,10 +440,7 @@ class EmitCModel final : public EmitCFunc {
}
if (v3Global.opt.threads()) puts("Verilated::endOfEval(vlSymsp->__Vm_evalMsgQp);\n");
if (v3Global.opt.profThreads()) {
// End eval record
puts("if (VL_UNLIKELY(" + recName + ")) " + recName + "->endRecord(VL_RDTSC_Q());\n");
}
if (v3Global.opt.profExec()) puts("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalEnd();\n");
puts("}\n");
}
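
Taken together, the emitter changes above reduce the profiling code in the generated eval wrapper to a few event calls. A hedged sketch of the statement sequence the emitter now produces when --prof-exec is enabled (the mangled _eval name is illustrative, and the surrounding settle-loop structure is elided):

// Illustrative generated-code fragment only; follows the puts() strings above.
vlSymsp->__Vm_executionProfiler.configure(*(vlSymsp->_vm_contextp__));
VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalBegin();
// ... each pass of the settle loop is wrapped as:
VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalLoopBegin();
Vtop___024root___eval(&(vlSymsp->TOP));    // illustrative mangled name
VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalLoopEnd();
// ... after the settle loop completes:
VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalEnd();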

View File

@ -395,7 +395,7 @@ void EmitCSyms::emitSymHdr() {
if (v3Global.needTraceDumper()) {
puts("#include \"" + v3Global.opt.traceSourceLang() + ".h\"\n");
}
if (v3Global.opt.profThreads()) puts("#include \"verilated_profiler.h\"\n");
if (v3Global.opt.usesProfiler()) puts("#include \"verilated_profiler.h\"\n");
puts("\n// INCLUDE MODEL CLASS\n");
puts("\n#include \"" + topClassName() + ".h\"\n");
@ -445,18 +445,15 @@ void EmitCSyms::emitSymHdr() {
}
puts("bool __Vm_didInit = false;\n");
if (v3Global.opt.profExec()) {
puts("\n// EXECUTION PROFILING\n");
puts("VlExecutionProfiler __Vm_executionProfiler;\n");
}
if (v3Global.opt.mtasks()) {
puts("\n// MULTI-THREADING\n");
puts("VlThreadPool* const __Vm_threadPoolp;\n");
puts("bool __Vm_even_cycle = false;\n");
if (v3Global.opt.profThreads()) {
// rdtsc() at current cycle start
puts("vluint64_t __Vm_profile_cycle_start = 0;\n");
// Time we finished analysis
puts("vluint64_t __Vm_profile_time_finished = 0;\n");
// Track our position in the cache warmup and actual profile window
puts("vluint32_t __Vm_profile_window_ct = 0;\n");
}
}
puts("\n// MODULE INSTANCE STATE\n");
@ -477,8 +474,8 @@ void EmitCSyms::emitSymHdr() {
puts("];\n");
}
if (v3Global.opt.profThreads()) {
puts("\n// PROFILING\n");
if (v3Global.opt.profPgo()) {
puts("\n// PGO PROFILING\n");
vluint64_t maxProfilerId = 0;
if (v3Global.opt.mtasks()) {
for (const V3GraphVertex* vxp
@ -490,7 +487,7 @@ void EmitCSyms::emitSymHdr() {
}
}
++maxProfilerId; // As size must include 0
puts("VerilatedProfiler<" + cvtToStr(maxProfilerId) + "> _vm_profiler;\n");
puts("VlPgoProfiler<" + cvtToStr(maxProfilerId) + "> _vm_pgoProfiler;\n");
}
if (!m_scopeNames.empty()) { // Scope names
@ -682,8 +679,8 @@ void EmitCSyms::emitSymImp() {
puts("if (__Vm_dumping) _traceDumpClose();\n");
puts("#endif // VM_TRACE\n");
}
if (v3Global.opt.profThreads()) {
puts("_vm_profiler.write(\"" + topClassName()
if (v3Global.opt.profPgo()) {
puts("_vm_pgoProfiler.write(\"" + topClassName()
+ "\", _vm_contextp__->profVltFilename());\n");
}
if (v3Global.opt.mtasks()) puts("delete __Vm_threadPoolp;\n");
@ -719,8 +716,8 @@ void EmitCSyms::emitSymImp() {
// that calls eval() becomes the final Nth thread for the
// duration of the eval call.
puts(" , __Vm_threadPoolp{new VlThreadPool{_vm_contextp__, "
+ cvtToStr(v3Global.opt.threads() - 1) + ", " + cvtToStr(v3Global.opt.profThreads())
+ "}}\n");
+ cvtToStr(v3Global.opt.threads() - 1) + ", "
+ (v3Global.opt.profExec() ? "&__Vm_executionProfiler" : "nullptr") + "}}\n");
}
puts(" // Setup module instances\n");
@ -741,14 +738,14 @@ void EmitCSyms::emitSymImp() {
}
puts("{\n");
if (v3Global.opt.profThreads()) {
puts("// Configure profiling\n");
if (v3Global.opt.profPgo()) {
puts("// Configure profiling for PGO\n");
if (v3Global.opt.mtasks()) {
for (const V3GraphVertex* vxp
= v3Global.rootp()->execGraphp()->depGraphp()->verticesBeginp();
vxp; vxp = vxp->verticesNextp()) {
ExecMTask* const mtp = dynamic_cast<ExecMTask*>(const_cast<V3GraphVertex*>(vxp));
puts("_vm_profiler.addCounter(" + cvtToStr(mtp->profilerId()) + ", \""
puts("_vm_pgoProfiler.addCounter(" + cvtToStr(mtp->profilerId()) + ", \""
+ mtp->hashName() + "\");\n");
}
}
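
The loop above registers one PGO counter per mtask in the generated symbol table constructor, keyed by the mtask's hash name. A hedged sketch of the resulting generated code (the template size, counter ids, and hash names are illustrative):

// Illustrative generated-code fragment only.
// In the syms header, sized to the maximum profilerId + 1:
//   VlPgoProfiler<13> _vm_pgoProfiler;
// In the syms constructor body:
_vm_pgoProfiler.addCounter(11, "mtask11_hash");    // illustrative id and hash name
_vm_pgoProfiler.addCounter(12, "mtask12_hash");
// In the syms destructor, the counters are written out for later --prof-pgo use:
//   _vm_pgoProfiler.write("Vtop", _vm_contextp__->profVltFilename());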

View File

@ -112,6 +112,9 @@ public:
}
}
if (v3Global.opt.mtasks()) putMakeClassEntry(of, "verilated_threads.cpp");
if (v3Global.opt.usesProfiler()) {
putMakeClassEntry(of, "verilated_profiler.cpp");
}
} else if (support == 2 && slow) {
} else {
for (AstNodeFile* nodep = v3Global.rootp()->filesp(); nodep;
@ -189,6 +192,7 @@ public:
of.puts("# User CFLAGS (from -CFLAGS on Verilator command line)\n");
of.puts("VM_USER_CFLAGS = \\\n");
if (!v3Global.opt.libCreate().empty()) of.puts("\t-fPIC \\\n");
if (v3Global.opt.usesProfiler()) of.puts("\t-DVL_PROFILER \\\n");
const V3StringList& cFlags = v3Global.opt.cFlags();
for (const string& i : cFlags) of.puts("\t" + i + " \\\n");
of.puts("\n");

View File

@ -1236,7 +1236,13 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
DECL_OPTION("-prof-cfuncs", CbCall, [this]() { m_profC = m_profCFuncs = true; });
DECL_OPTION("-profile-cfuncs", CbCall,
[this]() { m_profC = m_profCFuncs = true; }); // Renamed
DECL_OPTION("-prof-threads", OnOff, &m_profThreads);
DECL_OPTION("-prof-exec", OnOff, &m_profExec);
DECL_OPTION("-prof-pgo", OnOff, &m_profPgo);
DECL_OPTION("-prof-threads", CbOnOff, [this, fl](bool flag) {
fl->v3warn(DEPRECATED, "Option --prof-threads is deprecated. "
"Use --prof-exec and --prof-pgo instead.");
m_profExec = m_profPgo = flag;
});
DECL_OPTION("-protect-ids", OnOff, &m_protectIds);
DECL_OPTION("-protect-key", Set, &m_protectKey);
DECL_OPTION("-protect-lib", CbVal, [this](const char* valp) {

View File

@ -255,7 +255,8 @@ private:
bool m_ppComments = false; // main switch: --pp-comments
bool m_profC = false; // main switch: --prof-c
bool m_profCFuncs = false; // main switch: --prof-cfuncs
bool m_profThreads = false; // main switch: --prof-threads
bool m_profExec = false; // main switch: --prof-exec
bool m_profPgo = false; // main switch: --prof-pgo
bool m_protectIds = false; // main switch: --protect-ids
bool m_public = false; // main switch: --public
bool m_publicFlatRW = false; // main switch: --public-flat-rw
@ -468,7 +469,9 @@ public:
bool ppComments() const { return m_ppComments; }
bool profC() const { return m_profC; }
bool profCFuncs() const { return m_profCFuncs; }
bool profThreads() const { return m_profThreads; }
bool profExec() const { return m_profExec; }
bool profPgo() const { return m_profPgo; }
bool usesProfiler() const { return profExec() || profPgo(); }
bool protectIds() const { return m_protectIds; }
bool allPublic() const { return m_public; }
bool publicFlatRW() const { return m_publicFlatRW; }

View File

@ -2918,43 +2918,39 @@ static void addMTaskToFunction(const ThreadSchedule& schedule, const uint32_t th
addStrStmt("vlSelf->" + name + +".waitUntilUpstreamDone(even_cycle);\n");
}
string recName;
if (v3Global.opt.profThreads()) {
recName = "__Vprfthr_" + cvtToStr(mtaskp->id());
addStrStmt("VlProfileRec* " + recName + " = nullptr;\n");
// Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
addStrStmt("if (VL_UNLIKELY(vlSymsp->__Vm_profile_cycle_start)) {\n" + //
recName + " = vlSymsp->__Vm_threadPoolp->profileAppend();\n" + //
recName + "->startRecord(VL_RDTSC_Q()," + //
" " + cvtToStr(mtaskp->id()) + "," + //
" " + cvtToStr(mtaskp->predictStart()) + "," + //
" " + cvtToStr(mtaskp->cost()) + ");\n" + //
"}\n");
if (v3Global.opt.profExec()) {
const string& id = cvtToStr(mtaskp->id());
const string& predictStart = cvtToStr(mtaskp->predictStart());
addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskBegin(" + id + ", " + predictStart
+ ");\n");
}
if (v3Global.opt.profThreads()) {
if (v3Global.opt.profPgo()) {
// No lock around startCounter, as counter numbers are unique per thread
addStrStmt("vlSymsp->_vm_profiler.startCounter(" + cvtToStr(mtaskp->profilerId())
addStrStmt("vlSymsp->_vm_pgoProfiler.startCounter(" + cvtToStr(mtaskp->profilerId())
+ ");\n");
}
//
addStrStmt("Verilated::mtaskId(" + cvtToStr(mtaskp->id()) + ");\n");
// Move the the actual body of calls to leaf functions into this function
// Move the actual body of calls to leaf functions into this function
funcp->addStmtsp(mtaskp->bodyp()->unlinkFrBack());
if (v3Global.opt.profThreads()) {
// No lock around stopCounter, as counter numbers are unique per thread
addStrStmt("vlSymsp->_vm_profiler.stopCounter(" + cvtToStr(mtaskp->profilerId()) + ");\n");
}
if (v3Global.opt.profThreads()) {
addStrStmt("if (VL_UNLIKELY(" + recName + ")) " //
+ recName + "->endRecord(VL_RDTSC_Q());\n");
}
// Flush message queue
addStrStmt("Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);\n");
if (v3Global.opt.profPgo()) {
// No lock around stopCounter, as counter numbers are unique per thread
addStrStmt("vlSymsp->_vm_pgoProfiler.stopCounter(" + cvtToStr(mtaskp->profilerId())
+ ");\n");
}
if (v3Global.opt.profExec()) {
const string& id = cvtToStr(mtaskp->id());
const string& predictConst = cvtToStr(mtaskp->cost());
addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskEnd(" + id + ", " + predictConst
+ ");\n");
}
// For any dependent mtask that's on another thread, signal one dependency completion.
for (V3GraphEdge* edgep = mtaskp->outBeginp(); edgep; edgep = edgep->outNextp()) {
const ExecMTask* const nextp = dynamic_cast<ExecMTask*>(edgep->top());
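
For reference, the statements that addMTaskToFunction now wraps around each mtask body give the emitted mtask functions roughly the following shape. A hedged sketch using mtask id 7 with predicted start 30 and predicted cost 30 (values taken from the golden profile data further below); the profilerId of 12 is illustrative, and the --prof-exec and --prof-pgo halves are emitted independently, so either may be absent:

// Illustrative generated-code fragment only.
VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskBegin(7, 30);     // --prof-exec: id, predictStart
vlSymsp->_vm_pgoProfiler.startCounter(12);               // --prof-pgo (profilerId illustrative)
Verilated::mtaskId(7);
// ... mtask body: the calls to leaf functions moved into this function ...
Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);    // flush message queue
vlSymsp->_vm_pgoProfiler.stopCounter(12);
VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskEnd(7, 30);       // --prof-exec: id, predictCost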

View File

@ -9,9 +9,8 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
# Test for bin/verilator_gantt,
#
# Only needed in multithreaded regression.
scenarios(vltmt => 1);
scenarios(vlt_all => 1);
# It doesn't really matter what test
# we use, so long as it runs several cycles,
@ -20,13 +19,13 @@ top_filename("t/t_gen_alw.v");
compile(
# Checks below care about thread count, so use 2 (minimum reasonable)
v_flags2 => ["--prof-threads --threads 2"]
v_flags2 => ["--prof-exec", ($Self->{vltmt} ? "--threads 2" : "")]
);
execute(
all_run_flags => ["+verilator+prof+threads+start+2",
" +verilator+prof+threads+window+2",
" +verilator+prof+threads+file+$Self->{obj_dir}/profile_threads.dat",
all_run_flags => ["+verilator+prof+exec+start+2",
" +verilator+prof+exec+window+2",
" +verilator+prof+exec+file+$Self->{obj_dir}/profile_exec.dat",
" +verilator+prof+vlt+file+$Self->{obj_dir}/profile.vlt",
],
check_finished => 1,
@ -37,17 +36,22 @@ execute(
# The profiling data still goes direct to the runtime's STDOUT
# (maybe that should go to a separate file - gantt.dat?)
run(cmd => ["$ENV{VERILATOR_ROOT}/bin/verilator_gantt",
"$Self->{obj_dir}/profile_threads.dat",
"--vcd $Self->{obj_dir}/profile_threads.vcd",
"$Self->{obj_dir}/profile_exec.dat",
"--vcd $Self->{obj_dir}/profile_exec.vcd",
"| tee $Self->{obj_dir}/gantt.log"],
);
file_grep("$Self->{obj_dir}/gantt.log", qr/Total threads += 2/i);
file_grep("$Self->{obj_dir}/gantt.log", qr/Total mtasks += 7/i);
if ($Self->{vltmt}) {
file_grep("$Self->{obj_dir}/gantt.log", qr/Total threads += 2/i);
file_grep("$Self->{obj_dir}/gantt.log", qr/Total mtasks += 7/i);
} else {
file_grep("$Self->{obj_dir}/gantt.log", qr/Total threads += 1/i);
file_grep("$Self->{obj_dir}/gantt.log", qr/Total mtasks += 0/i);
}
file_grep("$Self->{obj_dir}/gantt.log", qr/Total evals += 2/i);
# Diff to itself, just to check parsing
vcd_identical("$Self->{obj_dir}/profile_threads.vcd", "$Self->{obj_dir}/profile_threads.vcd");
vcd_identical("$Self->{obj_dir}/profile_exec.vcd", "$Self->{obj_dir}/profile_exec.vcd");
ok(1);
1;

View File

@ -1,8 +1,8 @@
VLPROFTHREAD 1.1 # Verilator thread profile dump version 1.1
VLPROF arg --threads 2
VLPROF arg +verilator+prof+threads+start+2
VLPROF arg +verilator+prof+threads+window+2
VLPROFVERSION 2.0
VLPROF arg +verilator+prof+exec+start+2
VLPROF arg +verilator+prof+exec+window+2
VLPROF stat yields 0
VLPROF stat threads 2
VLPROFPROC processor : 0
VLPROFPROC vendor_id : AuthenticTest
VLPROFPROC cpu family : 23
@ -899,22 +899,42 @@ VLPROFPROC cache_alignment : 64
VLPROFPROC address sizes : 43 bits physical, 48 bits virtual
VLPROFPROC power management: ts ttp tm hwpstate cpb eff_freq_ro [13] [14]
VLPROFPROC
VLPROF eval start 595 elapsed 11655 cpu 19 on thread 1
VLPROF eval_loop start 945 elapsed 11235 cpu 19 on thread 1
VLPROF mtask 6 start 2695 elapsed 210 predict_start 0 predict_cost 30 cpu 19 on thread 1
VLPROF mtask 10 start 9695 elapsed 175 predict_start 196 predict_cost 30 cpu 19 on thread 1
VLPROF eval start 13720 elapsed 8610 cpu 19 on thread 1
VLPROF eval_loop start 14000 elapsed 8085 cpu 19 on thread 1
VLPROF mtask 6 start 15610 elapsed 210 predict_start 0 predict_cost 30 cpu 19 on thread 1
VLPROF mtask 10 start 21700 elapsed 175 predict_start 196 predict_cost 30 cpu 19 on thread 1
VLPROF mtask 5 start 5495 elapsed 595 predict_start 0 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 7 start 6300 elapsed 595 predict_start 30 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 8 start 7490 elapsed 1050 predict_start 60 predict_cost 107 cpu 10 on thread 2
VLPROF mtask 9 start 9135 elapsed 595 predict_start 167 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 11 start 10255 elapsed 805 predict_start 197 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 5 start 18375 elapsed 595 predict_start 0 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 7 start 19145 elapsed 175 predict_start 30 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 8 start 19670 elapsed 140 predict_start 60 predict_cost 107 cpu 10 on thread 2
VLPROF mtask 9 start 20650 elapsed 70 predict_start 167 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 11 start 21140 elapsed 105 predict_start 197 predict_cost 30 cpu 10 on thread 2
VLPROFTHREAD 0
VLPROFEXEC EVAL_BEGIN 595
VLPROFEXEC EVAL_LOOP_BEGIN 945
VLPROFEXEC MTASK_BEGIN 2695 id 6 predictStart 0 cpu 19
VLPROFEXEC MTASK_END 2905 id 6 predictCost 30
VLPROFEXEC MTASK_BEGIN 9695 id 10 predictStart 196 cpu 19
VLPROFEXEC MTASK_END 9870 id 10 predictCost 30
VLPROFEXEC EVAL_LOOP_END 12180
VLPROFEXEC EVAL_END 12250
VLPROFEXEC EVAL_BEGIN 13720
VLPROFEXEC EVAL_LOOP_BEGIN 14000
VLPROFEXEC MTASK_BEGIN 15610 id 6 predictStart 0 cpu 19
VLPROFEXEC MTASK_END 15820 id 6 predictCost 30
VLPROFEXEC MTASK_BEGIN 21700 id 10 predictStart 196 cpu 19
VLPROFEXEC MTASK_END 21875 id 10 predictCost 30
VLPROFEXEC EVAL_LOOP_END 22085
VLPROFEXEC EVAL_END 22330
VLPROFTHREAD 1
VLPROFEXEC MTASK_BEGIN 5495 id 5 predictStart 0 cpu 10
VLPROFEXEC MTASK_END 6090 id 5 predictCost 30
VLPROFEXEC MTASK_BEGIN 6300 id 7 predictStart 30 cpu 10
VLPROFEXEC MTASK_END 6895 id 7 predictCost 30
VLPROFEXEC MTASK_BEGIN 7490 id 8 predictStart 60 cpu 10
VLPROFEXEC MTASK_END 8540 id 8 predictCost 107
VLPROFEXEC MTASK_BEGIN 9135 id 9 predictStart 167 cpu 10
VLPROFEXEC MTASK_END 9730 id 9 predictCost 30
VLPROFEXEC MTASK_BEGIN 10255 id 11 predictStart 197 cpu 10
VLPROFEXEC MTASK_END 11060 id 11 predictCost 30
VLPROFEXEC MTASK_BEGIN 18375 id 5 predictStart 0 cpu 10
VLPROFEXEC MTASK_END 18970 id 5 predictCost 30
VLPROFEXEC MTASK_BEGIN 19145 id 7 predictStart 30 cpu 10
VLPROFEXEC MTASK_END 19320 id 7 predictCost 30
VLPROFEXEC MTASK_BEGIN 19670 id 8 predictStart 60 cpu 10
VLPROFEXEC MTASK_END 19810 id 8 predictCost 107
VLPROFEXEC MTASK_BEGIN 20650 id 9 predictStart 167 cpu 10
VLPROFEXEC MTASK_END 20720 id 9 predictCost 30
VLPROFEXEC MTASK_BEGIN 21140 id 11 predictStart 197 cpu 10
VLPROFEXEC MTASK_END 21245 id 11 predictCost 30
VLPROF stat ticks 23415

View File

@ -1,9 +1,8 @@
Verilator Gantt report
Argument settings:
+verilator+prof+threads+start+2
+verilator+prof+threads+window+2
--threads 2
+verilator+prof+exec+start+2
+verilator+prof+exec+window+2
Analysis:
Total threads = 2
@ -23,7 +22,7 @@ Prediction (what Verilator used for scheduling):
All-thread efficiency = 63.2%
All-thread speedup = 1.3
Statistics:
MTask statistics:
min log(p2e) = -3.681 from mtask 5 (predict 30, elapsed 1190)
max log(p2e) = -2.409 from mtask 8 (predict 107, elapsed 1190)
mean = -2.992
@ -34,4 +33,4 @@ CPUs:
cpu 10: cpu_time=4725 socket=0 core=10 Test Ryzen 9 3950X 16-Core Processor
cpu 19: cpu_time=770 socket=0 core=3 Test Ryzen 9 3950X 16-Core Processor
Writing profile_threads.vcd
Writing profile_exec.vcd

View File

@ -16,7 +16,7 @@ run(cmd => ["cd $Self->{obj_dir} && $ENV{VERILATOR_ROOT}/bin/verilator_gantt"
files_identical("$Self->{obj_dir}/gantt.log", $Self->{golden_filename});
vcd_identical("$Self->{obj_dir}/profile_threads.vcd", "$Self->{t_dir}/$Self->{name}.vcd.out");
vcd_identical("$Self->{obj_dir}/profile_exec.vcd", "$Self->{t_dir}/$Self->{name}.vcd.out");
ok(1);
1;

View File

@ -15,8 +15,8 @@ $timescale 1ns $end
$var wire 32 vc eval_loop [31:0] $end
$upscope $end
$scope module measured_threads $end
$var wire 32 v0 thread1_mtask [31:0] $end
$var wire 32 v4 thread2_mtask [31:0] $end
$var wire 32 v0 thread0_mtask [31:0] $end
$var wire 32 v4 thread1_mtask [31:0] $end
$upscope $end
$scope module mtasks $end
$var wire 32 v3 mtask10_cpu [31:0] $end
@ -28,8 +28,8 @@ $timescale 1ns $end
$var wire 32 v9 mtask9_cpu [31:0] $end
$upscope $end
$scope module predicted_threads $end
$var wire 32 vd predicted_thread1_mtask [31:0] $end
$var wire 32 ve predicted_thread2_mtask [31:0] $end
$var wire 32 vd predicted_thread0_mtask [31:0] $end
$var wire 32 ve predicted_thread1_mtask [31:0] $end
$upscope $end
$upscope $end
$enddefinitions $end
@ -65,7 +65,7 @@ b111 ve
b1 v10
#2695
b110 v0
b1 v1
b0 v1
b10011 v2
b1 vf
#2905
@ -78,7 +78,7 @@ b1000 ve
b1 v10
#5495
b101 v4
b10 v5
b1 v5
b1010 v6
b1 vf
#6090
@ -88,7 +88,7 @@ bz v6
b0 vf
#6300
b111 v4
b10 v5
b1 v5
b1010 v7
b1 vf
#6895
@ -98,7 +98,7 @@ bz v7
b0 vf
#7490
b1000 v4
b10 v5
b1 v5
b1010 v8
b1 vf
#8540
@ -108,7 +108,7 @@ bz v8
b0 vf
#9135
b1001 v4
b10 v5
b1 v5
b1010 v9
b1 vf
#9210
@ -116,7 +116,7 @@ b1001 ve
b1 v10
#9695
b1010 v0
b1 v1
b0 v1
b10011 v3
b10 vf
#9730
@ -131,7 +131,7 @@ bz v3
b0 vf
#10255
b1011 v4
b10 v5
b1 v5
b1010 va
b1 vf
#10645
@ -167,7 +167,7 @@ b111 ve
b1 v10
#15610
b110 v0
b1 v1
b0 v1
b10011 v2
b1 vf
#15820
@ -180,7 +180,7 @@ b1000 ve
b1 v10
#18375
b101 v4
b10 v5
b1 v5
b1010 v6
b1 vf
#18970
@ -190,7 +190,7 @@ bz v6
b0 vf
#19145
b111 v4
b10 v5
b1 v5
b1010 v7
b1 vf
#19320
@ -200,7 +200,7 @@ bz v7
b0 vf
#19670
b1000 v4
b10 v5
b1 v5
b1010 v8
b1 vf
#19810
@ -213,7 +213,7 @@ b1001 ve
b1 v10
#20650
b1001 v4
b10 v5
b1 v5
b1010 v9
b1 vf
#20720
@ -229,7 +229,7 @@ b1011 ve
b10 v10
#21140
b1011 v4
b10 v5
b1 v5
b1010 va
b1 vf
#21245
@ -239,7 +239,7 @@ bz va
b0 vf
#21700
b1010 v0
b1 v1
b0 v1
b10011 v3
b1 vf
#21875

View File

@ -1,7 +1,7 @@
VLPROFTHREAD 1.1 # Verilator thread profile dump version 1.1
VLPROF arg --threads 4
VLPROF arg +verilator+prof+threads+start+1
VLPROF arg +verilator+prof+threads+window+2
VLPROFVERSION 2.0
VLPROF arg +verilator+prof+exec+start+1
VLPROF arg +verilator+prof+exec+window+2
VLPROF stat threads 2
VLPROF stat yields 51
VLPROFPROC processor : 0
VLPROFPROC model name : Phytium,FT-2500/128
@ -43,11 +43,20 @@ VLPROFPROC CPU variant : 0x1
VLPROFPROC CPU part : 0x663
VLPROFPROC CPU revision : 3
VLPROFPROC
VLPROF eval start 57709 elapsed 1745979 cpu 2 on thread 1
VLPROF eval_loop start 58532 elapsed 1744353 cpu 2 on thread 1
VLPROF mtask 85 start 90465 elapsed 64569 predict_start 14315 predict_cost 30533 cpu 2 on thread 1
VLPROF mtask 79 start 156555 elapsed 137754 predict_start 44848 predict_cost 48001 cpu 2 on thread 1
VLPROF mtask 90 start 77352 elapsed 1159 predict_start 14315 predict_cost 21592 cpu 3 on thread 2
VLPROF mtask 81 start 79799 elapsed 868 predict_start 35907 predict_cost 29215 cpu 3 on thread 2
VLPROF mtask 87 start 81746 elapsed 887 predict_start 65147 predict_cost 33809 cpu 3 on thread 2
VLPROFTHREAD 0
VLPROFEXEC EVAL_BEGIN 57709
VLPROFEXEC EVAL_LOOP_BEGIN 58532
VLPROFEXEC MTASK_BEGIN 90465 id 85 predictStart 14315 cpu 2
VLPROFEXEC MTASK_END 155034 id 85 predictCost 30533
VLPROFEXEC MTASK_BEGIN 156555 id 79 predictStart 44848 cpu 2
VLPROFEXEC MTASK_END 294309 id 79 predictCost 48001
VLPROFEXEC EVAL_LOOP_END 1802885
VLPROFEXEC EVAL_END 1803688
VLPROFTHREAD 1
VLPROFEXEC MTASK_BEGIN 77352 id 90 predictStart 14315 cpu 3
VLPROFEXEC MTASK_END 78511 id 90 predictCost 21592
VLPROFEXEC MTASK_BEGIN 79799 id 81 predictStart 35907 cpu 3
VLPROFEXEC MTASK_END 80667 id 81 predictCost 29215
VLPROFEXEC MTASK_BEGIN 81746 id 87 predictStart 65147 cpu 3
VLPROFEXEC MTASK_END 82633 id 87 predictCost 33809
VLPROF stat ticks 180832

View File

@ -1,9 +1,8 @@
Verilator Gantt report
Argument settings:
+verilator+prof+threads+start+1
+verilator+prof+threads+window+2
--threads 4
+verilator+prof+exec+start+1
+verilator+prof+exec+window+2
Analysis:
Total threads = 2
@ -23,7 +22,7 @@ Prediction (what Verilator used for scheduling):
All-thread efficiency = 82.4%
All-thread speedup = 1.6
Statistics:
MTask statistics:
min log(p2e) = -1.054 from mtask 79 (predict 48001, elapsed 137754)
max log(p2e) = 3.641 from mtask 87 (predict 33809, elapsed 887)
mean = 1.656
@ -34,4 +33,4 @@ CPUs:
cpu 2: cpu_time=202323 Phytium,FT-2500/128
cpu 3: cpu_time=2914 Phytium,FT-2500/128
Writing profile_threads.vcd
Writing profile_exec.vcd

View File

@ -1,24 +1,44 @@
VLPROFTHREAD 1.1 # Verilator thread profile dump version 1.1
VLPROF arg --threads 2
VLPROF arg +verilator+prof+threads+start+2
VLPROF arg +verilator+prof+threads+window+2
VLPROFVERSION 2.0
VLPROF arg +verilator+prof+exec+start+2
VLPROF arg +verilator+prof+exec+window+2
VLPROF stat threads 2
VLPROF stat yields 0
VLPROF eval start 595 elapsed 11655 cpu 19 on thread 1
VLPROF eval_loop start 945 elapsed 11235 cpu 19 on thread 1
VLPROF mtask 6 start 2695 elapsed 210 predict_start 0 predict_cost 30 cpu 19 on thread 1
VLPROF mtask 10 start 9695 elapsed 175 predict_start 196 predict_cost 30 cpu 19 on thread 1
VLPROF eval start 13720 elapsed 8610 cpu 19 on thread 1
VLPROF eval_loop start 14000 elapsed 8085 cpu 19 on thread 1
VLPROF mtask 6 start 15610 elapsed 210 predict_start 0 predict_cost 30 cpu 19 on thread 1
VLPROF mtask 10 start 21700 elapsed 175 predict_start 196 predict_cost 30 cpu 19 on thread 1
VLPROF mtask 5 start 5495 elapsed 595 predict_start 0 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 7 start 6300 elapsed 595 predict_start 30 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 8 start 7490 elapsed 1050 predict_start 60 predict_cost 107 cpu 10 on thread 2
VLPROF mtask 9 start 9135 elapsed 595 predict_start 167 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 11 start 10255 elapsed 805 predict_start 197 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 5 start 18375 elapsed 595 predict_start 0 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 7 start 19145 elapsed 175 predict_start 30 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 8 start 19670 elapsed 140 predict_start 60 predict_cost 107 cpu 10 on thread 2
VLPROF mtask 9 start 20650 elapsed 70 predict_start 167 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 11 start 21140 elapsed 105 predict_start 197 predict_cost 30 cpu 10 on thread 2
VLPROFTHREAD 0
VLPROFEXEC EVAL_BEGIN 595
VLPROFEXEC EVAL_LOOP_BEGIN 945
VLPROFEXEC MTASK_BEGIN 2695 id 6 predictStart 0 cpu 19
VLPROFEXEC MTASK_END 2905 id 6 predictCost 30
VLPROFEXEC MTASK_BEGIN 9695 id 10 predictStart 196 cpu 19
VLPROFEXEC MTASK_END 9870 id 10 predictCost 30
VLPROFEXEC EVAL_LOOP_END 12180
VLPROFEXEC EVAL_END 12250
VLPROFEXEC EVAL_BEGIN 13720
VLPROFEXEC EVAL_LOOP_BEGIN 14000
VLPROFEXEC MTASK_BEGIN 15610 id 6 predictStart 0 cpu 19
VLPROFEXEC MTASK_END 15820 id 6 predictCost 30
VLPROFEXEC MTASK_BEGIN 21700 id 10 predictStart 196 cpu 19
VLPROFEXEC MTASK_END 21875 id 10 predictCost 30
VLPROFEXEC EVAL_LOOP_END 22085
VLPROFEXEC EVAL_END 22330
VLPROFTHREAD 1
VLPROFEXEC MTASK_BEGIN 5495 id 5 predictStart 0 cpu 10
VLPROFEXEC MTASK_END 6090 id 5 predictCost 30
VLPROFEXEC MTASK_BEGIN 6300 id 7 predictStart 30 cpu 10
VLPROFEXEC MTASK_END 6895 id 7 predictCost 30
VLPROFEXEC MTASK_BEGIN 7490 id 8 predictStart 60 cpu 10
VLPROFEXEC MTASK_END 8540 id 8 predictCost 107
VLPROFEXEC MTASK_BEGIN 9135 id 9 predictStart 167 cpu 10
VLPROFEXEC MTASK_END 9730 id 9 predictCost 30
VLPROFEXEC MTASK_BEGIN 10255 id 11 predictStart 197 cpu 10
VLPROFEXEC MTASK_END 11060 id 11 predictCost 30
VLPROFEXEC MTASK_BEGIN 18375 id 5 predictStart 0 cpu 10
VLPROFEXEC MTASK_END 18970 id 5 predictCost 30
VLPROFEXEC MTASK_BEGIN 19145 id 7 predictStart 30 cpu 10
VLPROFEXEC MTASK_END 19320 id 7 predictCost 30
VLPROFEXEC MTASK_BEGIN 19670 id 8 predictStart 60 cpu 10
VLPROFEXEC MTASK_END 19810 id 8 predictCost 107
VLPROFEXEC MTASK_BEGIN 20650 id 9 predictStart 167 cpu 10
VLPROFEXEC MTASK_END 20720 id 9 predictCost 30
VLPROFEXEC MTASK_BEGIN 21140 id 11 predictStart 197 cpu 10
VLPROFEXEC MTASK_END 21245 id 11 predictCost 30
VLPROF stat ticks 23415

View File

@ -1,9 +1,8 @@
Verilator Gantt report
Argument settings:
+verilator+prof+threads+start+2
+verilator+prof+threads+window+2
--threads 2
+verilator+prof+exec+start+2
+verilator+prof+exec+window+2
Analysis:
Total threads = 2
@ -23,7 +22,7 @@ Prediction (what Verilator used for scheduling):
All-thread efficiency = 63.2%
All-thread speedup = 1.3
Statistics:
MTask statistics:
min log(p2e) = -3.681 from mtask 5 (predict 30, elapsed 1190)
max log(p2e) = -2.409 from mtask 8 (predict 107, elapsed 1190)
mean = -2.992

View File

@ -118,9 +118,9 @@ compile(
);
execute(
all_run_flags => ["+verilator+prof+threads+start+100",
" +verilator+prof+threads+window+2",
" +verilator+prof+threads+file+$Self->{obj_dir}/profile_threads.dat",
all_run_flags => ["+verilator+prof+exec+start+100",
" +verilator+prof+exec+window+2",
" +verilator+prof+exec+file+$Self->{obj_dir}/profile_exec.dat",
" +verilator+prof+vlt+file+$Self->{obj_dir}/profile.vlt",
],
check_finished => 1,

View File

@ -14,12 +14,12 @@ scenarios(vltmt => 1);
top_filename("t/t_gen_alw.v");
compile(
v_flags2 => ["--prof-threads --threads 2"]
v_flags2 => ["--prof-pgo --threads 2"]
);
execute(
all_run_flags => ["+verilator+prof+threads+start+0",
" +verilator+prof+threads+file+/dev/null",
all_run_flags => ["+verilator+prof+exec+start+0",
" +verilator+prof+exec+file+/dev/null",
" +verilator+prof+vlt+file+$Self->{obj_dir}/profile.vlt",
],
check_finished => 1,
@ -28,8 +28,8 @@ execute(
file_grep("$Self->{obj_dir}/profile.vlt", qr/profile_data/i);
compile(
# Intentinally no --prof-threads here, so we make sure profile data
# can read in without it (that is no prof-thread effect on profile_data hash names)
# Intentionally no --prof-pgo here to make sure profile data can be read in
# without it (that is: --prof-pgo has no effect on profile_data hash names)
v_flags2 => ["--threads 2",
" $Self->{obj_dir}/profile.vlt"],
);

View File

@ -21,13 +21,12 @@ compile(
? "--threads 2 $root/include/verilated_threads.cpp" : ""),
($Self->cfg_with_threaded
? "--trace-threads 1" : ""),
($Self->cfg_with_threaded
? "--prof-threads" : ""),
"--prof-exec", "--prof-pgo",
"$root/include/verilated_save.cpp"],
);
execute(
all_run_flags => [" +verilator+prof+threads+file+/dev/null",
all_run_flags => [" +verilator+prof+exec+file+/dev/null",
" +verilator+prof+vlt+file+/dev/null",
],
check_finished => 1,

View File

@ -16,12 +16,12 @@ my $root = "..";
compile(
# Can't use --coverage and --savable together, so cheat and compile inline
verilator_flags2 => ["--cc --coverage-toggle --coverage-line --coverage-user --trace --vpi $root/include/verilated_save.cpp"],
verilator_flags2 => ["--cc --coverage-toggle --coverage-line --coverage-user --trace --prof-exec --prof-pgo --vpi $root/include/verilated_save.cpp"],
make_flags => 'DRIVER_STD=newest',
);
execute(
all_run_flags => [" +verilator+prof+threads+file+/dev/null",
all_run_flags => [" +verilator+prof+exec+file+/dev/null",
" +verilator+prof+vlt+file+/dev/null",
],
check_finished => 1,