Improve --prof-exec infrastructure and report

Again, --prof-exec has bit-rotted a little with all the recent changes
to the structure of the generated code. This patch contains a few
improvements:
- Replace the eval/eval_loop begin/end events with generic
  section_push/section_pop events that can be arbitrarily sprinkled
  into the generated code (so long as they are correctly matched) to
  measure various sections. The report then contains a nested profile
  of the sections, and the VCD trace shows the section names (see the
  sketch after this list).
- Better handling of exec graphs
- Clearer overall statistics
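
For illustration, a minimal sketch (not part of the diff below) of how
matched pairs of these events nest in the emitted C++. The
VL_EXEC_TRACE_ADD_RECORD calls are the real runtime API used by this
patch; the "eval" and "loop act" section names follow the patterns this
patch emits, and the elided code in between is hypothetical:

    VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPush("eval");      // enter outer section
    // ... emitted model code ...
    VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPush("loop act");  // enter nested section
    // ... evaluation loop body ...
    VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPop();             // leave "loop act"
    // ... more model code ...
    VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPop();             // leave "eval"; every push needs a matching pop

Each push/pop pair becomes one node in the nested section profile, and
the name at the top of the section stack at any point in time is what
appears in the VCD trace.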
Geza Lore 2023-05-10 12:12:34 +01:00
parent 146cdc020d
commit 4c0edd2efb
17 changed files with 662 additions and 561 deletions


@ -1,33 +1,32 @@
#!/usr/bin/env python3
# pylint: disable=C0103,C0114,C0116,C0209,C0301,R0914,R0912,R0915,W0511,eval-used
# pylint: disable=C0103,C0114,C0116,C0209,C0301,R0914,R0912,R0915,W0511,W0603,eval-used
######################################################################
import argparse
import bisect
import collections
import math
import re
import statistics
# from pprint import pprint
Threads = collections.defaultdict(lambda: collections.defaultdict(lambda: {}))
Mtasks = collections.defaultdict(lambda: {})
Evals = collections.defaultdict(lambda: {})
EvalLoops = collections.defaultdict(lambda: {})
Sections = []
LongestVcdStrValueLength = 0
Threads = collections.defaultdict(lambda: []) # List of records per thread id
Mtasks = collections.defaultdict(lambda: {'elapsed': 0, 'end': 0})
Cpus = collections.defaultdict(lambda: {'mtask_time': 0})
Global = {
'args': {},
'cpuinfo': collections.defaultdict(lambda: {}),
'rdtsc_cycle_time': 0,
'stats': {}
}
ElapsedTime = None # total elapsed time
ExecGraphTime = 0  # total elapsed time executing an exec graph
ExecGraphIntervals = [] # list of (start, end) pairs
######################################################################
def process(filename):
read_data(filename)
report()
def read_data(filename):
with open(filename, "r", encoding="utf8") as fh:
re_thread = re.compile(r'^VLPROFTHREAD (\d+)$')
@ -39,14 +38,17 @@ def read_data(filename):
re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.]*)\s*')
re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.]*)\s*$')
re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
re_time = re.compile(r'rdtsc time = (\d+) ticks')
re_proc_cpu = re.compile(r'VLPROFPROC processor\s*:\s*(\d+)\s*$')
re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$')
cpu = None
thread = None
execGraphStart = None
lastEvalBeginTick = None
lastEvalLoopBeginTick = None
global LongestVcdStrValueLength
global ExecGraphTime
SectionStack = []
mTaskThread = {}
for line in fh:
recordMatch = re_record.match(line)
@ -54,29 +56,31 @@ def read_data(filename):
kind, tick, payload = recordMatch.groups()
tick = int(tick)
payload = payload.strip()
if kind == "EVAL_BEGIN":
Evals[tick]['start'] = tick
lastEvalBeginTick = tick
elif kind == "EVAL_END":
Evals[lastEvalBeginTick]['end'] = tick
lastEvalBeginTick = None
elif kind == "EVAL_LOOP_BEGIN":
EvalLoops[tick]['start'] = tick
lastEvalLoopBeginTick = tick
elif kind == "EVAL_LOOP_END":
EvalLoops[lastEvalLoopBeginTick]['end'] = tick
lastEvalLoopBeginTick = None
if kind == "SECTION_PUSH":
LongestVcdStrValueLength = max(LongestVcdStrValueLength,
len(payload))
SectionStack.append(payload)
Sections.append((tick, tuple(SectionStack)))
elif kind == "SECTION_POP":
assert SectionStack, "SECTION_POP without SECTION_PUSH"
SectionStack.pop()
Sections.append((tick, tuple(SectionStack)))
elif kind == "MTASK_BEGIN":
mtask, predict_start, ecpu = re_payload_mtaskBegin.match(
payload).groups()
mtask = int(mtask)
predict_start = int(predict_start)
ecpu = int(ecpu)
Threads[thread][tick]['mtask'] = mtask
Threads[thread][tick]['predict_start'] = predict_start
Threads[thread][tick]['cpu'] = ecpu
if 'elapsed' not in Mtasks[mtask]:
Mtasks[mtask] = {'end': 0, 'elapsed': 0}
mTaskThread[mtask] = thread
records = Threads[thread]
assert not records or records[-1]['start'] <= records[-1][
'end'] <= tick
records.append({
'start': tick,
'mtask': mtask,
'predict_start': predict_start,
'cpu': ecpu
})
Mtasks[mtask]['begin'] = tick
Mtasks[mtask]['thread'] = thread
Mtasks[mtask]['predict_start'] = predict_start
@ -86,11 +90,18 @@ def read_data(filename):
mtask = int(mtask)
predict_cost = int(predict_cost)
begin = Mtasks[mtask]['begin']
Threads[thread][begin]['end'] = tick
Threads[thread][begin]['predict_cost'] = predict_cost
record = Threads[mTaskThread[mtask]][-1]
record['end'] = tick
record['predict_cost'] = predict_cost
Mtasks[mtask]['elapsed'] += tick - begin
Mtasks[mtask]['predict_cost'] = predict_cost
Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], tick)
elif kind == "EXEC_GRAPH_BEGIN":
execGraphStart = tick
elif kind == "EXEC_GRAPH_END":
ExecGraphTime += tick - execGraphStart
ExecGraphIntervals.append((execGraphStart, tick))
execGraphStart = None
elif Args.debug:
print("-Unknown execution trace record: %s" % line)
elif re_thread.match(line):
@ -109,7 +120,7 @@ def read_data(filename):
elif re_proc_cpu.match(line):
match = re_proc_cpu.match(line)
cpu = int(match.group(1))
elif cpu and re_proc_dat.match(line):
elif cpu is not None and re_proc_dat.match(line):
match = re_proc_dat.match(line)
term = match.group(1)
value = match.group(2)
@ -121,11 +132,6 @@ def read_data(filename):
pass
elif Args.debug:
print("-Unk: %s" % line)
# TODO -- this is parsing text printed by a client.
# Really, verilator proper should generate this
# if it's useful...
if re_time.match(line):
Global['rdtsc_cycle_time'] = re_time.group(1)
def re_match_result(regexp, line, result_to):
@ -144,125 +150,33 @@ def report():
plus = "+" if re.match(r'^\+', arg) else " "
print(" %s%s%s" % (arg, plus, Global['args'][arg]))
for records in Threads.values():
for record in records:
cpu = record['cpu']
elapsed = record['end'] - record['start']
Cpus[cpu]['mtask_time'] += elapsed
global ElapsedTime
ElapsedTime = int(Global['stats']['ticks'])
nthreads = int(Global['stats']['threads'])
Global['cpus'] = {}
for thread in Threads:
# Make potentially multiple characters per column
for start in Threads[thread]:
if not Threads[thread][start]:
continue
cpu = Threads[thread][start]['cpu']
elapsed = Threads[thread][start]['end'] - start
if cpu not in Global['cpus']:
Global['cpus'][cpu] = {'cpu_time': 0}
Global['cpus'][cpu]['cpu_time'] += elapsed
ncpus = max(len(Cpus), 1)
measured_mt_mtask_time = 0
predict_mt_mtask_time = 0
long_mtask_time = 0
measured_last_end = 0
predict_last_end = 0
for mtask in Mtasks:
measured_mt_mtask_time += Mtasks[mtask]['elapsed']
predict_mt_mtask_time += Mtasks[mtask]['predict_cost']
measured_last_end = max(measured_last_end, Mtasks[mtask]['end'])
predict_last_end = max(
predict_last_end,
Mtasks[mtask]['predict_start'] + Mtasks[mtask]['predict_cost'])
long_mtask_time = max(long_mtask_time, Mtasks[mtask]['elapsed'])
Global['measured_last_end'] = measured_last_end
Global['predict_last_end'] = predict_last_end
# If we know cycle time in the same (rdtsc) units,
# this will give us an actual utilization number,
# (how effectively we keep the cores busy.)
#
# It also gives us a number we can compare against
# serial mode, to estimate the overhead of data sharing,
# which will show up in the total elapsed time. (Overhead
# of synchronization and scheduling should not.)
print("\nAnalysis:")
print(" Total threads = %d" % nthreads)
print(" Total mtasks = %d" % len(Mtasks))
ncpus = max(len(Global['cpus']), 1)
print(" Total cpus used = %d" % ncpus)
print(" Total yields = %d" %
int(Global['stats'].get('yields', 0)))
print(" Total evals = %d" % len(Evals))
print(" Total eval loops = %d" % len(EvalLoops))
if Mtasks:
print(" Total eval time = %d rdtsc ticks" %
Global['measured_last_end'])
print(" Longest mtask time = %d rdtsc ticks" % long_mtask_time)
print(" All-thread mtask time = %d rdtsc ticks" %
measured_mt_mtask_time)
long_efficiency = long_mtask_time / (Global.get(
'measured_last_end', 1) or 1)
print(" Longest-thread efficiency = %0.1f%%" %
(long_efficiency * 100.0))
mt_efficiency = measured_mt_mtask_time / (
Global.get('measured_last_end', 1) * nthreads or 1)
print(" All-thread efficiency = %0.1f%%" %
(mt_efficiency * 100.0))
print(" All-thread speedup = %0.1f" %
(mt_efficiency * nthreads))
if Global['rdtsc_cycle_time'] > 0:
ut = measured_mt_mtask_time / Global['rdtsc_cycle_time']
print("tot_mtask_cpu=" + measured_mt_mtask_time + " cyc=" +
Global['rdtsc_cycle_time'] + " ut=" + ut)
predict_mt_efficiency = predict_mt_mtask_time / (
Global.get('predict_last_end', 1) * nthreads or 1)
print("\nPrediction (what Verilator used for scheduling):")
print(" All-thread efficiency = %0.1f%%" %
(predict_mt_efficiency * 100.0))
print(" All-thread speedup = %0.1f" %
(predict_mt_efficiency * nthreads))
p2e_ratios = []
min_p2e = 1000000
min_mtask = None
max_p2e = -1000000
max_mtask = None
for mtask in sorted(Mtasks.keys()):
if Mtasks[mtask]['elapsed'] > 0:
if Mtasks[mtask]['predict_cost'] == 0:
Mtasks[mtask]['predict_cost'] = 1 # don't log(0) below
p2e_ratio = math.log(Mtasks[mtask]['predict_cost'] /
Mtasks[mtask]['elapsed'])
p2e_ratios.append(p2e_ratio)
if p2e_ratio > max_p2e:
max_p2e = p2e_ratio
max_mtask = mtask
if p2e_ratio < min_p2e:
min_p2e = p2e_ratio
min_mtask = mtask
print("\nMTask statistics:")
print(" min log(p2e) = %0.3f" % min_p2e, end="")
print(" from mtask %d (predict %d," %
(min_mtask, Mtasks[min_mtask]['predict_cost']),
end="")
print(" elapsed %d)" % Mtasks[min_mtask]['elapsed'])
print(" max log(p2e) = %0.3f" % max_p2e, end="")
print(" from mtask %d (predict %d," %
(max_mtask, Mtasks[max_mtask]['predict_cost']),
end="")
print(" elapsed %d)" % Mtasks[max_mtask]['elapsed'])
stddev = statistics.pstdev(p2e_ratios)
mean = statistics.mean(p2e_ratios)
print(" mean = %0.3f" % mean)
print(" stddev = %0.3f" % stddev)
print(" e ^ stddev = %0.3f" % math.exp(stddev))
print("\nSummary:")
print(" Total elapsed time = {} rdtsc ticks".format(ElapsedTime))
print(" Parallelized code = {:.2%} of elapsed time".format(
ExecGraphTime / ElapsedTime))
print(" Total threads = %d" % nthreads)
print(" Total CPUs used = %d" % ncpus)
print(" Total mtasks = %d" % len(Mtasks))
print(" Total yields = %d" % int(Global['stats'].get('yields', 0)))
report_mtasks()
report_cpus()
report_sections()
if nthreads > ncpus:
print()
print("%%Warning: There were fewer CPUs (%d) then threads (%d)." %
print("%%Warning: There were fewer CPUs (%d) than threads (%d)." %
(ncpus, nthreads))
print(" : See docs on use of numactl.")
else:
@ -279,32 +193,130 @@ def report():
print()
def report_mtasks():
if not Mtasks:
return
nthreads = int(Global['stats']['threads'])
# If we know cycle time in the same (rdtsc) units,
# this will give us an actual utilization number,
# (how effectively we keep the cores busy.)
#
# It also gives us a number we can compare against
# serial mode, to estimate the overhead of data sharing,
# which will show up in the total elapsed time. (Overhead
# of synchronization and scheduling should not.)
total_mtask_time = 0
thread_mtask_time = collections.defaultdict(lambda: 0)
long_mtask_time = 0
long_mtask = None
predict_mtask_time = 0
predict_elapsed = 0
for mtaskId in Mtasks:
record = Mtasks[mtaskId]
predict_mtask_time += record['predict_cost']
total_mtask_time += record['elapsed']
thread_mtask_time[record['thread']] += record['elapsed']
predict_end = record['predict_start'] + record['predict_cost']
predict_elapsed = max(predict_elapsed, predict_end)
if record['elapsed'] > long_mtask_time:
long_mtask_time = record['elapsed']
long_mtask = mtaskId
Global['predict_last_end'] = predict_elapsed
serialTime = ElapsedTime - ExecGraphTime
def subReport(elapsed, work):
print(" Thread utilization = {:7.2%}".format(work /
(elapsed * nthreads)))
print(" Speedup = {:6.3}x".format(work / elapsed))
print("\nParallelized code, measured:")
subReport(ExecGraphTime, total_mtask_time)
print("\nParallelized code, predicted during static scheduling:")
subReport(predict_elapsed, predict_mtask_time)
print("\nAll code, measured:")
subReport(ElapsedTime, serialTime + total_mtask_time)
print("\nAll code, measured, scaled by predicted speedup:")
expectedParallelSpeedup = predict_mtask_time / predict_elapsed
scaledElapsed = serialTime + total_mtask_time / expectedParallelSpeedup
subReport(scaledElapsed, serialTime + total_mtask_time)
p2e_ratios = []
min_p2e = 1000000
min_mtask = None
max_p2e = -1000000
max_mtask = None
for mtask in sorted(Mtasks.keys()):
if Mtasks[mtask]['elapsed'] > 0:
if Mtasks[mtask]['predict_cost'] == 0:
Mtasks[mtask]['predict_cost'] = 1 # don't log(0) below
p2e_ratio = math.log(Mtasks[mtask]['predict_cost'] /
Mtasks[mtask]['elapsed'])
p2e_ratios.append(p2e_ratio)
if p2e_ratio > max_p2e:
max_p2e = p2e_ratio
max_mtask = mtask
if p2e_ratio < min_p2e:
min_p2e = p2e_ratio
min_mtask = mtask
print("\nMTask statistics:")
print(" Longest mtask id = {}".format(long_mtask))
print(" Longest mtask time = {:.2%} of time elapsed in parallelized code".
format(long_mtask_time / ExecGraphTime))
print(" min log(p2e) = %0.3f" % min_p2e, end="")
print(" from mtask %d (predict %d," %
(min_mtask, Mtasks[min_mtask]['predict_cost']),
end="")
print(" elapsed %d)" % Mtasks[min_mtask]['elapsed'])
print(" max log(p2e) = %0.3f" % max_p2e, end="")
print(" from mtask %d (predict %d," %
(max_mtask, Mtasks[max_mtask]['predict_cost']),
end="")
print(" elapsed %d)" % Mtasks[max_mtask]['elapsed'])
stddev = statistics.pstdev(p2e_ratios)
mean = statistics.mean(p2e_ratios)
print(" mean = %0.3f" % mean)
print(" stddev = %0.3f" % stddev)
print(" e ^ stddev = %0.3f" % math.exp(stddev))
def report_cpus():
print("\nCPUs:")
print("\nCPU info:")
Global['cpu_sockets'] = collections.defaultdict(lambda: 0)
Global['cpu_socket_cores'] = collections.defaultdict(lambda: 0)
for cpu in sorted(Global['cpus'].keys()):
print(" cpu %d: " % cpu, end='')
print("cpu_time=%d" % Global['cpus'][cpu]['cpu_time'], end='')
socket = None
print(" Id | Time spent executing MTask | Socket | Core | Model")
print(" | % of elapsed ticks / ticks | | |")
print(" ====|============================|========|======|======")
for cpu in sorted(Cpus):
socket = ""
core = ""
model = ""
if cpu in Global['cpuinfo']:
cpuinfo = Global['cpuinfo'][cpu]
if 'physical_id' in cpuinfo and 'core_id' in cpuinfo:
socket = int(cpuinfo['physical_id'])
socket = cpuinfo['physical_id']
Global['cpu_sockets'][socket] += 1
print(" socket=%d" % socket, end='')
core = int(cpuinfo['core_id'])
Global['cpu_socket_cores'][str(socket) + "__" + str(core)] += 1
print(" core=%d" % core, end='')
core = cpuinfo['core_id']
Global['cpu_socket_cores'][socket + "__" + core] += 1
if 'model_name' in cpuinfo:
model = cpuinfo['model_name']
print(" %s" % model, end='')
print()
print(" {:3d} | {:7.2%} / {:16d} | {:>6s} | {:>4s} | {}".format(
cpu, Cpus[cpu]['mtask_time'] / ElapsedTime,
Cpus[cpu]['mtask_time'], socket, core, model))
if len(Global['cpu_sockets']) > 1:
Global['cpu_sockets_warning'] = True
@ -313,25 +325,68 @@ def report_cpus():
Global['cpu_socket_cores_warning'] = True
def report_sections():
if not Sections:
return
print("\nSection profile:")
totalTime = collections.defaultdict(lambda: 0)
selfTime = collections.defaultdict(lambda: 0)
sectionTree = [0, {}, 1] # [selfTime, childTrees, numberOfTimesEntered]
prevTime = 0
prevStack = ()
for time, stack in Sections:
if len(stack) > len(prevStack):
scope = sectionTree
for item in stack:
scope = scope[1].setdefault(item, [0, {}, 0])
scope[2] += 1
dt = time - prevTime
scope = sectionTree
for item in prevStack:
scope = scope[1].setdefault(item, [0, {}, 0])
scope[0] += dt
if prevStack:
for name in prevStack:
totalTime[name] += dt
selfTime[prevStack[-1]] += dt
prevTime = time
prevStack = stack
def treeSum(tree):
n = tree[0]
for subTree in tree[1].values():
n += treeSum(subTree)
return n
# Make sure the tree sums to the elapsed time
sectionTree[0] += ElapsedTime - treeSum(sectionTree)
def printTree(prefix, name, entries, tree):
print(" {:7.2%} | {:7.2%} | {:8} | {:10.2f} | {}".format(
treeSum(tree) / ElapsedTime, tree[0] / ElapsedTime, tree[2],
tree[2] / entries, prefix + name))
for k in sorted(tree[1], key=lambda _: -treeSum(tree[1][_])):
printTree(prefix + " ", k, tree[2], tree[1][k])
print(" Total | Self | Total | Relative | Section")
print(" time | time | entries | entries | name ")
print("==========|=========|==========|============|========")
printTree("", "*TOTAL*", 1, sectionTree)
######################################################################
def write_vcd(filename):
print("Writing %s" % filename)
with open(filename, "w", encoding="utf8") as fh:
vcd = {
'values':
collections.defaultdict(lambda: {}), # {<time>}{<code>} = value
'sigs': {
'predicted_threads': {},
'measured_threads': {},
'cpus': {},
'evals': {},
'mtasks': {},
'Stats': {}
} # {<module>}{<sig}} = code
}
code = 0
# dict of dicts of hierarchy elements/signal name -> (code, width)
scopeSigs = {}
# map from time -> code -> value
values = collections.defaultdict(lambda: {})
parallelism = {
'measured': collections.defaultdict(lambda: 0),
@ -340,132 +395,139 @@ def write_vcd(filename):
parallelism['measured'][0] = 0
parallelism['predicted'][0] = 0
# Measured graph
for thread in sorted(Threads.keys()):
sig = "thread%d_mtask" % thread
if sig not in vcd['sigs']['measured_threads']:
vcd['sigs']['measured_threads'][sig] = code
code += 1
mcode = vcd['sigs']['measured_threads'][sig]
nextFreeCode = 0
for start in sorted(Threads[thread]):
mtask = Threads[thread][start]['mtask']
end = Threads[thread][start]['end']
cpu = Threads[thread][start]['cpu']
vcd['values'][start][mcode] = mtask
vcd['values'][end][mcode] = None
def getCode(width, *names):
nonlocal nextFreeCode
scope = scopeSigs
*path, name = names
for item in path:
scope = scope.setdefault(item, {})
code, oldWidth = scope.setdefault(name, (nextFreeCode, width))
assert oldWidth == width
if code == nextFreeCode:
nextFreeCode += 1
return code
def addValue(code, time, val):
if isinstance(val, str):
buf = ""
for c in val:
buf += bin(ord(c))[2:].rjust(8, "0")
val = buf.ljust(LongestVcdStrValueLength * 8, "0")
values[time][code] = val
# Measured graph
for thread in sorted(Threads):
mcode = getCode(32, 'measured', 't%d_mtask' % thread)
for record in Threads[thread]:
start = record['start']
mtask = record['mtask']
end = record['end']
cpu = record['cpu']
addValue(mcode, start, mtask)
addValue(mcode, end, None)
parallelism['measured'][start] += 1
parallelism['measured'][end] -= 1
sig = "cpu%d_thread" % cpu
if sig not in vcd['sigs']['cpus']:
vcd['sigs']['cpus'][sig] = code
code += 1
ccode = vcd['sigs']['cpus'][sig]
vcd['values'][start][ccode] = thread
vcd['values'][end][ccode] = None
code = getCode(32, 'measured', 'cpu%d_thread' % cpu)
addValue(code, start, thread)
addValue(code, end, None)
sig = "mtask%d_cpu" % mtask
if sig not in vcd['sigs']['mtasks']:
vcd['sigs']['mtasks'][sig] = code
code += 1
ccode = vcd['sigs']['mtasks'][sig]
vcd['values'][start][ccode] = cpu
vcd['values'][end][ccode] = None
code = getCode(32, 'measured', 'cpu%d_mtask' % cpu)
addValue(code, start, mtask)
addValue(code, end, None)
# Eval graph
vcd['sigs']['evals']["eval"] = code
elcode = code
code += 1
n = 0
for eval_start in Evals:
eval_end = Evals[eval_start]['end']
n += 1
vcd['values'][eval_start][elcode] = n
vcd['values'][eval_end][elcode] = None
tStart = sorted(_['start'] for records in Threads.values()
for _ in records)
tEnd = sorted(_['end'] for records in Threads.values()
for _ in records)
# Eval_loop graph
vcd['sigs']['evals']["eval_loop"] = code
elcode = code
code += 1
n = 0
for eval_start in EvalLoops:
eval_end = EvalLoops[eval_start]['end']
n += 1
vcd['values'][eval_start][elcode] = n
vcd['values'][eval_end][elcode] = None
# Predicted graph
for start, end in ExecGraphIntervals:
# Find the earliest MTask start after the start point, and the
# latest MTask end before the end point, so we can scale to the
# same range
start = tStart[bisect.bisect_left(tStart, start)]
end = tEnd[bisect.bisect_right(tEnd, end) - 1]
# Compute scale so predicted graph is of same width as interval
measured_scaling = (end - start) / Global['predict_last_end']
# Predict mtasks that fill the time the execution occupied
for mtask in Mtasks:
thread = Mtasks[mtask]['thread']
pred_scaled_start = start + int(
Mtasks[mtask]['predict_start'] * measured_scaling)
pred_scaled_end = start + int(
(Mtasks[mtask]['predict_start'] +
Mtasks[mtask]['predict_cost']) * measured_scaling)
if pred_scaled_start == pred_scaled_end:
continue
if Mtasks:
# Predicted graph
for eval_start in EvalLoops:
eval_end = EvalLoops[eval_start]['end']
# Compute scale so predicted graph is of same width as eval
measured_scaling = (eval_end -
eval_start) / Global['predict_last_end']
# Predict mtasks that fill the time the eval occupied
for mtask in Mtasks:
thread = Mtasks[mtask]['thread']
pred_scaled_start = eval_start + int(
Mtasks[mtask]['predict_start'] * measured_scaling)
pred_scaled_end = eval_start + int(
(Mtasks[mtask]['predict_start'] +
Mtasks[mtask]['predict_cost']) * measured_scaling)
if pred_scaled_start == pred_scaled_end:
continue
mcode = getCode(32, 'predicted', 't%d_mtask' % thread)
addValue(mcode, pred_scaled_start, mtask)
addValue(mcode, pred_scaled_end, None)
sig = "predicted_thread%d_mtask" % thread
if sig not in vcd['sigs']['predicted_threads']:
vcd['sigs']['predicted_threads'][sig] = code
code += 1
mcode = vcd['sigs']['predicted_threads'][sig]
parallelism['predicted'][pred_scaled_start] += 1
parallelism['predicted'][pred_scaled_end] -= 1
vcd['values'][pred_scaled_start][mcode] = mtask
vcd['values'][pred_scaled_end][mcode] = None
# Parallelism graph
for measpred in ('measured', 'predicted'):
pcode = getCode(32, 'stats', '%s_parallelism' % measpred)
value = 0
for time in sorted(parallelism[measpred].keys()):
value += parallelism[measpred][time]
addValue(pcode, time, value)
parallelism['predicted'][pred_scaled_start] += 1
parallelism['predicted'][pred_scaled_end] -= 1
# Parallelism graph
for measpred in ('measured', 'predicted'):
vcd['sigs']['Stats']["%s_parallelism" % measpred] = code
pcode = code
code += 1
value = 0
for time in sorted(parallelism[measpred].keys()):
value += parallelism[measpred][time]
vcd['values'][time][pcode] = value
# Section graph
if Sections:
scode = getCode(LongestVcdStrValueLength * 8, "section", "trace")
dcode = getCode(32, "section", "depth")
for time, stack in Sections:
addValue(scode, time, stack[-1] if stack else None)
addValue(dcode, time, len(stack))
# Create output file
fh.write("$version Generated by verilator_gantt $end\n")
fh.write("$timescale 1ns $end\n")
fh.write("\n")
all_codes = {}
all_codes = set()
def writeScope(scope):
assert isinstance(scope, dict)
for key in sorted(scope):
val = scope[key]
if isinstance(val, dict):
fh.write(" $scope module %s $end\n" % key)
writeScope(val)
fh.write(" $upscope $end\n")
else:
(code, width) = val
all_codes.add(code)
fh.write(" $var wire %d v%x %s [%d:0] $end\n" %
(width, code, key, width - 1))
fh.write(" $scope module gantt $end\n")
for module in sorted(vcd['sigs'].keys()):
fh.write(" $scope module %s $end\n" % module)
for sig in sorted(vcd['sigs'][module].keys()):
code = vcd['sigs'][module][sig]
fh.write(" $var wire 32 v%x %s [31:0] $end\n" % (code, sig))
all_codes[code] = 1
fh.write(" $upscope $end\n")
writeScope(scopeSigs)
fh.write(" $upscope $end\n")
fh.write("$enddefinitions $end\n")
fh.write("\n")
first = True
for time in sorted(vcd['values']):
for time in sorted(values):
if first:
first = False
# Start with Z for any signals without time zero data
for code in sorted(all_codes.keys()):
if code not in vcd['values'][time]:
vcd['values'][time][code] = None
for code in sorted(all_codes):
if code not in values[time]:
addValue(code, time, None)
fh.write("#%d\n" % time)
for code in sorted(vcd['values'][time].keys()):
value = vcd['values'][time][code]
for code in sorted(values[time]):
value = values[time][code]
if value is None:
fh.write("bz v%x\n" % code)
elif isinstance(value, str):
fh.write("b%s v%x\n" % (value, code))
else:
fh.write("b%s v%x\n" % (format(value, 'b'), code))
@ -504,7 +566,8 @@ parser.add_argument('filename',
Args = parser.parse_args()
process(Args.filename)
read_data(Args.filename)
report()
if not Args.no_vcd:
write_vcd(Args.vcd)


@ -30,15 +30,13 @@ Gantt Chart VCD Signals
In waveforms, there are the following signals. In GTKWave, use "decimal"
data format to remove the leading zeros and make the traces easier to read.
evals
Increments each time when eval_step was measured to be active. This
allow visualization of how much time eval_step was active.
trace/section
Shows the name of the current top of the execution section stack.
Set GTKWave data format to "ASCII".
eval_loop
Increments each time when the evaluation loop within eval_step was
measured to be active. For best performance, there is only a single
evaluation loop within each eval_step call; that is, the eval_loop
waveform looks identical to the evals waveform.
trace/depth
Shows the depth of the execution section stack.
Set GTKWave data format to "Analog".
measured_parallelism
The number of mtasks active at this time, for best performance, this will


@ -161,12 +161,18 @@ void VlExecutionProfiler::dump(const char* filenamep, uint64_t tickEnd)
// TODO Perhaps merge with verilated_coverage output format, so can
// have a common merging and reporting tool, etc.
fprintf(fp, "VLPROFVERSION 2.0 # Verilator execution profile version 2.0\n");
fprintf(fp, "VLPROFVERSION 2.1 # Verilator execution profile version 2.1\n");
fprintf(fp, "VLPROF arg +verilator+prof+exec+start+%" PRIu64 "\n",
Verilated::threadContextp()->profExecStart());
fprintf(fp, "VLPROF arg +verilator+prof+exec+window+%u\n",
Verilated::threadContextp()->profExecWindow());
const unsigned threads = static_cast<unsigned>(m_traceps.size());
// Note that VerilatedContext will by default create as many threads as there are hardware
// processors, but not all of them might be utilized. Report the actual number that has trace
// entries to avoid over-counting.
unsigned threads = 0;
for (const auto& pair : m_traceps) {
if (!pair.second->empty()) ++threads;
}
fprintf(fp, "VLPROF stat threads %u\n", threads);
fprintf(fp, "VLPROF stat yields %" PRIu64 "\n", VlMTaskVertex::yields());
@ -183,6 +189,7 @@ void VlExecutionProfiler::dump(const char* filenamep, uint64_t tickEnd)
for (const auto& pair : m_traceps) {
const uint32_t threadId = pair.first;
ExecutionTrace* const tracep = pair.second;
if (tracep->empty()) continue;
fprintf(fp, "VLPROFTHREAD %" PRIu32 "\n", threadId);
for (const VlExecutionRecord& er : *tracep) {
@ -191,10 +198,9 @@ void VlExecutionProfiler::dump(const char* filenamep, uint64_t tickEnd)
fprintf(fp, "VLPROFEXEC %s %" PRIu64, name, time);
switch (er.m_type) {
case VlExecutionRecord::Type::EVAL_BEGIN:
case VlExecutionRecord::Type::EVAL_END:
case VlExecutionRecord::Type::EVAL_LOOP_BEGIN:
case VlExecutionRecord::Type::EVAL_LOOP_END:
case VlExecutionRecord::Type::SECTION_POP:
case VlExecutionRecord::Type::EXEC_GRAPH_BEGIN:
case VlExecutionRecord::Type::EXEC_GRAPH_END:
// No payload
fprintf(fp, "\n");
break;
@ -209,6 +215,11 @@ void VlExecutionProfiler::dump(const char* filenamep, uint64_t tickEnd)
fprintf(fp, " id %u predictCost %u\n", payload.m_id, payload.m_predictCost);
break;
}
case VlExecutionRecord::Type::SECTION_PUSH: {
const auto& payload = er.m_payload.sectionPush;
fprintf(fp, " %s\n", payload.m_name);
break;
}
default: abort(); // LCOV_EXCL_LINE
}
}


@ -58,12 +58,12 @@ VL_ATTR_ALWINLINE QData VL_CPU_TICK() {
// clang-format off
#define FOREACH_VlExecutionRecord_TYPE(macro) \
_VL_FOREACH_APPLY(macro, EVAL_BEGIN) \
_VL_FOREACH_APPLY(macro, EVAL_END) \
_VL_FOREACH_APPLY(macro, EVAL_LOOP_BEGIN) \
_VL_FOREACH_APPLY(macro, EVAL_LOOP_END) \
_VL_FOREACH_APPLY(macro, SECTION_PUSH) \
_VL_FOREACH_APPLY(macro, SECTION_POP) \
_VL_FOREACH_APPLY(macro, MTASK_BEGIN) \
_VL_FOREACH_APPLY(macro, MTASK_END)
_VL_FOREACH_APPLY(macro, MTASK_END) \
_VL_FOREACH_APPLY(macro, EXEC_GRAPH_BEGIN) \
_VL_FOREACH_APPLY(macro, EXEC_GRAPH_END)
// clang-format on
class VlExecutionRecord final {
@ -83,6 +83,9 @@ class VlExecutionRecord final {
};
union Payload {
struct {
const char* m_name; // Name of section being entered
} sectionPush;
struct {
uint32_t m_id; // MTask id
uint32_t m_predictStart; // Time scheduler predicted would start
@ -109,10 +112,11 @@ public:
VlExecutionRecord() = default;
// METHODS
void evalBegin() { m_type = Type::EVAL_BEGIN; }
void evalEnd() { m_type = Type::EVAL_END; }
void evalLoopBegin() { m_type = Type::EVAL_LOOP_BEGIN; }
void evalLoopEnd() { m_type = Type::EVAL_LOOP_END; }
void sectionPush(const char* name) {
m_payload.sectionPush.m_name = name;
m_type = Type::SECTION_PUSH;
}
void sectionPop() { m_type = Type::SECTION_POP; }
void mtaskBegin(uint32_t id, uint32_t predictStart) {
m_payload.mtaskBegin.m_id = id;
m_payload.mtaskBegin.m_predictStart = predictStart;
@ -124,6 +128,8 @@ public:
m_payload.mtaskEnd.m_predictCost = predictCost;
m_type = Type::MTASK_END;
}
void execGraphBegin() { m_type = Type::EXEC_GRAPH_BEGIN; }
void execGraphEnd() { m_type = Type::EXEC_GRAPH_END; }
};
static_assert(std::is_trivially_destructible<VlExecutionRecord>::value,


@ -403,7 +403,7 @@ class EmitCModel final : public EmitCFunc {
if (v3Global.opt.profExec()) {
puts("vlSymsp->__Vm_executionProfilerp->configure();\n");
puts("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalBegin();\n");
puts("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPush(\"eval\");\n");
}
puts("VL_DEBUG_IF(VL_DBG_MSGF(\"+ Eval\\n\"););\n");
@ -412,7 +412,7 @@ class EmitCModel final : public EmitCFunc {
putsDecoration("// Evaluate cleanup\n");
puts("Verilated::endOfEval(vlSymsp->__Vm_evalMsgQp);\n");
if (v3Global.opt.profExec()) puts("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalEnd();\n");
if (v3Global.opt.profExec()) puts("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPop();\n");
puts("}\n");
}


@ -1473,7 +1473,8 @@ AstCFunc* order(AstNetlist* netlistp, //
// Create the result function
AstScope* const scopeTopp = netlistp->topScopep()->scopep();
AstCFunc* const funcp = new AstCFunc{netlistp->fileline(), "_eval_" + tag, scopeTopp, ""};
FileLine* const flp = netlistp->fileline();
AstCFunc* const funcp = new AstCFunc{flp, "_eval_" + tag, scopeTopp, ""};
funcp->dontCombine(true);
funcp->isStatic(false);
funcp->isLoose(true);
@ -1482,9 +1483,18 @@ AstCFunc* order(AstNetlist* netlistp, //
funcp->declPrivate(true);
scopeTopp->addBlocksp(funcp);
if (v3Global.opt.profExec()) {
funcp->addStmtsp(new AstCStmt{flp, "VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPush(\"func "
+ tag + "\");\n"});
}
// Add ordered statements to the result function
for (AstNode* const nodep : nodeps) funcp->addStmtsp(nodep);
if (v3Global.opt.profExec()) {
funcp->addStmtsp(new AstCStmt{flp, "VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPop();\n"});
}
// Dispose of the remnants of the inputs
for (auto* const lbsp : logic) lbsp->deleteActives();


@ -3183,6 +3183,10 @@ static void addThreadStartToExecGraph(AstExecGraph* const execGraphp,
execGraphp->addStmtsp(new AstText{fl, text, /* tracking: */ true});
};
if (v3Global.opt.profExec()) {
addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).execGraphBegin();\n");
}
addStrStmt("vlSymsp->__Vm_even_cycle__" + tag + " = !vlSymsp->__Vm_even_cycle__" + tag
+ ";\n");
@ -3206,6 +3210,10 @@ static void addThreadStartToExecGraph(AstExecGraph* const execGraphp,
addStrStmt("vlSelf->__Vm_mtaskstate_final__" + tag
+ ".waitUntilUpstreamDone(vlSymsp->__Vm_even_cycle__" + tag + ");\n");
if (v3Global.opt.profExec()) {
addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).execGraphEnd();\n");
}
}
static void implementExecGraph(AstExecGraph* const execGraphp) {


@ -399,6 +399,17 @@ AstSenTree* createTriggerSenTree(AstNetlist* netlistp, AstVarScope* const vscp,
return resultp;
}
//============================================================================
// Utility for creating profiling statements
AstNodeStmt* profExecSectionPush(FileLine* flp, const string& name) {
return new AstCStmt{flp, "VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPush(\"" + name + "\");\n"};
};
AstNodeStmt* profExecSectionPop(FileLine* flp) {
return new AstCStmt{flp, "VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPop();\n"};
};
//============================================================================
// Utility for extra trigger allocation
@ -442,6 +453,7 @@ const TriggerKit createTriggers(AstNetlist* netlistp, AstCFunc* const initFuncp,
// Create the trigger computation function
AstCFunc* const funcp = makeSubFunction(netlistp, "_eval_triggers__" + name, slow);
if (v3Global.opt.profExec()) funcp->addStmtsp(profExecSectionPush(flp, "trig " + name));
// Create the trigger dump function (for debugging, always 'slow')
AstCFunc* const dumpp = makeSubFunction(netlistp, "_dump_triggers__" + name, true);
@ -581,6 +593,8 @@ const TriggerKit createTriggers(AstNetlist* netlistp, AstCFunc* const initFuncp,
add("#endif\n");
}
if (v3Global.opt.profExec()) funcp->addStmtsp(profExecSectionPop(flp));
// The debug code might leak signal names, so simply delete it when using --protect-ids
if (v3Global.opt.protectIds()) dumpp->stmtsp()->unlinkFrBackWithNext()->deleteTree();
@ -636,7 +650,9 @@ EvalLoop makeEvalLoop(AstNetlist* netlistp, const string& tag, const string& nam
AstVarScope* const continuep = scopeTopp->createTemp("__V" + tag + "Continue", 1);
continuep->varp()->noReset(true);
AstNodeStmt* nodep = setVar(counterp, 0);
AstNodeStmt* nodep = nullptr;
if (v3Global.opt.profExec()) nodep = profExecSectionPush(flp, "loop " + tag);
nodep = AstNode::addNext(nodep, setVar(counterp, 0));
nodep->addNext(buildLoop(netlistp, continuep, [&](AstWhile* loopp) {
// Compute triggers
loopp->addStmtsp(computeTriggers());
@ -689,6 +705,8 @@ EvalLoop makeEvalLoop(AstNetlist* netlistp, const string& tag, const string& nam
}
}));
if (v3Global.opt.profExec()) nodep->addNext(profExecSectionPop(flp));
return {counterp, continuep, nodep};
}


@ -49,7 +49,7 @@ if ($Self->{vltmt}) {
file_grep("$Self->{obj_dir}/gantt.log", qr/Total threads += 1/i);
file_grep("$Self->{obj_dir}/gantt.log", qr/Total mtasks += 0/i);
}
file_grep("$Self->{obj_dir}/gantt.log", qr/Total evals += 2/i);
file_grep("$Self->{obj_dir}/gantt.log", qr/\|\s+2\s+\|\s+2\.0+\s+\|\s+eval/i);
# Diff to itself, just to check parsing
vcd_identical("$Self->{obj_dir}/profile_exec.vcd", "$Self->{obj_dir}/profile_exec.vcd");


@ -900,22 +900,18 @@ VLPROFPROC address sizes : 43 bits physical, 48 bits virtual
VLPROFPROC power management: ts ttp tm hwpstate cpb eff_freq_ro [13] [14]
VLPROFPROC
VLPROFTHREAD 0
VLPROFEXEC EVAL_BEGIN 595
VLPROFEXEC EVAL_LOOP_BEGIN 945
VLPROFEXEC EXEC_GRAPH_BEGIN 945
VLPROFEXEC MTASK_BEGIN 2695 id 6 predictStart 0 cpu 19
VLPROFEXEC MTASK_END 2905 id 6 predictCost 30
VLPROFEXEC MTASK_BEGIN 9695 id 10 predictStart 196 cpu 19
VLPROFEXEC MTASK_END 9870 id 10 predictCost 30
VLPROFEXEC EVAL_LOOP_END 12180
VLPROFEXEC EVAL_END 12250
VLPROFEXEC EVAL_BEGIN 13720
VLPROFEXEC EVAL_LOOP_BEGIN 14000
VLPROFEXEC EXEC_GRAPH_END 12180
VLPROFEXEC EXEC_GRAPH_BEGIN 14000
VLPROFEXEC MTASK_BEGIN 15610 id 6 predictStart 0 cpu 19
VLPROFEXEC MTASK_END 15820 id 6 predictCost 30
VLPROFEXEC MTASK_BEGIN 21700 id 10 predictStart 196 cpu 19
VLPROFEXEC MTASK_END 21875 id 10 predictCost 30
VLPROFEXEC EVAL_LOOP_END 22085
VLPROFEXEC EVAL_END 22330
VLPROFEXEC EXEC_GRAPH_END 22085
VLPROFTHREAD 1
VLPROFEXEC MTASK_BEGIN 5495 id 5 predictStart 0 cpu 10
VLPROFEXEC MTASK_END 6090 id 5 predictCost 30


@ -4,33 +4,44 @@ Argument settings:
+verilator+prof+exec+start+2
+verilator+prof+exec+window+2
Analysis:
Total threads = 2
Total mtasks = 7
Total cpus used = 2
Total yields = 0
Total evals = 2
Total eval loops = 2
Total eval time = 21875 rdtsc ticks
Longest mtask time = 1190 rdtsc ticks
All-thread mtask time = 5495 rdtsc ticks
Longest-thread efficiency = 5.4%
All-thread efficiency = 12.6%
All-thread speedup = 0.3
Summary:
Total elapsed time = 23415 rdtsc ticks
Parallelized code = 82.51% of elapsed time
Total threads = 2
Total CPUs used = 2
Total mtasks = 7
Total yields = 0
Prediction (what Verilator used for scheduling):
All-thread efficiency = 63.2%
All-thread speedup = 1.3
Parallelized code, measured:
Thread utilization = 14.22%
Speedup = 0.284x
Parallelized code, predicted during static scheduling:
Thread utilization = 63.22%
Speedup = 1.26x
All code, measured:
Thread utilization = 20.48%
Speedup = 0.41x
All code, measured, scaled by predicted speedup:
Thread utilization = 56.80%
Speedup = 1.14x
MTask statistics:
Longest mtask id = 5
Longest mtask time = 6.16% of time elapsed in parallelized code
min log(p2e) = -3.681 from mtask 5 (predict 30, elapsed 1190)
max log(p2e) = -2.409 from mtask 8 (predict 107, elapsed 1190)
mean = -2.992
stddev = 0.459
e ^ stddev = 1.583
CPUs:
cpu 10: cpu_time=4725 socket=0 core=10 Test Ryzen 9 3950X 16-Core Processor
cpu 19: cpu_time=770 socket=0 core=3 Test Ryzen 9 3950X 16-Core Processor
CPU info:
Id | Time spent executing MTask | Socket | Core | Model
| % of elapsed ticks / ticks | | |
====|============================|========|======|======
10 | 20.18% / 4725 | 0 | 10 | Test Ryzen 9 3950X 16-Core Processor
19 | 3.29% / 770 | 0 | 3 | Test Ryzen 9 3950X 16-Core Processor
Writing profile_exec.vcd


@ -2,34 +2,21 @@ $version Generated by verilator_gantt $end
$timescale 1ns $end
$scope module gantt $end
$scope module Stats $end
$var wire 32 vf measured_parallelism [31:0] $end
$var wire 32 v10 predicted_parallelism [31:0] $end
$upscope $end
$scope module cpus $end
$var wire 32 v5 cpu10_thread [31:0] $end
$scope module measured $end
$var wire 32 v5 cpu10_mtask [31:0] $end
$var wire 32 v4 cpu10_thread [31:0] $end
$var wire 32 v2 cpu19_mtask [31:0] $end
$var wire 32 v1 cpu19_thread [31:0] $end
$var wire 32 v0 t0_mtask [31:0] $end
$var wire 32 v3 t1_mtask [31:0] $end
$upscope $end
$scope module evals $end
$var wire 32 vb eval [31:0] $end
$var wire 32 vc eval_loop [31:0] $end
$scope module predicted $end
$var wire 32 v6 t0_mtask [31:0] $end
$var wire 32 v7 t1_mtask [31:0] $end
$upscope $end
$scope module measured_threads $end
$var wire 32 v0 thread0_mtask [31:0] $end
$var wire 32 v4 thread1_mtask [31:0] $end
$upscope $end
$scope module mtasks $end
$var wire 32 v3 mtask10_cpu [31:0] $end
$var wire 32 va mtask11_cpu [31:0] $end
$var wire 32 v6 mtask5_cpu [31:0] $end
$var wire 32 v2 mtask6_cpu [31:0] $end
$var wire 32 v7 mtask7_cpu [31:0] $end
$var wire 32 v8 mtask8_cpu [31:0] $end
$var wire 32 v9 mtask9_cpu [31:0] $end
$upscope $end
$scope module predicted_threads $end
$var wire 32 vd predicted_thread0_mtask [31:0] $end
$var wire 32 ve predicted_thread1_mtask [31:0] $end
$scope module stats $end
$var wire 32 v8 measured_parallelism [31:0] $end
$var wire 32 v9 predicted_parallelism [31:0] $end
$upscope $end
$upscope $end
$enddefinitions $end
@ -43,216 +30,193 @@ bz v4
bz v5
bz v6
bz v7
bz v8
bz v9
bz va
bz vb
bz vc
bz vd
bz ve
b0 vf
b0 v10
#595
b1 vb
#945
b1 vc
b110 vd
b101 ve
b10 v10
#2429
bz vd
b111 ve
b1 v10
b0 v8
b0 v9
#2695
b110 v0
b0 v1
b10011 v2
b1 vf
b110 v2
b110 v6
b101 v7
b1 v8
b10 v9
#2905
bz v0
bz v1
bz v2
b0 vf
#3914
b1000 ve
b1 v10
#5495
b101 v4
b1 v5
b1010 v6
b1 vf
#6090
bz v4
bz v5
b0 v8
#3800
bz v6
b0 vf
b111 v7
b1 v9
#4906
b1000 v7
b1 v9
#5495
b101 v3
b1 v4
b101 v5
b1 v8
#6090
bz v3
bz v4
bz v5
b0 v8
#6300
b111 v4
b1 v5
b1010 v7
b1 vf
b111 v3
b1 v4
b111 v5
b1 v8
#6895
bz v3
bz v4
bz v5
bz v7
b0 vf
b0 v8
#7490
b1000 v4
b1 v5
b1010 v8
b1 vf
b1000 v3
b1 v4
b1000 v5
b1 v8
#8540
bz v3
bz v4
bz v5
bz v8
b0 vf
b0 v8
#8848
b1001 v7
b1 v9
#9135
b1001 v4
b1 v5
b1010 v9
b1 vf
#9210
b1001 ve
b1 v10
b1001 v3
b1 v4
b1001 v5
b1 v8
#9695
b1010 v0
b0 v1
b10011 v3
b10 vf
b1010 v2
b10 v8
#9730
bz v3
bz v4
bz v5
bz v9
b1 vf
b1 v8
#9870
bz v0
bz v1
bz v3
b0 vf
bz v2
b0 v8
#9917
b1010 v6
b10 v9
#9954
b1011 v7
b10 v9
#10255
b1011 v4
b1 v5
b1010 va
b1 vf
#10645
b1010 vd
b10 v10
#10695
b1011 ve
b10 v10
b1011 v3
b1 v4
b1011 v5
b1 v8
#11023
bz v6
b1 v9
#11060
bz v3
bz v4
bz v5
bz va
b0 vf
#12130
bz vd
b1 v10
#12180
bz vc
bz ve
b0 v10
#12250
bz vb
#13720
b10 vb
#14000
b10 vc
b110 vd
b101 ve
b10 v10
#15068
bz vd
b111 ve
b1 v10
bz v7
b0 v8
b0 v9
#15610
b110 v0
b0 v1
b10011 v2
b1 vf
b110 v2
b110 v6
b101 v7
b1 v8
b10 v9
#15820
bz v0
bz v1
bz v2
b0 vf
#16137
b1000 ve
b1 v10
#18375
b101 v4
b1 v5
b1010 v6
b1 vf
#18970
bz v4
bz v5
b0 v8
#16437
bz v6
b0 vf
b111 v7
b1 v9
#17265
b1000 v7
b1 v9
#18375
b101 v3
b1 v4
b101 v5
b1 v8
#18970
bz v3
bz v4
bz v5
b0 v8
#19145
b111 v4
b1 v5
b1010 v7
b1 vf
b111 v3
b1 v4
b111 v5
b1 v8
#19320
bz v3
bz v4
bz v5
bz v7
b0 vf
b0 v8
#19670
b1000 v4
b1 v5
b1010 v8
b1 vf
b1000 v3
b1 v4
b1000 v5
b1 v8
#19810
bz v3
bz v4
bz v5
bz v8
b0 vf
#19947
b1001 ve
b1 v10
b0 v8
#20219
b1001 v7
b1 v9
#20650
b1001 v4
b1 v5
b1010 v9
b1 vf
b1001 v3
b1 v4
b1001 v5
b1 v8
#20720
bz v3
bz v4
bz v5
bz v9
b0 vf
#20980
b1010 vd
b10 v10
#21016
b1011 ve
b10 v10
b0 v8
#21019
b1010 v6
b10 v9
#21047
b1011 v7
b10 v9
#21140
b1011 v4
b1 v5
b1010 va
b1 vf
b1011 v3
b1 v4
b1011 v5
b1 v8
#21245
bz v3
bz v4
bz v5
bz va
b0 vf
b0 v8
#21700
b1010 v0
b0 v1
b10011 v3
b1 vf
b1010 v2
b1 v8
#21847
bz v6
b1 v9
#21875
bz v0
bz v1
bz v3
b0 vf
#22049
bz vd
b1 v10
#22085
bz vc
bz ve
b0 v10
#22330
bz vb
bz v2
bz v7
b0 v8
b0 v9


@ -44,14 +44,12 @@ VLPROFPROC CPU part : 0x663
VLPROFPROC CPU revision : 3
VLPROFPROC
VLPROFTHREAD 0
VLPROFEXEC EVAL_BEGIN 57709
VLPROFEXEC EVAL_LOOP_BEGIN 58532
VLPROFEXEC EXEC_GRAPH_BEGIN 58532
VLPROFEXEC MTASK_BEGIN 90465 id 85 predictStart 14315 cpu 2
VLPROFEXEC MTASK_END 155034 id 85 predictCost 30533
VLPROFEXEC MTASK_BEGIN 156555 id 79 predictStart 44848 cpu 2
VLPROFEXEC MTASK_END 294309 id 79 predictCost 48001
VLPROFEXEC EVAL_LOOP_END 18028850
VLPROFEXEC EVAL_END 1803680
VLPROFEXEC EXEC_GRAPH_END 300000
VLPROFTHREAD 1
VLPROFEXEC MTASK_BEGIN 77352 id 90 predictStart 14315 cpu 3
VLPROFEXEC MTASK_END 78511 id 90 predictCost 21592
@ -59,4 +57,4 @@ VLPROFEXEC MTASK_BEGIN 79799 id 81 predictStart 35907 cpu 3
VLPROFEXEC MTASK_END 80667 id 81 predictCost 29215
VLPROFEXEC MTASK_BEGIN 81746 id 87 predictStart 65147 cpu 3
VLPROFEXEC MTASK_END 82633 id 87 predictCost 33809
VLPROF stat ticks 180832
VLPROF stat ticks 300000


@ -4,33 +4,44 @@ Argument settings:
+verilator+prof+exec+start+1
+verilator+prof+exec+window+2
Analysis:
Total threads = 2
Total mtasks = 5
Total cpus used = 2
Total yields = 51
Total evals = 1
Total eval loops = 1
Total eval time = 294309 rdtsc ticks
Longest mtask time = 137754 rdtsc ticks
All-thread mtask time = 205237 rdtsc ticks
Longest-thread efficiency = 46.8%
All-thread efficiency = 34.9%
All-thread speedup = 0.7
Summary:
Total elapsed time = 300000 rdtsc ticks
Parallelized code = 80.49% of elapsed time
Total threads = 2
Total CPUs used = 2
Total mtasks = 5
Total yields = 51
Prediction (what Verilator used for scheduling):
All-thread efficiency = 82.4%
All-thread speedup = 1.6
Parallelized code, measured:
Thread utilization = 42.50%
Speedup = 0.85x
Parallelized code, predicted during static scheduling:
Thread utilization = 82.44%
Speedup = 1.65x
All code, measured:
Thread utilization = 43.96%
Speedup = 0.879x
All code, measured, scaled by predicted speedup:
Thread utilization = 72.06%
Speedup = 1.44x
MTask statistics:
Longest mtask id = 79
Longest mtask time = 57.05% of time elapsed in parallelized code
min log(p2e) = -1.054 from mtask 79 (predict 48001, elapsed 137754)
max log(p2e) = 3.641 from mtask 87 (predict 33809, elapsed 887)
mean = 1.656
stddev = 2.104
e ^ stddev = 8.200
CPUs:
cpu 2: cpu_time=202323 Phytium,FT-2500/128
cpu 3: cpu_time=2914 Phytium,FT-2500/128
CPU info:
Id | Time spent executing MTask | Socket | Core | Model
| % of elapsed ticks / ticks | | |
====|============================|========|======|======
2 | 67.44% / 202323 | | | Phytium,FT-2500/128
3 | 0.97% / 2914 | | | Phytium,FT-2500/128
Writing profile_exec.vcd


@ -4,22 +4,18 @@ VLPROF arg +verilator+prof+exec+window+2
VLPROF stat threads 2
VLPROF stat yields 0
VLPROFTHREAD 0
VLPROFEXEC EVAL_BEGIN 595
VLPROFEXEC EVAL_LOOP_BEGIN 945
VLPROFEXEC EXEC_GRAPH_BEGIN 945
VLPROFEXEC MTASK_BEGIN 2695 id 6 predictStart 0 cpu 19
VLPROFEXEC MTASK_END 2905 id 6 predictCost 30
VLPROFEXEC MTASK_BEGIN 9695 id 10 predictStart 196 cpu 19
VLPROFEXEC MTASK_END 9870 id 10 predictCost 30
VLPROFEXEC EVAL_LOOP_END 12180
VLPROFEXEC EVAL_END 12250
VLPROFEXEC EVAL_BEGIN 13720
VLPROFEXEC EVAL_LOOP_BEGIN 14000
VLPROFEXEC EXEC_GRAPH_END 12180
VLPROFEXEC EXEC_GRAPH_BEGIN 14000
VLPROFEXEC MTASK_BEGIN 15610 id 6 predictStart 0 cpu 19
VLPROFEXEC MTASK_END 15820 id 6 predictCost 30
VLPROFEXEC MTASK_BEGIN 21700 id 10 predictStart 196 cpu 19
VLPROFEXEC MTASK_END 21875 id 10 predictCost 30
VLPROFEXEC EVAL_LOOP_END 22085
VLPROFEXEC EVAL_END 22330
VLPROFEXEC EXEC_GRAPH_END 22085
VLPROFTHREAD 1
VLPROFEXEC MTASK_BEGIN 5495 id 5 predictStart 0 cpu 10
VLPROFEXEC MTASK_END 6090 id 5 predictCost 30


@ -4,32 +4,43 @@ Argument settings:
+verilator+prof+exec+start+2
+verilator+prof+exec+window+2
Analysis:
Total threads = 2
Total mtasks = 7
Total cpus used = 2
Total yields = 0
Total evals = 2
Total eval loops = 2
Total eval time = 21875 rdtsc ticks
Longest mtask time = 1190 rdtsc ticks
All-thread mtask time = 5495 rdtsc ticks
Longest-thread efficiency = 5.4%
All-thread efficiency = 12.6%
All-thread speedup = 0.3
Summary:
Total elapsed time = 23415 rdtsc ticks
Parallelized code = 82.51% of elapsed time
Total threads = 2
Total CPUs used = 2
Total mtasks = 7
Total yields = 0
Prediction (what Verilator used for scheduling):
All-thread efficiency = 63.2%
All-thread speedup = 1.3
Parallelized code, measured:
Thread utilization = 14.22%
Speedup = 0.284x
Parallelized code, predicted during static scheduling:
Thread utilization = 63.22%
Speedup = 1.26x
All code, measured:
Thread utilization = 20.48%
Speedup = 0.41x
All code, measured, scaled by predicted speedup:
Thread utilization = 56.80%
Speedup = 1.14x
MTask statistics:
Longest mtask id = 5
Longest mtask time = 6.16% of time elapsed in parallelized code
min log(p2e) = -3.681 from mtask 5 (predict 30, elapsed 1190)
max log(p2e) = -2.409 from mtask 8 (predict 107, elapsed 1190)
mean = -2.992
stddev = 0.459
e ^ stddev = 1.583
CPUs:
cpu 10: cpu_time=4725
cpu 19: cpu_time=770
CPU info:
Id | Time spent executing MTask | Socket | Core | Model
| % of elapsed ticks / ticks | | |
====|============================|========|======|======
10 | 20.18% / 4725 | | |
19 | 3.29% / 770 | | |


@ -54,7 +54,7 @@ if ($Self->{vltmt}) {
file_grep("$Self->{obj_dir}/gantt.log", qr/Total threads += 1/i);
file_grep("$Self->{obj_dir}/gantt.log", qr/Total mtasks += 0/i);
}
file_grep("$Self->{obj_dir}/gantt.log", qr/Total evals += 4/i);
file_grep("$Self->{obj_dir}/gantt.log", qr/\|\s+4\s+\|\s+4\.0+\s+\|\s+eval/i);
# Diff to itself, just to check parsing
vcd_identical("$Self->{obj_dir}/profile_exec.vcd", "$Self->{obj_dir}/profile_exec.vcd");