Verilator_gantt now shows the predicted mtask times, eval times, and additional statistics.

2025-04-05 04:02:37 +00:00 · 2021-09-23 22:59:36 -04:00 · 2021-09-23 22:59:36 -04:00 · c2819923c5
commit c2819923c5
parent 68f1432a68
13 changed files with 1387 additions and 1082 deletions
--- a/3
+++ b/3
@ -11,7 +11,8 @@ contributors that suggested a given feature are shown in []. Thanks!
 Verilator 4.213 devel
 ==========================

-* Include processor information in verilator_gantt data file.
+* Verilator_gantt now shows the predicted mtask times, eval times, and additional statistics.
+* Verilator_gantt data files now include processor information, to allow later processing.
 * Fix verilator_profcfunc profile accounting (#3115).
 * Fix display has no time units on class function (#3116). [Damien Pretet]
 * Fix removing if statement with side effect in condition (#3131). [Alexander Grobman]
--- a/bin/verilator_gantt
+++ b/bin/verilator_gantt
@ -11,6 +11,8 @@ import statistics

 Threads = collections.defaultdict(lambda: {})
 Mtasks = collections.defaultdict(lambda: {})
+Evals = collections.defaultdict(lambda: {})
+EvalLoops = collections.defaultdict(lambda: {})
 Global = {
    'args': {},
    'cpuinfo': collections.defaultdict(lambda: {}),
@ -29,8 +31,11 @@ def process(filename):
 def read_data(filename):
    with open(filename) as fh:
        re_prof = re.compile(
-            r'^VLPROF mtask\s(\d+)\sstart\s(\d+)\send\s(\d+)\selapsed\s(\d+)\spredict_time\s(\d+)\scpu\s(\d+)\son thread (\d+)'
+            r'^VLPROF mtask\s(\d+)\sstart\s(\d+)\selapsed\s(\d+)\spredict_start\s(\d+)\spredict_cost\s(\d+)\scpu\s(\d+)\son thread (\d+)'
        )
+        re_eval = re.compile(r'^VLPROF eval\sstart\s(\d+)\selapsed\s(\d+)')
+        re_loop = re.compile(
+            r'^VLPROF eval_loop\sstart\s(\d+)\selapsed\s(\d+)')
        re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.])\s*')
        re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.])\s*$')
        re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
@ -44,9 +49,10 @@ def read_data(filename):
                match = re_prof.match(line)
                mtask = int(match.group(1))
                start = int(match.group(2))
-                end = int(match.group(3))
-                elapsed_time = int(match.group(4))
-                predict_time = int(match.group(5))
+                elapsed_time = int(match.group(3))
+                end = start + elapsed_time
+                predict_start = int(match.group(4))
+                predict_cost = int(match.group(5))
                cpu = int(match.group(6))
                thread = int(match.group(7))
                if start not in Threads[thread]:
@ -54,12 +60,28 @@ def read_data(filename):
                Threads[thread][start]['mtask'] = mtask
                Threads[thread][start]['end'] = end
                Threads[thread][start]['cpu'] = cpu
+                Threads[thread][start]['predict_start'] = predict_start
+                Threads[thread][start]['predict_cost'] = predict_cost

                if 'elapsed' not in Mtasks[mtask]:
                    Mtasks[mtask] = {'end': 0, 'elapsed': 0}
+                Mtasks[mtask]['thread'] = thread
                Mtasks[mtask]['elapsed'] += elapsed_time
-                Mtasks[mtask]['predict_cost'] = predict_time
+                Mtasks[mtask]['predict_start'] = predict_start
+                Mtasks[mtask]['predict_cost'] = predict_cost
                Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], end)
+            elif re_eval.match(line):
+                match = re_eval.match(line)
+                start = int(match.group(1))
+                elapsed_time = int(match.group(2))
+                Evals[start]['start'] = start
+                Evals[start]['end'] = start + elapsed_time
+            elif re_loop.match(line):
+                match = re_loop.match(line)
+                start = int(match.group(1))
+                elapsed_time = int(match.group(2))
+                EvalLoops[start]['start'] = start
+                EvalLoops[start]['end'] = start + elapsed_time
            elif re.match(r'^VLPROFTHREAD', line):
                None  # pylint: disable=pointless-statement
            elif re_arg1.match(line):
@ -120,14 +142,21 @@ def report():
                Global['cpus'][cpu] = {'cpu_time': 0}
            Global['cpus'][cpu]['cpu_time'] += elapsed

-    mt_mtask_time = 0
+    measured_mt_mtask_time = 0
+    predict_mt_mtask_time = 0
    long_mtask_time = 0
-    last_end = 0
+    measured_last_end = 0
+    predict_last_end = 0
    for mtask in Mtasks:
-        mt_mtask_time += Mtasks[mtask]['elapsed']
-        last_end = max(last_end, Mtasks[mtask]['end'])
+        measured_mt_mtask_time += Mtasks[mtask]['elapsed']
+        predict_mt_mtask_time += Mtasks[mtask]['predict_cost']
+        measured_last_end = max(measured_last_end, Mtasks[mtask]['end'])
+        predict_last_end = max(
+            predict_last_end,
+            Mtasks[mtask]['predict_start'] + Mtasks[mtask]['predict_cost'])
        long_mtask_time = max(long_mtask_time, Mtasks[mtask]['elapsed'])
-    Global['last_end'] = last_end
+    Global['measured_last_end'] = measured_last_end
+    Global['predict_last_end'] = predict_last_end

    report_graph()

@ -145,19 +174,30 @@ def report():
    ncpus = len(Global['cpus'])
    print("  Total cpus used           = %d" % ncpus)
    print("  Total yields              = %d" % int(Global['stats']['yields']))
-    print("  Total eval time           = %d rdtsc ticks" % Global['last_end'])
+    print("  Total eval time           = %d rdtsc ticks" %
+          Global['measured_last_end'])
    print("  Longest mtask time        = %d rdtsc ticks" % long_mtask_time)
-    print("  All-thread mtask time     = %d rdtsc ticks" % mt_mtask_time)
-    long_efficiency = long_mtask_time / (Global.get('last_end', 1))
+    print("  All-thread mtask time     = %d rdtsc ticks" %
+          measured_mt_mtask_time)
+    long_efficiency = long_mtask_time / (Global.get('measured_last_end', 1) or 1)
    print("  Longest-thread efficiency = %0.1f%%" % (long_efficiency * 100.0))
-    mt_efficiency = mt_mtask_time / (Global.get('last_end', 1) * nthreads or 1)
+    mt_efficiency = measured_mt_mtask_time / (
+        Global.get('measured_last_end', 1) * nthreads or 1)
    print("  All-thread efficiency     = %0.1f%%" % (mt_efficiency * 100.0))
    print("  All-thread speedup        = %0.1f" % (mt_efficiency * nthreads))
    if Global['rdtsc_cycle_time'] > 0:
-        ut = mt_mtask_time / Global['rdtsc_cycle_time']
-        print("tot_mtask_cpu=" + mt_mtask_time + " cyc=" +
+        ut = measured_mt_mtask_time / Global['rdtsc_cycle_time']
+        print("tot_mtask_cpu=" + measured_mt_mtask_time + " cyc=" +
              Global['rdtsc_cycle_time'] + " ut=" + ut)

+    predict_mt_efficiency = predict_mt_mtask_time / (
+        Global.get('predict_last_end', 1) * nthreads or 1)
+    print("\nPrediction (what Verilator used for scheduling):")
+    print("  All-thread efficiency     = %0.1f%%" %
+          (predict_mt_efficiency * 100.0))
+    print("  All-thread speedup        = %0.1f" %
+          (predict_mt_efficiency * nthreads))
+
    p2e_ratios = []
    min_p2e = 1000000
    min_mtask = None
@ -253,7 +293,7 @@ def report_cpus():
 def report_graph():
    time_per = Args.scale
    if time_per == 0:
-        time_per = Global['last_end'] / 40  # Start with 40 columns
+        time_per = Global['measured_last_end'] / 40  # Start with 40 columns
        while time_per > 10:
            (graph, conflicts) = _make_graph(time_per)
            if not conflicts:
@ -270,8 +310,9 @@ def report_graph():
    print("  Legend: One character width = %s rdtsc ticks" % time_per)
    print("  Legend: '&' = multiple mtasks in this period (character width)")

-    scale = "   <-%d rdtsc total" % Global['last_end']
-    for col in range(len(scale), int(0.99 + (Global['last_end'] / time_per))):  # pylint: disable=unused-variable
+    scale = "   <-%d rdtsc total" % Global['measured_last_end']
+    for col in range(len(scale),
+                     int(0.99 + (Global['measured_last_end'] / time_per))):  # pylint: disable=unused-variable
        scale += "-"
    print("  " + scale + "->")

@ -361,30 +402,39 @@ def write_vcd(filename):
            'values':
            collections.defaultdict(lambda: {}),  # {<time>}{<code>} = value
            'sigs': {
-                'threads': {},
+                'predicted_threads': {},
+                'measured_threads': {},
                'cpus': {},
+                'evals': {},
                'mtasks': {},
                'Stats': {}
            }  # {<module>}{<sig}} = code
        }
        code = 0

-        parallelism = collections.defaultdict(lambda: 0)
+        parallelism = {
+            'measured': collections.defaultdict(lambda: 0),
+            'predicted': collections.defaultdict(lambda: 0)
+        }
+        parallelism['measured'][0] = 0
+        parallelism['predicted'][0] = 0
+
+        # Measured graph
        for thread in sorted(Threads.keys()):
            sig = "thread%d_mtask" % thread
-            if sig not in vcd['sigs']['threads']:
-                vcd['sigs']['threads'][sig] = code
+            if sig not in vcd['sigs']['measured_threads']:
+                vcd['sigs']['measured_threads'][sig] = code
                code += 1
-            mcode = vcd['sigs']['threads'][sig]
+            mcode = vcd['sigs']['measured_threads'][sig]

            for start in sorted(Threads[thread]):
-                end = Threads[thread][start]['end']
                mtask = Threads[thread][start]['mtask']
+                end = Threads[thread][start]['end']
                cpu = Threads[thread][start]['cpu']
                vcd['values'][start][mcode] = mtask
                vcd['values'][end][mcode] = None
-                parallelism[start] += 1
-                parallelism[end] -= 1
+                parallelism['measured'][start] += 1
+                parallelism['measured'][end] -= 1

                sig = "cpu%d_thread" % cpu
                if sig not in vcd['sigs']['cpus']:
@ -402,16 +452,68 @@ def write_vcd(filename):
                vcd['values'][start][ccode] = cpu
                vcd['values'][end][ccode] = None

-        # Parallelism graph
-        vcd['sigs']['Stats']["parallelism"] = code
-        pcode = code
+        # Eval graph
+        vcd['sigs']['evals']["eval"] = code
+        elcode = code
        code += 1
+        n = 0
+        for eval_start in Evals:
+            eval_end = Evals[eval_start]['end']
+            n += 1
+            vcd['values'][eval_start][elcode] = n
+            vcd['values'][eval_end][elcode] = None

-        value = 0
-        for time in sorted(parallelism.keys()):
-            value += parallelism[time]
-            vcd['values'][time][pcode] = value
+        # Eval_loop graph
+        vcd['sigs']['evals']["eval_loop"] = code
+        elcode = code
+        code += 1
+        n = 0
+        for eval_start in EvalLoops:
+            eval_end = EvalLoops[eval_start]['end']
+            n += 1
+            vcd['values'][eval_start][elcode] = n
+            vcd['values'][eval_end][elcode] = None

+        # Predicted graph
+        for eval_start in EvalLoops:
+            eval_end = EvalLoops[eval_start]['end']
+            # Compute scale so predicted graph is of same width as eval
+            measured_scaling = (eval_end -
+                                eval_start) / Global['predict_last_end']
+            # Predict mtasks that fill the time the eval occupied
+            for mtask in Mtasks:
+                thread = Mtasks[mtask]['thread']
+                pred_scaled_start = eval_start + int(
+                    Mtasks[mtask]['predict_start'] * measured_scaling)
+                pred_scaled_end = eval_start + int(
+                    (Mtasks[mtask]['predict_start'] +
+                     Mtasks[mtask]['predict_cost']) * measured_scaling)
+                if pred_scaled_start == pred_scaled_end:
+                    continue
+
+                sig = "predicted_thread%d_mtask" % thread
+                if sig not in vcd['sigs']['predicted_threads']:
+                    vcd['sigs']['predicted_threads'][sig] = code
+                    code += 1
+                mcode = vcd['sigs']['predicted_threads'][sig]
+
+                vcd['values'][pred_scaled_start][mcode] = mtask
+                vcd['values'][pred_scaled_end][mcode] = None
+
+                parallelism['predicted'][pred_scaled_start] += 1
+                parallelism['predicted'][pred_scaled_end] -= 1
+
+        # Parallelism graph
+        for measpred in ('measured', 'predicted'):
+            vcd['sigs']['Stats']["%s_parallelism" % measpred] = code
+            pcode = code
+            code += 1
+            value = 0
+            for time in sorted(parallelism[measpred].keys()):
+                value += parallelism[measpred][time]
+                vcd['values'][time][pcode] = value
+
+        # Create output file
        fh.write("$version Generated by verilator_gantt $end\n")
        fh.write("$timescale 1ns $end\n")
        fh.write("\n")
--- a/docs/guide/exe_verilator_gantt.rst
+++ b/docs/guide/exe_verilator_gantt.rst
@ -27,25 +27,51 @@ indicates multiple mtasks started at that time position.
 Also creates a value change dump (VCD) format dump file which may be viewed
 in a waveform viewer (e.g. C<GTKWave>).  See below.

+.. figure:: figures/fig_gantt_min.png
+
+   Example verilator_gantt output, as viewed with GTKWave.
+
+
 Gantt Chart VCD Signals
 -----------------------

-In waveforms there are the following signals. Most signals the "decimal"
-format will remove the leading zeros and make the traces easier to read.
+In waveforms there are the following signals. In GTKWave, using a data
+format of "decimal" will remove the leading zeros and make the traces
+easier to read.

-parallelism
+evals
+  Increments each time when eval_step was measured to be active.  This
+  allow visualization of how much time eval_step was active.
+
+eval_loop
+  Increments each time when the evaluation loop within eval_step was
+  measured to be active.  For best performance there is only a single
+  evaluation loop within each eval_step call, that is the eval_loop
+  waveform looks identical to the evals waveform.
+
+measured_parallelism
  The number of mtasks active at this time, for best performance this will
-  match the thread count. You may want to use an "analog step" format to
+  match the thread count. In GTKWave, use a data format of "analog step" to
  view this signal.

+predicted_parallelism
+  The number of mtasks Verilator predicted would be active at this time,
+  for best performance this will match the thread count. In GTKWave, use a
+  data format of "analog step" to view this signal.
+
 cpu#_thread
-  For the given CPU number, the thread number executing.
+  For the given CPU number, the thread number measured to be executing.

 mtask#_cpu
-  For the given mtask id, the CPU it is executing on.
+  For the given mtask id, the CPU it was measured to execute on.

 thread#_mtask
-  For the given thread number, the mtask id executing.
+  For the given thread number, the mtask id it was executing.
+
+predicted_thread#_mtask
+  For the given thread number, the mtask id Verilator predicted would be
+  executing.
+

 verilator_gantt Arguments
 -------------------------
--- a/docs/guide/figures/fig_gantt_min.png
+++ b/docs/guide/figures/fig_gantt_min.png
--- a/docs/guide/simulating.rst
+++ b/docs/guide/simulating.rst
@ -306,7 +306,7 @@ statistics.

   Example verilator_gantt output, as viewed with GTKWave.

-   The parallelism shows the number of CPUs being used at a given moment.
+   The measured_parallelism shows the number of CPUs being used at a given moment.

   The cpu_thread section shows which thread is executing on each of the physical CPUs.

--- a/include/verilated_threads.cpp
+++ b/include/verilated_threads.cpp
@ -160,7 +160,7 @@ void VlThreadPool::profileDump(const char* filenamep, vluint64_t tickStart, vlui

    // TODO Perhaps merge with verilated_coverage output format, so can
    // have a common merging and reporting tool, etc.
-    fprintf(fp, "VLPROFTHREAD 1.0 # Verilator thread profile dump version 1.0\n");
+    fprintf(fp, "VLPROFTHREAD 1.1 # Verilator thread profile dump version 1.1\n");
    fprintf(fp, "VLPROF arg --threads %" VL_PRI64 "u\n", vluint64_t(m_workers.size() + 1));
    fprintf(fp, "VLPROF arg +verilator+prof+threads+start+%" VL_PRI64 "u\n",
            Verilated::threadContextp()->profThreadsStart());
@ -188,14 +188,30 @@ void VlThreadPool::profileDump(const char* filenamep, vluint64_t tickStart, vlui
            case VlProfileRec::TYPE_BARRIER:  //
                printing = true;
                break;
+            case VlProfileRec::TYPE_EVAL:
+                if (!printing) break;
+                fprintf(fp,
+                        "VLPROF eval start %" VL_PRI64 "u elapsed %" VL_PRI64 "u"
+                        " cpu %u on thread %u\n",
+                        ei.m_startTime - tickStart, (ei.m_endTime - ei.m_startTime), ei.m_cpu,
+                        thread_id);
+                break;
+            case VlProfileRec::TYPE_EVAL_LOOP:
+                if (!printing) break;
+                fprintf(fp,
+                        "VLPROF eval_loop start %" VL_PRI64 "u elapsed %" VL_PRI64 "u"
+                        " cpu %u on thread %u\n",
+                        ei.m_startTime - tickStart, (ei.m_endTime - ei.m_startTime), ei.m_cpu,
+                        thread_id);
+                break;
            case VlProfileRec::TYPE_MTASK_RUN:
                if (!printing) break;
                fprintf(fp,
                        "VLPROF mtask %d"
-                        " start %" VL_PRI64 "u end %" VL_PRI64 "u elapsed %" VL_PRI64 "u"
-                        " predict_time %u cpu %u on thread %u\n",
-                        ei.m_mtaskId, ei.m_startTime - tickStart, ei.m_endTime - tickStart,
-                        (ei.m_endTime - ei.m_startTime), ei.m_predictTime, ei.m_cpu, thread_id);
+                        " start %" VL_PRI64 "u elapsed %" VL_PRI64 "u"
+                        " predict_start %u predict_cost %u cpu %u on thread %u\n",
+                        ei.m_mtaskId, ei.m_startTime - tickStart, (ei.m_endTime - ei.m_startTime),
+                        ei.m_predictStart, ei.m_predictCost, ei.m_cpu, thread_id);
                break;
            default: assert(false); break;  // LCOV_EXCL_LINE
            }
--- a/include/verilated_threads.h
+++ b/include/verilated_threads.h
@ -131,21 +131,36 @@ public:
 class VlProfileRec final {
 protected:
    friend class VlThreadPool;
-    enum VlProfileE { TYPE_MTASK_RUN, TYPE_BARRIER };
-    VlProfileE m_type = TYPE_BARRIER;  // Record type
-    vluint32_t m_mtaskId = 0;  // Mtask we're logging
-    vluint32_t m_predictTime = 0;  // How long scheduler predicted would take
-    vluint64_t m_startTime = 0;  // Tick at start of execution
+    enum VlProfileE { TYPE_MTASK_RUN, TYPE_EVAL, TYPE_EVAL_LOOP, TYPE_BARRIER };
+    // Layout below allows efficient packing.
+    // Leave endTime first, so no math needed to calculate address in endRecord
    vluint64_t m_endTime = 0;  // Tick at end of execution
+    vluint64_t m_startTime = 0;  // Tick at start of execution
+    vluint32_t m_mtaskId = 0;  // Mtask we're logging
+    vluint32_t m_predictStart = 0;  // Time scheduler predicted would start
+    vluint32_t m_predictCost = 0;  // How long scheduler predicted would take
+    VlProfileE m_type = TYPE_BARRIER;  // Record type
    unsigned m_cpu;  // Execution CPU number (at start anyways)
 public:
    class Barrier {};
    VlProfileRec() = default;
    explicit VlProfileRec(Barrier) { m_cpu = getcpu(); }
-    void startRecord(vluint64_t time, uint32_t mtask, uint32_t predict) {
+    void startEval(vluint64_t time) {
+        m_type = VlProfileRec::TYPE_EVAL;
+        m_startTime = time;
+        m_cpu = getcpu();
+    }
+    void startEvalLoop(vluint64_t time) {
+        m_type = VlProfileRec::TYPE_EVAL_LOOP;
+        m_startTime = time;
+        m_cpu = getcpu();
+    }
+    void startRecord(vluint64_t time, vluint32_t mtask, vluint32_t predictStart,
+                     vluint32_t predictCost) {
        m_type = VlProfileRec::TYPE_MTASK_RUN;
        m_mtaskId = mtask;
-        m_predictTime = predict;
+        m_predictStart = predictStart;
+        m_predictCost = predictCost;
        m_startTime = time;
        m_cpu = getcpu();
    }
--- a/src/V3EmitCModel.cpp
+++ b/src/V3EmitCModel.cpp
@ -323,7 +323,24 @@ class EmitCModel final : public EmitCFunc {
        puts(" loop\\n\"););\n");
        if (initial)
            puts(topModNameProtected + "__" + protect("_eval_settle") + "(&(vlSymsp->TOP));\n");
+
+        const string recName = "__Vprfloop";
+        if (v3Global.opt.profThreads() && !initial) {
+            puts("VlProfileRec* " + recName + " = nullptr;\n");
+            // Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
+            puts("if (VL_UNLIKELY(vlSymsp->__Vm_profile_cycle_start)) {\n");
+            // Eval start
+            puts(/**/ recName + " = vlSymsp->__Vm_threadPoolp->profileAppend();\n");
+            puts(/**/ recName + "->startEvalLoop(VL_RDTSC_Q());\n");
+            puts("}\n");
+        }
+
        puts(topModNameProtected + "__" + protect("_eval") + "(&(vlSymsp->TOP));\n");
+
+        if (v3Global.opt.profThreads() && !initial) {
+            puts("if (VL_UNLIKELY(" + recName + ")) " + recName + "->endRecord(VL_RDTSC_Q());\n");
+        }
+
        if (v3Global.rootp()->changeRequest()) {
            puts("if (VL_UNLIKELY(++__VclockLoop > " + cvtToStr(v3Global.opt.convergeLimit())
                 + ")) {\n");
@ -354,7 +371,7 @@ class EmitCModel final : public EmitCFunc {
             + ");\n");
    }

-    void emitStandardMethods(AstNodeModule* modp) {
+    void emitStandardMethods1(AstNodeModule* modp) {
        UASSERT_OBJ(modp->isTop(), modp, "Attempting to emitWrapEval for non-top class");

        const string topModNameProtected = prefixNameProtect(modp);
@ -385,16 +402,21 @@ class EmitCModel final : public EmitCFunc {
        emitSettleLoop(modp, /* initial: */ true);
        ensureNewLine();
        puts("}\n");
+    }

+    void emitStandardMethods2(AstNodeModule* modp) {
+        const string topModNameProtected = prefixNameProtect(modp);
        // ::eval_step
        puts("\nvoid " + topClassName() + "::eval_step() {\n");
        puts("VL_DEBUG_IF(VL_DBG_MSGF(\"+++++TOP Evaluate " + topClassName()
             + "::eval_step\\n\"); );\n");
+
        puts("#ifdef VL_DEBUG\n");
        putsDecoration("// Debug assertions\n");
        puts(topModNameProtected + "__" + protect("_eval_debug_assertions")
             + "(&(vlSymsp->TOP));\n");
        puts("#endif  // VL_DEBUG\n");
+
        putsDecoration("// Initialize\n");
        puts("if (VL_UNLIKELY(!vlSymsp->__Vm_didInit)) " + protect("_eval_initial_loop")
             + "(vlSymsp);\n");
@ -406,13 +428,14 @@ class EmitCModel final : public EmitCFunc {
            puts("Verilated::mtaskId(" + cvtToStr(mtaskId) + ");\n");
        }

-        if (v3Global.opt.mtasks() && v3Global.opt.profThreads()) {
-            puts("if (VL_UNLIKELY((vlSymsp->_vm_contextp__->profThreadsStart()"
-                 " != vlSymsp->__Vm_profile_time_finished)\n");
+        if (v3Global.opt.profThreads()) {
+            puts("if (VL_UNLIKELY((vlSymsp->_vm_contextp__->profThreadsStart() != "
+                 "vlSymsp->__Vm_profile_time_finished)\n");
            puts(" && (VL_TIME_Q() > vlSymsp->_vm_contextp__->profThreadsStart())\n");
            puts(" && (vlSymsp->_vm_contextp__->profThreadsWindow() >= 1))) {\n");
            // Within a profile (either starting, middle, or end)
            puts(/**/ "if (vlSymsp->__Vm_profile_window_ct == 0) {\n");  // Opening file?
+            puts(/**/ "VL_DEBUG_IF(VL_DBG_MSGF(\"+ profile start warmup\\n\"););\n");
            // Start profile on this cycle. We'll capture a window worth, then
            // only analyze the next window worth. The idea is that the first window
            // capture will hit some cache-cold stuff (eg printf) but it'll be warm
@ -430,12 +453,13 @@ class EmitCModel final : public EmitCFunc {
            // cache-warm-up cycles before the barrier from the actual profile
            // cycles afterward.
            puts(/****/ "vlSymsp->__Vm_threadPoolp->profileAppendAll(");
-            puts(/****/ "VlProfileRec(VlProfileRec::Barrier()));\n");
+            puts(/****/ "VlProfileRec{VlProfileRec::Barrier{}});\n");
            puts(/****/ "vlSymsp->__Vm_profile_cycle_start = VL_RDTSC_Q();\n");
            puts(/**/ "}\n");
+            // Ending trace file?
            puts(/**/ "else if (vlSymsp->__Vm_profile_window_ct == 0) {\n");
-            // Ending file.
            puts(/****/ "vluint64_t tick_end = VL_RDTSC_Q();\n");
+            puts(/****/ "VL_DEBUG_IF(VL_DBG_MSGF(\"+ profile end\\n\"););\n");
            puts(/****/ "vlSymsp->__Vm_threadPoolp->profileDump("
                        "vlSymsp->_vm_contextp__->profThreadsFilename().c_str(), "
                        "vlSymsp->__Vm_profile_cycle_start, "
@ -443,20 +467,41 @@ class EmitCModel final : public EmitCFunc {
            // This turns off the test to enter the profiling code, but still
            // allows the user to collect another profile by changing
            // profThreadsStart
-            puts(/****/ "vlSymsp->__Vm_profile_time_finished"
-                        " = vlSymsp->_vm_contextp__->profThreadsStart();\n");
+            puts(/****/ "vlSymsp->__Vm_profile_time_finished = "
+                        "vlSymsp->_vm_contextp__->profThreadsStart();\n");
            puts(/****/ "vlSymsp->__Vm_profile_cycle_start = 0;\n");
            puts(/**/ "}\n");
            puts("}\n");
        }

+        const string recName = "__Vprfeval";
+        if (v3Global.opt.profThreads()) {
+            puts("VlProfileRec* " + recName + " = nullptr;\n");
+            // Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
+            puts("if (VL_UNLIKELY(vlSymsp->__Vm_profile_cycle_start)) {\n");
+            // Eval start
+            puts(/**/ recName + " = vlSymsp->__Vm_threadPoolp->profileAppend();\n");
+            puts(/**/ recName + "->startEval(VL_RDTSC_Q());\n");
+            puts("}\n");
+        }
+
        emitSettleLoop(modp, /* initial: */ false);
+
+        putsDecoration("// Evaluate cleanup\n");
        if (v3Global.opt.threads() == 1) {
            puts("Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);\n");
        }
        if (v3Global.opt.threads()) puts("Verilated::endOfEval(vlSymsp->__Vm_evalMsgQp);\n");
-        puts("}\n");

+        if (v3Global.opt.profThreads()) {
+            // End eval record
+            puts("if (VL_UNLIKELY(" + recName + ")) " + recName + "->endRecord(VL_RDTSC_Q());\n");
+        }
+        puts("}\n");
+    }
+
+    void emitStandardMethods3(AstNodeModule* modp) {
+        const string topModNameProtected = prefixNameProtect(modp);
        // ::eval_end_step
        if (v3Global.needTraceDumper() && !optSystemC()) {
            puts("\nvoid " + topClassName() + "::eval_end_step() {\n");
@ -573,7 +618,9 @@ class EmitCModel final : public EmitCFunc {

        emitConstructorImplementation(modp);
        emitDestructorImplementation();
-        emitStandardMethods(modp);
+        emitStandardMethods1(modp);
+        emitStandardMethods2(modp);
+        emitStandardMethods3(modp);
        if (v3Global.opt.trace()) { emitTraceMethods(modp); }
        if (v3Global.opt.savable()) { emitSerializationFunctions(); }

--- a/src/V3Partition.cpp
+++ b/src/V3Partition.cpp
@ -2237,11 +2237,11 @@ public:
        std::vector<uint32_t> busyUntil(m_nThreads, 0);

        // MTasks ready to be assigned next. All their dependencies are already assigned.
-        std::set<const ExecMTask*, MTaskCmp> readyMTasks;
+        std::set<ExecMTask*, MTaskCmp> readyMTasks;

        // Build initial ready list
        for (V3GraphVertex* vxp = mtaskGraph.verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
-            const ExecMTask* const mtaskp = dynamic_cast<ExecMTask*>(vxp);
+            ExecMTask* const mtaskp = dynamic_cast<ExecMTask*>(vxp);
            if (isReady(schedule, mtaskp)) readyMTasks.insert(mtaskp);
        }

@ -2250,9 +2250,9 @@ public:
            // on each thread (in that thread's local time frame.)
            uint32_t bestTime = 0xffffffff;
            uint32_t bestThreadId = 0;
-            const ExecMTask* bestMtaskp = nullptr;  // Todo: const ExecMTask*
+            ExecMTask* bestMtaskp = nullptr;  // Todo: const ExecMTask*
            for (uint32_t threadId = 0; threadId < m_nThreads; ++threadId) {
-                for (const ExecMTask* const mtaskp : readyMTasks) {
+                for (ExecMTask* const mtaskp : readyMTasks) {
                    uint32_t timeBegin = busyUntil[threadId];
                    if (timeBegin > bestTime) {
                        UINFO(6, "th " << threadId << " busy until " << timeBegin
@ -2287,10 +2287,11 @@ public:
            std::vector<const ExecMTask*>& bestThread = schedule.threads[bestThreadId];

            // Update algorithm state
+            bestMtaskp->predictStart(bestTime);  // Only for gantt reporting
            const uint32_t bestEndTime = bestTime + bestMtaskp->cost();
            schedule.mtaskState[bestMtaskp].completionTime = bestEndTime;
            schedule.mtaskState[bestMtaskp].threadId = bestThreadId;
-            if (!bestThread.empty()) { schedule.mtaskState[bestThread.back()].nextp = bestMtaskp; }
+            if (!bestThread.empty()) schedule.mtaskState[bestThread.back()].nextp = bestMtaskp;
            busyUntil[bestThreadId] = bestEndTime;

            // Add the MTask to the schedule
@ -2301,7 +2302,7 @@ public:
            UASSERT_OBJ(erased > 0, bestMtaskp, "Should have erased something?");
            for (V3GraphEdge* edgeOutp = bestMtaskp->outBeginp(); edgeOutp;
                 edgeOutp = edgeOutp->outNextp()) {
-                const ExecMTask* const nextp = dynamic_cast<ExecMTask*>(edgeOutp->top());
+                ExecMTask* const nextp = dynamic_cast<ExecMTask*>(edgeOutp->top());
                // Dependent MTask should not yet be assigned to a thread
                UASSERT(schedule.threadId(nextp) == ThreadSchedule::UNASSIGNED,
                        "Tasks after one being assigned should not be assigned yet");
@ -2713,6 +2714,7 @@ static void addMTaskToFunction(const ThreadSchedule& schedule, const uint32_t th
                   recName + " = vlSymsp->__Vm_threadPoolp->profileAppend();\n" +  //
                   recName + "->startRecord(VL_RDTSC_Q()," +  //
                   " " + cvtToStr(mtaskp->id()) + "," +  //
+                   " " + cvtToStr(mtaskp->predictStart()) + "," +  //
                   " " + cvtToStr(mtaskp->cost()) + ");\n" +  //
                   "}\n");
    }
@ -2724,9 +2726,8 @@ static void addMTaskToFunction(const ThreadSchedule& schedule, const uint32_t th
    funcp->addStmtsp(mtaskp->bodyp()->unlinkFrBack());

    if (v3Global.opt.profThreads()) {
-        // Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
-        addStrStmt("if (VL_UNLIKELY(" + recName + ")) {\n" +  //
-                   recName + "->endRecord(VL_RDTSC_Q());\n" + "}\n");
+        addStrStmt("if (VL_UNLIKELY(" + recName + ")) "  //
+                   + recName + "->endRecord(VL_RDTSC_Q());\n");
    }

    // Flush message queue
--- a/src/V3PartitionGraph.h
+++ b/src/V3PartitionGraph.h
@ -61,6 +61,7 @@ private:
                              // mtask. In abstract time units.
    uint32_t m_cost = 0;  // Predicted runtime of this mtask, in the same
                          // abstract time units as priority().
+    uint64_t m_predictStart = 0;  // Predicted start time of task
    VL_UNCOPYABLE(ExecMTask);

 public:
@ -74,6 +75,8 @@ public:
    void priority(uint32_t pri) { m_priority = pri; }
    virtual uint32_t cost() const override { return m_cost; }
    void cost(uint32_t cost) { m_cost = cost; }
+    void predictStart(vluint64_t time) { m_predictStart = time; }
+    vluint64_t predictStart() const { return m_predictStart; }
    string cFuncName() const {
        // If this MTask maps to a C function, this should be the name
        return string("__Vmtask") + "__" + cvtToStr(m_id);
--- a/test_regress/t/t_gantt_io.dat
+++ b/test_regress/t/t_gantt_io.dat
--- a/test_regress/t/t_gantt_io.out
+++ b/test_regress/t/t_gantt_io.out
@ -6,33 +6,37 @@ Argument settings:
  --threads 2

 Thread gantt graph:
-  Legend: One character width = 100 rdtsc ticks
+  Legend: One character width = 136 rdtsc ticks
  Legend: '&' = multiple mtasks in this period (character width)
-     <-16065 rdtsc total-------------------------------------------------------------------------------------------------------------------------------------------->
-  t:      [1]                                                                         [1]                                   [1]                                     [1]
-  t:                                                    [16-] [16][16-------]                [16--]     [16]                                      [16-]     [1[] [16[xxx
+     <-21875 rdtsc total-------------------------------------------------------------------------------------------------------------------------------------------->
+  t:                    [19]                                                [19]                                       [19]                                         [19]
+  t:                                         [10-] [10-]    [10----]    [10-]   [10---]                                                     [10-][10][10]   [10][10]

 Analysis:
  Total threads             = 2
  Total mtasks              = 7
  Total cpus used           = 2
-  Total yields              = 1
-  Total eval time           = 16065 rdtsc ticks
-  Longest mtask time        = 1085 rdtsc ticks
-  All-thread mtask time     = 3430 rdtsc ticks
-  Longest-thread efficiency = 6.8%
-  All-thread efficiency     = 10.7%
-  All-thread speedup        = 0.2
+  Total yields              = 0
+  Total eval time           = 21875 rdtsc ticks
+  Longest mtask time        = 1190 rdtsc ticks
+  All-thread mtask time     = 5495 rdtsc ticks
+  Longest-thread efficiency = 5.4%
+  All-thread efficiency     = 12.6%
+  All-thread speedup        = 0.3
+
+Prediction (what Verilator used for scheduling):
+  All-thread efficiency     = 63.2%
+  All-thread speedup        = 1.3

 Statistics:
-  min log(p2e) = -3.332  from mtask 5 (predict 30, elapsed 840)
-  max log(p2e) = -1.764  from mtask 7 (predict 30, elapsed 175)
-  mean = -2.365
-  stddev = 0.562
-  e ^ stddev = 1.754
+  min log(p2e) = -3.681  from mtask 5 (predict 30, elapsed 1190)
+  max log(p2e) = -2.409  from mtask 8 (predict 107, elapsed 1190)
+  mean = -2.992
+  stddev = 0.459
+  e ^ stddev = 1.583

 CPUs:
-  cpu 1: cpu_time=525 socket=0 core=1  Test Ryzen 9 3950X 16-Core Processor
-  cpu 16: cpu_time=2905 socket=0 core=0  Test Ryzen 9 3950X 16-Core Processor
+  cpu 10: cpu_time=4725 socket=0 core=10  Test Ryzen 9 3950X 16-Core Processor
+  cpu 19: cpu_time=770 socket=0 core=3  Test Ryzen 9 3950X 16-Core Processor

 Writing profile_threads.vcd
--- a/test_regress/t/t_gantt_io_vcd.out
+++ b/test_regress/t/t_gantt_io_vcd.out
@ -3,171 +3,256 @@ $timescale 1ns $end

 $scope module gantt $end
  $scope module Stats $end
-   $var wire 32 vb parallelism [31:0] $end
+   $var wire 32 vf measured_parallelism [31:0] $end
+   $var wire 32 v10 predicted_parallelism [31:0] $end
  $upscope $end
  $scope module cpus $end
-   $var wire 32 v1 cpu16_thread [31:0] $end
-   $var wire 32 v8 cpu1_thread [31:0] $end
+   $var wire 32 v5 cpu10_thread [31:0] $end
+   $var wire 32 v1 cpu19_thread [31:0] $end
+  $upscope $end
+  $scope module evals $end
+   $var wire 32 vb eval [31:0] $end
+   $var wire 32 vc eval_loop [31:0] $end
+  $upscope $end
+  $scope module measured_threads $end
+   $var wire 32 v0 thread1_mtask [31:0] $end
+   $var wire 32 v4 thread2_mtask [31:0] $end
  $upscope $end
  $scope module mtasks $end
-   $var wire 32 va mtask10_cpu [31:0] $end
-   $var wire 32 v6 mtask11_cpu [31:0] $end
-   $var wire 32 v2 mtask5_cpu [31:0] $end
-   $var wire 32 v9 mtask6_cpu [31:0] $end
-   $var wire 32 v3 mtask7_cpu [31:0] $end
-   $var wire 32 v4 mtask8_cpu [31:0] $end
-   $var wire 32 v5 mtask9_cpu [31:0] $end
+   $var wire 32 v3 mtask10_cpu [31:0] $end
+   $var wire 32 va mtask11_cpu [31:0] $end
+   $var wire 32 v6 mtask5_cpu [31:0] $end
+   $var wire 32 v2 mtask6_cpu [31:0] $end
+   $var wire 32 v7 mtask7_cpu [31:0] $end
+   $var wire 32 v8 mtask8_cpu [31:0] $end
+   $var wire 32 v9 mtask9_cpu [31:0] $end
  $upscope $end
-  $scope module threads $end
-   $var wire 32 v7 thread1_mtask [31:0] $end
-   $var wire 32 v0 thread2_mtask [31:0] $end
+  $scope module predicted_threads $end
+   $var wire 32 vd predicted_thread1_mtask [31:0] $end
+   $var wire 32 ve predicted_thread2_mtask [31:0] $end
  $upscope $end
 $upscope $end
 $enddefinitions $end

+#0
+bz v0
+bz v1
+bz v2
+bz v3
+bz v4
+bz v5
+bz v6
+bz v7
+bz v8
+bz v9
+bz va
+bz vb
+bz vc
+bz vd
+bz ve
+b0 vf
+b0 v10
 #595
+b1 vb
+#945
+b1 vc
+b110 vd
+b101 ve
+b10 v10
+#2429
+bz vd
+b111 ve
+b1 v10
+#2695
+b110 v0
+b1 v1
+b10011 v2
+b1 vf
+#2905
 bz v0
 bz v1
-bz va
-b1 vb
 bz v2
-bz v3
+b0 vf
+#3914
+b1000 ve
+b1 v10
+#5495
+b101 v4
+b10 v5
+b1010 v6
+b1 vf
+#6090
 bz v4
 bz v5
 bz v6
-b110 v7
-b1 v8
-b1 v9
-#735
-b0 vb
-bz v7
-bz v8
-bz v9
-#5110
-b101 v0
-b10 v1
-b1 vb
-b10000 v2
-#5530
-bz v0
-bz v1
-b0 vb
-bz v2
-#5740
-b111 v0
-b10 v1
-b1 vb
-b10000 v3
-#5845
-bz v0
-bz v1
-b0 vb
-bz v3
-#6125
-b1000 v0
-b10 v1
-b1 vb
-b10000 v4
-#7140
-bz v0
-bz v1
-b0 vb
-bz v4
-#8120
-b1 va
-b1 vb
+b0 vf
+#6300
+b111 v4
+b10 v5
 b1010 v7
-b1 v8
-#8260
-bz va
-b0 vb
-bz v7
-bz v8
-#8820
-b1001 v0
-b10 v1
-b1 vb
-b10000 v5
-#9380
-bz v0
-bz v1
-b0 vb
-bz v5
-#9940
-b1011 v0
-b10 v1
-b1 vb
-b10000 v6
-#10045
-bz v0
-bz v1
-b0 vb
-bz v6
-#11970
-b1 vb
-b110 v7
-b1 v8
-b1 v9
-#12075
-b0 vb
-bz v7
-bz v8
-bz v9
-#14175
-b101 v0
-b10 v1
-b1 vb
-b10000 v2
-#14595
-bz v0
-bz v1
-b0 vb
-bz v2
-#15120
-b111 v0
-b10 v1
-b1 vb
-b10000 v3
-#15190
-bz v0
-bz v1
-b0 vb
-bz v3
-#15365
-b1000 v0
-b10 v1
-b1 vb
-b10000 v4
-#15435
-bz v0
-bz v1
-b0 vb
+b1 vf
+#6895
 bz v4
-#15680
-b1001 v0
-b10 v1
-b1 vb
-b10000 v5
-#15750
+bz v5
+bz v7
+b0 vf
+#7490
+b1000 v4
+b10 v5
+b1010 v8
+b1 vf
+#8540
+bz v4
+bz v5
+bz v8
+b0 vf
+#9135
+b1001 v4
+b10 v5
+b1010 v9
+b1 vf
+#9210
+b1001 ve
+b1 v10
+#9695
+b1010 v0
+b1 v1
+b10011 v3
+b10 vf
+#9730
+bz v4
+bz v5
+bz v9
+b1 vf
+#9870
 bz v0
 bz v1
-b0 vb
+bz v3
+b0 vf
+#10255
+b1011 v4
+b10 v5
+b1010 va
+b1 vf
+#10645
+b1010 vd
+b10 v10
+#10695
+b1011 ve
+b10 v10
+#11060
+bz v4
 bz v5
-#15925
-b1011 v0
-b10 v1
-b1 va
+bz va
+b0 vf
+#12130
+bz vd
+b1 v10
+#12180
+bz vc
+bz ve
+b0 v10
+#12250
+bz vb
+#13720
 b10 vb
-b10000 v6
-b1010 v7
-b1 v8
-#15995
+#14000
+b10 vc
+b110 vd
+b101 ve
+b10 v10
+#15068
+bz vd
+b111 ve
+b1 v10
+#15610
+b110 v0
+b1 v1
+b10011 v2
+b1 vf
+#15820
 bz v0
 bz v1
-b1 vb
+bz v2
+b0 vf
+#16137
+b1000 ve
+b1 v10
+#18375
+b101 v4
+b10 v5
+b1010 v6
+b1 vf
+#18970
+bz v4
+bz v5
 bz v6
-#16065
-bz va
-b0 vb
+b0 vf
+#19145
+b111 v4
+b10 v5
+b1010 v7
+b1 vf
+#19320
+bz v4
+bz v5
 bz v7
+b0 vf
+#19670
+b1000 v4
+b10 v5
+b1010 v8
+b1 vf
+#19810
+bz v4
+bz v5
 bz v8
+b0 vf
+#19947
+b1001 ve
+b1 v10
+#20650
+b1001 v4
+b10 v5
+b1010 v9
+b1 vf
+#20720
+bz v4
+bz v5
+bz v9
+b0 vf
+#20980
+b1010 vd
+b10 v10
+#21016
+b1011 ve
+b10 v10
+#21140
+b1011 v4
+b10 v5
+b1010 va
+b1 vf
+#21245
+bz v4
+bz v5
+bz va
+b0 vf
+#21700
+b1010 v0
+b1 v1
+b10011 v3
+b1 vf
+#21875
+bz v0
+bz v1
+bz v3
+b0 vf
+#22049
+bz vd
+b1 v10
+#22085
+bz vc
+bz ve
+b0 v10
+#22330
+bz vb