Verilator_gantt now shows the predicted mtask times, eval times, and additional statistics.

This commit is contained in:
Wilson Snyder 2021-09-23 22:59:36 -04:00
parent 68f1432a68
commit c2819923c5
13 changed files with 1387 additions and 1082 deletions

View File

@ -11,7 +11,8 @@ contributors that suggested a given feature are shown in []. Thanks!
Verilator 4.213 devel
==========================
* Include processor information in verilator_gantt data file.
* Verilator_gantt now shows the predicted mtask times, eval times, and additional statistics.
* Verilator_gantt data files now include processor information, to allow later processing.
* Fix verilator_profcfunc profile accounting (#3115).
* Fix display has no time units on class function (#3116). [Damien Pretet]
* Fix removing if statement with side effect in condition (#3131). [Alexander Grobman]

View File

@ -11,6 +11,8 @@ import statistics
Threads = collections.defaultdict(lambda: {})
Mtasks = collections.defaultdict(lambda: {})
Evals = collections.defaultdict(lambda: {})
EvalLoops = collections.defaultdict(lambda: {})
Global = {
'args': {},
'cpuinfo': collections.defaultdict(lambda: {}),
@ -29,8 +31,11 @@ def process(filename):
def read_data(filename):
with open(filename) as fh:
re_prof = re.compile(
r'^VLPROF mtask\s(\d+)\sstart\s(\d+)\send\s(\d+)\selapsed\s(\d+)\spredict_time\s(\d+)\scpu\s(\d+)\son thread (\d+)'
r'^VLPROF mtask\s(\d+)\sstart\s(\d+)\selapsed\s(\d+)\spredict_start\s(\d+)\spredict_cost\s(\d+)\scpu\s(\d+)\son thread (\d+)'
)
re_eval = re.compile(r'^VLPROF eval\sstart\s(\d+)\selapsed\s(\d+)')
re_loop = re.compile(
r'^VLPROF eval_loop\sstart\s(\d+)\selapsed\s(\d+)')
re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.])\s*')
re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.])\s*$')
re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
@ -44,9 +49,10 @@ def read_data(filename):
match = re_prof.match(line)
mtask = int(match.group(1))
start = int(match.group(2))
end = int(match.group(3))
elapsed_time = int(match.group(4))
predict_time = int(match.group(5))
elapsed_time = int(match.group(3))
end = start + elapsed_time
predict_start = int(match.group(4))
predict_cost = int(match.group(5))
cpu = int(match.group(6))
thread = int(match.group(7))
if start not in Threads[thread]:
@ -54,12 +60,28 @@ def read_data(filename):
Threads[thread][start]['mtask'] = mtask
Threads[thread][start]['end'] = end
Threads[thread][start]['cpu'] = cpu
Threads[thread][start]['predict_start'] = predict_start
Threads[thread][start]['predict_cost'] = predict_cost
if 'elapsed' not in Mtasks[mtask]:
Mtasks[mtask] = {'end': 0, 'elapsed': 0}
Mtasks[mtask]['thread'] = thread
Mtasks[mtask]['elapsed'] += elapsed_time
Mtasks[mtask]['predict_cost'] = predict_time
Mtasks[mtask]['predict_start'] = predict_start
Mtasks[mtask]['predict_cost'] = predict_cost
Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], end)
elif re_eval.match(line):
match = re_eval.match(line)
start = int(match.group(1))
elapsed_time = int(match.group(2))
Evals[start]['start'] = start
Evals[start]['end'] = start + elapsed_time
elif re_loop.match(line):
match = re_loop.match(line)
start = int(match.group(1))
elapsed_time = int(match.group(2))
EvalLoops[start]['start'] = start
EvalLoops[start]['end'] = start + elapsed_time
elif re.match(r'^VLPROFTHREAD', line):
None # pylint: disable=pointless-statement
elif re_arg1.match(line):
@ -120,14 +142,21 @@ def report():
Global['cpus'][cpu] = {'cpu_time': 0}
Global['cpus'][cpu]['cpu_time'] += elapsed
mt_mtask_time = 0
measured_mt_mtask_time = 0
predict_mt_mtask_time = 0
long_mtask_time = 0
last_end = 0
measured_last_end = 0
predict_last_end = 0
for mtask in Mtasks:
mt_mtask_time += Mtasks[mtask]['elapsed']
last_end = max(last_end, Mtasks[mtask]['end'])
measured_mt_mtask_time += Mtasks[mtask]['elapsed']
predict_mt_mtask_time += Mtasks[mtask]['predict_cost']
measured_last_end = max(measured_last_end, Mtasks[mtask]['end'])
predict_last_end = max(
predict_last_end,
Mtasks[mtask]['predict_start'] + Mtasks[mtask]['predict_cost'])
long_mtask_time = max(long_mtask_time, Mtasks[mtask]['elapsed'])
Global['last_end'] = last_end
Global['measured_last_end'] = measured_last_end
Global['predict_last_end'] = predict_last_end
report_graph()
@ -145,19 +174,30 @@ def report():
ncpus = len(Global['cpus'])
print(" Total cpus used = %d" % ncpus)
print(" Total yields = %d" % int(Global['stats']['yields']))
print(" Total eval time = %d rdtsc ticks" % Global['last_end'])
print(" Total eval time = %d rdtsc ticks" %
Global['measured_last_end'])
print(" Longest mtask time = %d rdtsc ticks" % long_mtask_time)
print(" All-thread mtask time = %d rdtsc ticks" % mt_mtask_time)
long_efficiency = long_mtask_time / (Global.get('last_end', 1))
print(" All-thread mtask time = %d rdtsc ticks" %
measured_mt_mtask_time)
long_efficiency = long_mtask_time / (Global.get('measured_last_end', 1) or 1)
print(" Longest-thread efficiency = %0.1f%%" % (long_efficiency * 100.0))
mt_efficiency = mt_mtask_time / (Global.get('last_end', 1) * nthreads or 1)
mt_efficiency = measured_mt_mtask_time / (
Global.get('measured_last_end', 1) * nthreads or 1)
print(" All-thread efficiency = %0.1f%%" % (mt_efficiency * 100.0))
print(" All-thread speedup = %0.1f" % (mt_efficiency * nthreads))
if Global['rdtsc_cycle_time'] > 0:
ut = mt_mtask_time / Global['rdtsc_cycle_time']
print("tot_mtask_cpu=" + mt_mtask_time + " cyc=" +
ut = measured_mt_mtask_time / Global['rdtsc_cycle_time']
print("tot_mtask_cpu=" + measured_mt_mtask_time + " cyc=" +
Global['rdtsc_cycle_time'] + " ut=" + ut)
predict_mt_efficiency = predict_mt_mtask_time / (
Global.get('predict_last_end', 1) * nthreads or 1)
print("\nPrediction (what Verilator used for scheduling):")
print(" All-thread efficiency = %0.1f%%" %
(predict_mt_efficiency * 100.0))
print(" All-thread speedup = %0.1f" %
(predict_mt_efficiency * nthreads))
p2e_ratios = []
min_p2e = 1000000
min_mtask = None
@ -253,7 +293,7 @@ def report_cpus():
def report_graph():
time_per = Args.scale
if time_per == 0:
time_per = Global['last_end'] / 40 # Start with 40 columns
time_per = Global['measured_last_end'] / 40 # Start with 40 columns
while time_per > 10:
(graph, conflicts) = _make_graph(time_per)
if not conflicts:
@ -270,8 +310,9 @@ def report_graph():
print(" Legend: One character width = %s rdtsc ticks" % time_per)
print(" Legend: '&' = multiple mtasks in this period (character width)")
scale = " <-%d rdtsc total" % Global['last_end']
for col in range(len(scale), int(0.99 + (Global['last_end'] / time_per))): # pylint: disable=unused-variable
scale = " <-%d rdtsc total" % Global['measured_last_end']
for col in range(len(scale),
int(0.99 + (Global['measured_last_end'] / time_per))): # pylint: disable=unused-variable
scale += "-"
print(" " + scale + "->")
@ -361,30 +402,39 @@ def write_vcd(filename):
'values':
collections.defaultdict(lambda: {}), # {<time>}{<code>} = value
'sigs': {
'threads': {},
'predicted_threads': {},
'measured_threads': {},
'cpus': {},
'evals': {},
'mtasks': {},
'Stats': {}
} # {<module>}{<sig}} = code
}
code = 0
parallelism = collections.defaultdict(lambda: 0)
parallelism = {
'measured': collections.defaultdict(lambda: 0),
'predicted': collections.defaultdict(lambda: 0)
}
parallelism['measured'][0] = 0
parallelism['predicted'][0] = 0
# Measured graph
for thread in sorted(Threads.keys()):
sig = "thread%d_mtask" % thread
if sig not in vcd['sigs']['threads']:
vcd['sigs']['threads'][sig] = code
if sig not in vcd['sigs']['measured_threads']:
vcd['sigs']['measured_threads'][sig] = code
code += 1
mcode = vcd['sigs']['threads'][sig]
mcode = vcd['sigs']['measured_threads'][sig]
for start in sorted(Threads[thread]):
end = Threads[thread][start]['end']
mtask = Threads[thread][start]['mtask']
end = Threads[thread][start]['end']
cpu = Threads[thread][start]['cpu']
vcd['values'][start][mcode] = mtask
vcd['values'][end][mcode] = None
parallelism[start] += 1
parallelism[end] -= 1
parallelism['measured'][start] += 1
parallelism['measured'][end] -= 1
sig = "cpu%d_thread" % cpu
if sig not in vcd['sigs']['cpus']:
@ -402,16 +452,68 @@ def write_vcd(filename):
vcd['values'][start][ccode] = cpu
vcd['values'][end][ccode] = None
# Parallelism graph
vcd['sigs']['Stats']["parallelism"] = code
pcode = code
# Eval graph
vcd['sigs']['evals']["eval"] = code
elcode = code
code += 1
n = 0
for eval_start in Evals:
eval_end = Evals[eval_start]['end']
n += 1
vcd['values'][eval_start][elcode] = n
vcd['values'][eval_end][elcode] = None
value = 0
for time in sorted(parallelism.keys()):
value += parallelism[time]
vcd['values'][time][pcode] = value
# Eval_loop graph
vcd['sigs']['evals']["eval_loop"] = code
elcode = code
code += 1
n = 0
for eval_start in EvalLoops:
eval_end = EvalLoops[eval_start]['end']
n += 1
vcd['values'][eval_start][elcode] = n
vcd['values'][eval_end][elcode] = None
# Predicted graph
for eval_start in EvalLoops:
eval_end = EvalLoops[eval_start]['end']
# Compute scale so predicted graph is of same width as eval
measured_scaling = (eval_end -
eval_start) / Global['predict_last_end']
# Predict mtasks that fill the time the eval occupied
for mtask in Mtasks:
thread = Mtasks[mtask]['thread']
pred_scaled_start = eval_start + int(
Mtasks[mtask]['predict_start'] * measured_scaling)
pred_scaled_end = eval_start + int(
(Mtasks[mtask]['predict_start'] +
Mtasks[mtask]['predict_cost']) * measured_scaling)
if pred_scaled_start == pred_scaled_end:
continue
sig = "predicted_thread%d_mtask" % thread
if sig not in vcd['sigs']['predicted_threads']:
vcd['sigs']['predicted_threads'][sig] = code
code += 1
mcode = vcd['sigs']['predicted_threads'][sig]
vcd['values'][pred_scaled_start][mcode] = mtask
vcd['values'][pred_scaled_end][mcode] = None
parallelism['predicted'][pred_scaled_start] += 1
parallelism['predicted'][pred_scaled_end] -= 1
# Parallelism graph
for measpred in ('measured', 'predicted'):
vcd['sigs']['Stats']["%s_parallelism" % measpred] = code
pcode = code
code += 1
value = 0
for time in sorted(parallelism[measpred].keys()):
value += parallelism[measpred][time]
vcd['values'][time][pcode] = value
# Create output file
fh.write("$version Generated by verilator_gantt $end\n")
fh.write("$timescale 1ns $end\n")
fh.write("\n")

View File

@ -27,25 +27,51 @@ indicates multiple mtasks started at that time position.
Also creates a value change dump (VCD) format dump file which may be viewed
in a waveform viewer (e.g. C<GTKWave>). See below.
.. figure:: figures/fig_gantt_min.png
Example verilator_gantt output, as viewed with GTKWave.
Gantt Chart VCD Signals
-----------------------
In waveforms there are the following signals. Most signals the "decimal"
format will remove the leading zeros and make the traces easier to read.
In waveforms there are the following signals. In GTKWave, using a data
format of "decimal" will remove the leading zeros and make the traces
easier to read.
parallelism
evals
Increments each time when eval_step was measured to be active. This
allow visualization of how much time eval_step was active.
eval_loop
Increments each time when the evaluation loop within eval_step was
measured to be active. For best performance there is only a single
evaluation loop within each eval_step call, that is the eval_loop
waveform looks identical to the evals waveform.
measured_parallelism
The number of mtasks active at this time, for best performance this will
match the thread count. You may want to use an "analog step" format to
match the thread count. In GTKWave, use a data format of "analog step" to
view this signal.
predicted_parallelism
The number of mtasks Verilator predicted would be active at this time,
for best performance this will match the thread count. In GTKWave, use a
data format of "analog step" to view this signal.
cpu#_thread
For the given CPU number, the thread number executing.
For the given CPU number, the thread number measured to be executing.
mtask#_cpu
For the given mtask id, the CPU it is executing on.
For the given mtask id, the CPU it was measured to execute on.
thread#_mtask
For the given thread number, the mtask id executing.
For the given thread number, the mtask id it was executing.
predicted_thread#_mtask
For the given thread number, the mtask id Verilator predicted would be
executing.
verilator_gantt Arguments
-------------------------

Binary file not shown.

Before

Width:  |  Height:  |  Size: 43 KiB

After

Width:  |  Height:  |  Size: 37 KiB

View File

@ -306,7 +306,7 @@ statistics.
Example verilator_gantt output, as viewed with GTKWave.
The parallelism shows the number of CPUs being used at a given moment.
The measured_parallelism shows the number of CPUs being used at a given moment.
The cpu_thread section shows which thread is executing on each of the physical CPUs.

View File

@ -160,7 +160,7 @@ void VlThreadPool::profileDump(const char* filenamep, vluint64_t tickStart, vlui
// TODO Perhaps merge with verilated_coverage output format, so can
// have a common merging and reporting tool, etc.
fprintf(fp, "VLPROFTHREAD 1.0 # Verilator thread profile dump version 1.0\n");
fprintf(fp, "VLPROFTHREAD 1.1 # Verilator thread profile dump version 1.1\n");
fprintf(fp, "VLPROF arg --threads %" VL_PRI64 "u\n", vluint64_t(m_workers.size() + 1));
fprintf(fp, "VLPROF arg +verilator+prof+threads+start+%" VL_PRI64 "u\n",
Verilated::threadContextp()->profThreadsStart());
@ -188,14 +188,30 @@ void VlThreadPool::profileDump(const char* filenamep, vluint64_t tickStart, vlui
case VlProfileRec::TYPE_BARRIER: //
printing = true;
break;
case VlProfileRec::TYPE_EVAL:
if (!printing) break;
fprintf(fp,
"VLPROF eval start %" VL_PRI64 "u elapsed %" VL_PRI64 "u"
" cpu %u on thread %u\n",
ei.m_startTime - tickStart, (ei.m_endTime - ei.m_startTime), ei.m_cpu,
thread_id);
break;
case VlProfileRec::TYPE_EVAL_LOOP:
if (!printing) break;
fprintf(fp,
"VLPROF eval_loop start %" VL_PRI64 "u elapsed %" VL_PRI64 "u"
" cpu %u on thread %u\n",
ei.m_startTime - tickStart, (ei.m_endTime - ei.m_startTime), ei.m_cpu,
thread_id);
break;
case VlProfileRec::TYPE_MTASK_RUN:
if (!printing) break;
fprintf(fp,
"VLPROF mtask %d"
" start %" VL_PRI64 "u end %" VL_PRI64 "u elapsed %" VL_PRI64 "u"
" predict_time %u cpu %u on thread %u\n",
ei.m_mtaskId, ei.m_startTime - tickStart, ei.m_endTime - tickStart,
(ei.m_endTime - ei.m_startTime), ei.m_predictTime, ei.m_cpu, thread_id);
" start %" VL_PRI64 "u elapsed %" VL_PRI64 "u"
" predict_start %u predict_cost %u cpu %u on thread %u\n",
ei.m_mtaskId, ei.m_startTime - tickStart, (ei.m_endTime - ei.m_startTime),
ei.m_predictStart, ei.m_predictCost, ei.m_cpu, thread_id);
break;
default: assert(false); break; // LCOV_EXCL_LINE
}

View File

@ -131,21 +131,36 @@ public:
class VlProfileRec final {
protected:
friend class VlThreadPool;
enum VlProfileE { TYPE_MTASK_RUN, TYPE_BARRIER };
VlProfileE m_type = TYPE_BARRIER; // Record type
vluint32_t m_mtaskId = 0; // Mtask we're logging
vluint32_t m_predictTime = 0; // How long scheduler predicted would take
vluint64_t m_startTime = 0; // Tick at start of execution
enum VlProfileE { TYPE_MTASK_RUN, TYPE_EVAL, TYPE_EVAL_LOOP, TYPE_BARRIER };
// Layout below allows efficient packing.
// Leave endTime first, so no math needed to calculate address in endRecord
vluint64_t m_endTime = 0; // Tick at end of execution
vluint64_t m_startTime = 0; // Tick at start of execution
vluint32_t m_mtaskId = 0; // Mtask we're logging
vluint32_t m_predictStart = 0; // Time scheduler predicted would start
vluint32_t m_predictCost = 0; // How long scheduler predicted would take
VlProfileE m_type = TYPE_BARRIER; // Record type
unsigned m_cpu; // Execution CPU number (at start anyways)
public:
class Barrier {};
VlProfileRec() = default;
explicit VlProfileRec(Barrier) { m_cpu = getcpu(); }
void startRecord(vluint64_t time, uint32_t mtask, uint32_t predict) {
void startEval(vluint64_t time) {
m_type = VlProfileRec::TYPE_EVAL;
m_startTime = time;
m_cpu = getcpu();
}
void startEvalLoop(vluint64_t time) {
m_type = VlProfileRec::TYPE_EVAL_LOOP;
m_startTime = time;
m_cpu = getcpu();
}
void startRecord(vluint64_t time, vluint32_t mtask, vluint32_t predictStart,
vluint32_t predictCost) {
m_type = VlProfileRec::TYPE_MTASK_RUN;
m_mtaskId = mtask;
m_predictTime = predict;
m_predictStart = predictStart;
m_predictCost = predictCost;
m_startTime = time;
m_cpu = getcpu();
}

View File

@ -323,7 +323,24 @@ class EmitCModel final : public EmitCFunc {
puts(" loop\\n\"););\n");
if (initial)
puts(topModNameProtected + "__" + protect("_eval_settle") + "(&(vlSymsp->TOP));\n");
const string recName = "__Vprfloop";
if (v3Global.opt.profThreads() && !initial) {
puts("VlProfileRec* " + recName + " = nullptr;\n");
// Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
puts("if (VL_UNLIKELY(vlSymsp->__Vm_profile_cycle_start)) {\n");
// Eval start
puts(/**/ recName + " = vlSymsp->__Vm_threadPoolp->profileAppend();\n");
puts(/**/ recName + "->startEvalLoop(VL_RDTSC_Q());\n");
puts("}\n");
}
puts(topModNameProtected + "__" + protect("_eval") + "(&(vlSymsp->TOP));\n");
if (v3Global.opt.profThreads() && !initial) {
puts("if (VL_UNLIKELY(" + recName + ")) " + recName + "->endRecord(VL_RDTSC_Q());\n");
}
if (v3Global.rootp()->changeRequest()) {
puts("if (VL_UNLIKELY(++__VclockLoop > " + cvtToStr(v3Global.opt.convergeLimit())
+ ")) {\n");
@ -354,7 +371,7 @@ class EmitCModel final : public EmitCFunc {
+ ");\n");
}
void emitStandardMethods(AstNodeModule* modp) {
void emitStandardMethods1(AstNodeModule* modp) {
UASSERT_OBJ(modp->isTop(), modp, "Attempting to emitWrapEval for non-top class");
const string topModNameProtected = prefixNameProtect(modp);
@ -385,16 +402,21 @@ class EmitCModel final : public EmitCFunc {
emitSettleLoop(modp, /* initial: */ true);
ensureNewLine();
puts("}\n");
}
void emitStandardMethods2(AstNodeModule* modp) {
const string topModNameProtected = prefixNameProtect(modp);
// ::eval_step
puts("\nvoid " + topClassName() + "::eval_step() {\n");
puts("VL_DEBUG_IF(VL_DBG_MSGF(\"+++++TOP Evaluate " + topClassName()
+ "::eval_step\\n\"); );\n");
puts("#ifdef VL_DEBUG\n");
putsDecoration("// Debug assertions\n");
puts(topModNameProtected + "__" + protect("_eval_debug_assertions")
+ "(&(vlSymsp->TOP));\n");
puts("#endif // VL_DEBUG\n");
putsDecoration("// Initialize\n");
puts("if (VL_UNLIKELY(!vlSymsp->__Vm_didInit)) " + protect("_eval_initial_loop")
+ "(vlSymsp);\n");
@ -406,13 +428,14 @@ class EmitCModel final : public EmitCFunc {
puts("Verilated::mtaskId(" + cvtToStr(mtaskId) + ");\n");
}
if (v3Global.opt.mtasks() && v3Global.opt.profThreads()) {
puts("if (VL_UNLIKELY((vlSymsp->_vm_contextp__->profThreadsStart()"
" != vlSymsp->__Vm_profile_time_finished)\n");
if (v3Global.opt.profThreads()) {
puts("if (VL_UNLIKELY((vlSymsp->_vm_contextp__->profThreadsStart() != "
"vlSymsp->__Vm_profile_time_finished)\n");
puts(" && (VL_TIME_Q() > vlSymsp->_vm_contextp__->profThreadsStart())\n");
puts(" && (vlSymsp->_vm_contextp__->profThreadsWindow() >= 1))) {\n");
// Within a profile (either starting, middle, or end)
puts(/**/ "if (vlSymsp->__Vm_profile_window_ct == 0) {\n"); // Opening file?
puts(/**/ "VL_DEBUG_IF(VL_DBG_MSGF(\"+ profile start warmup\\n\"););\n");
// Start profile on this cycle. We'll capture a window worth, then
// only analyze the next window worth. The idea is that the first window
// capture will hit some cache-cold stuff (eg printf) but it'll be warm
@ -430,12 +453,13 @@ class EmitCModel final : public EmitCFunc {
// cache-warm-up cycles before the barrier from the actual profile
// cycles afterward.
puts(/****/ "vlSymsp->__Vm_threadPoolp->profileAppendAll(");
puts(/****/ "VlProfileRec(VlProfileRec::Barrier()));\n");
puts(/****/ "VlProfileRec{VlProfileRec::Barrier{}});\n");
puts(/****/ "vlSymsp->__Vm_profile_cycle_start = VL_RDTSC_Q();\n");
puts(/**/ "}\n");
// Ending trace file?
puts(/**/ "else if (vlSymsp->__Vm_profile_window_ct == 0) {\n");
// Ending file.
puts(/****/ "vluint64_t tick_end = VL_RDTSC_Q();\n");
puts(/****/ "VL_DEBUG_IF(VL_DBG_MSGF(\"+ profile end\\n\"););\n");
puts(/****/ "vlSymsp->__Vm_threadPoolp->profileDump("
"vlSymsp->_vm_contextp__->profThreadsFilename().c_str(), "
"vlSymsp->__Vm_profile_cycle_start, "
@ -443,20 +467,41 @@ class EmitCModel final : public EmitCFunc {
// This turns off the test to enter the profiling code, but still
// allows the user to collect another profile by changing
// profThreadsStart
puts(/****/ "vlSymsp->__Vm_profile_time_finished"
" = vlSymsp->_vm_contextp__->profThreadsStart();\n");
puts(/****/ "vlSymsp->__Vm_profile_time_finished = "
"vlSymsp->_vm_contextp__->profThreadsStart();\n");
puts(/****/ "vlSymsp->__Vm_profile_cycle_start = 0;\n");
puts(/**/ "}\n");
puts("}\n");
}
const string recName = "__Vprfeval";
if (v3Global.opt.profThreads()) {
puts("VlProfileRec* " + recName + " = nullptr;\n");
// Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
puts("if (VL_UNLIKELY(vlSymsp->__Vm_profile_cycle_start)) {\n");
// Eval start
puts(/**/ recName + " = vlSymsp->__Vm_threadPoolp->profileAppend();\n");
puts(/**/ recName + "->startEval(VL_RDTSC_Q());\n");
puts("}\n");
}
emitSettleLoop(modp, /* initial: */ false);
putsDecoration("// Evaluate cleanup\n");
if (v3Global.opt.threads() == 1) {
puts("Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);\n");
}
if (v3Global.opt.threads()) puts("Verilated::endOfEval(vlSymsp->__Vm_evalMsgQp);\n");
puts("}\n");
if (v3Global.opt.profThreads()) {
// End eval record
puts("if (VL_UNLIKELY(" + recName + ")) " + recName + "->endRecord(VL_RDTSC_Q());\n");
}
puts("}\n");
}
void emitStandardMethods3(AstNodeModule* modp) {
const string topModNameProtected = prefixNameProtect(modp);
// ::eval_end_step
if (v3Global.needTraceDumper() && !optSystemC()) {
puts("\nvoid " + topClassName() + "::eval_end_step() {\n");
@ -573,7 +618,9 @@ class EmitCModel final : public EmitCFunc {
emitConstructorImplementation(modp);
emitDestructorImplementation();
emitStandardMethods(modp);
emitStandardMethods1(modp);
emitStandardMethods2(modp);
emitStandardMethods3(modp);
if (v3Global.opt.trace()) { emitTraceMethods(modp); }
if (v3Global.opt.savable()) { emitSerializationFunctions(); }

View File

@ -2237,11 +2237,11 @@ public:
std::vector<uint32_t> busyUntil(m_nThreads, 0);
// MTasks ready to be assigned next. All their dependencies are already assigned.
std::set<const ExecMTask*, MTaskCmp> readyMTasks;
std::set<ExecMTask*, MTaskCmp> readyMTasks;
// Build initial ready list
for (V3GraphVertex* vxp = mtaskGraph.verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
const ExecMTask* const mtaskp = dynamic_cast<ExecMTask*>(vxp);
ExecMTask* const mtaskp = dynamic_cast<ExecMTask*>(vxp);
if (isReady(schedule, mtaskp)) readyMTasks.insert(mtaskp);
}
@ -2250,9 +2250,9 @@ public:
// on each thread (in that thread's local time frame.)
uint32_t bestTime = 0xffffffff;
uint32_t bestThreadId = 0;
const ExecMTask* bestMtaskp = nullptr; // Todo: const ExecMTask*
ExecMTask* bestMtaskp = nullptr; // Todo: const ExecMTask*
for (uint32_t threadId = 0; threadId < m_nThreads; ++threadId) {
for (const ExecMTask* const mtaskp : readyMTasks) {
for (ExecMTask* const mtaskp : readyMTasks) {
uint32_t timeBegin = busyUntil[threadId];
if (timeBegin > bestTime) {
UINFO(6, "th " << threadId << " busy until " << timeBegin
@ -2287,10 +2287,11 @@ public:
std::vector<const ExecMTask*>& bestThread = schedule.threads[bestThreadId];
// Update algorithm state
bestMtaskp->predictStart(bestTime); // Only for gantt reporting
const uint32_t bestEndTime = bestTime + bestMtaskp->cost();
schedule.mtaskState[bestMtaskp].completionTime = bestEndTime;
schedule.mtaskState[bestMtaskp].threadId = bestThreadId;
if (!bestThread.empty()) { schedule.mtaskState[bestThread.back()].nextp = bestMtaskp; }
if (!bestThread.empty()) schedule.mtaskState[bestThread.back()].nextp = bestMtaskp;
busyUntil[bestThreadId] = bestEndTime;
// Add the MTask to the schedule
@ -2301,7 +2302,7 @@ public:
UASSERT_OBJ(erased > 0, bestMtaskp, "Should have erased something?");
for (V3GraphEdge* edgeOutp = bestMtaskp->outBeginp(); edgeOutp;
edgeOutp = edgeOutp->outNextp()) {
const ExecMTask* const nextp = dynamic_cast<ExecMTask*>(edgeOutp->top());
ExecMTask* const nextp = dynamic_cast<ExecMTask*>(edgeOutp->top());
// Dependent MTask should not yet be assigned to a thread
UASSERT(schedule.threadId(nextp) == ThreadSchedule::UNASSIGNED,
"Tasks after one being assigned should not be assigned yet");
@ -2713,6 +2714,7 @@ static void addMTaskToFunction(const ThreadSchedule& schedule, const uint32_t th
recName + " = vlSymsp->__Vm_threadPoolp->profileAppend();\n" + //
recName + "->startRecord(VL_RDTSC_Q()," + //
" " + cvtToStr(mtaskp->id()) + "," + //
" " + cvtToStr(mtaskp->predictStart()) + "," + //
" " + cvtToStr(mtaskp->cost()) + ");\n" + //
"}\n");
}
@ -2724,9 +2726,8 @@ static void addMTaskToFunction(const ThreadSchedule& schedule, const uint32_t th
funcp->addStmtsp(mtaskp->bodyp()->unlinkFrBack());
if (v3Global.opt.profThreads()) {
// Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
addStrStmt("if (VL_UNLIKELY(" + recName + ")) {\n" + //
recName + "->endRecord(VL_RDTSC_Q());\n" + "}\n");
addStrStmt("if (VL_UNLIKELY(" + recName + ")) " //
+ recName + "->endRecord(VL_RDTSC_Q());\n");
}
// Flush message queue

View File

@ -61,6 +61,7 @@ private:
// mtask. In abstract time units.
uint32_t m_cost = 0; // Predicted runtime of this mtask, in the same
// abstract time units as priority().
uint64_t m_predictStart = 0; // Predicted start time of task
VL_UNCOPYABLE(ExecMTask);
public:
@ -74,6 +75,8 @@ public:
void priority(uint32_t pri) { m_priority = pri; }
virtual uint32_t cost() const override { return m_cost; }
void cost(uint32_t cost) { m_cost = cost; }
void predictStart(vluint64_t time) { m_predictStart = time; }
vluint64_t predictStart() const { return m_predictStart; }
string cFuncName() const {
// If this MTask maps to a C function, this should be the name
return string("__Vmtask") + "__" + cvtToStr(m_id);

File diff suppressed because it is too large Load Diff

View File

@ -6,33 +6,37 @@ Argument settings:
--threads 2
Thread gantt graph:
Legend: One character width = 100 rdtsc ticks
Legend: One character width = 136 rdtsc ticks
Legend: '&' = multiple mtasks in this period (character width)
<-16065 rdtsc total-------------------------------------------------------------------------------------------------------------------------------------------->
t: [1] [1] [1] [1]
t: [16-] [16][16-------] [16--] [16] [16-] [1[] [16[xxx
<-21875 rdtsc total-------------------------------------------------------------------------------------------------------------------------------------------->
t: [19] [19] [19] [19]
t: [10-] [10-] [10----] [10-] [10---] [10-][10][10] [10][10]
Analysis:
Total threads = 2
Total mtasks = 7
Total cpus used = 2
Total yields = 1
Total eval time = 16065 rdtsc ticks
Longest mtask time = 1085 rdtsc ticks
All-thread mtask time = 3430 rdtsc ticks
Longest-thread efficiency = 6.8%
All-thread efficiency = 10.7%
All-thread speedup = 0.2
Total yields = 0
Total eval time = 21875 rdtsc ticks
Longest mtask time = 1190 rdtsc ticks
All-thread mtask time = 5495 rdtsc ticks
Longest-thread efficiency = 5.4%
All-thread efficiency = 12.6%
All-thread speedup = 0.3
Prediction (what Verilator used for scheduling):
All-thread efficiency = 63.2%
All-thread speedup = 1.3
Statistics:
min log(p2e) = -3.332 from mtask 5 (predict 30, elapsed 840)
max log(p2e) = -1.764 from mtask 7 (predict 30, elapsed 175)
mean = -2.365
stddev = 0.562
e ^ stddev = 1.754
min log(p2e) = -3.681 from mtask 5 (predict 30, elapsed 1190)
max log(p2e) = -2.409 from mtask 8 (predict 107, elapsed 1190)
mean = -2.992
stddev = 0.459
e ^ stddev = 1.583
CPUs:
cpu 1: cpu_time=525 socket=0 core=1 Test Ryzen 9 3950X 16-Core Processor
cpu 16: cpu_time=2905 socket=0 core=0 Test Ryzen 9 3950X 16-Core Processor
cpu 10: cpu_time=4725 socket=0 core=10 Test Ryzen 9 3950X 16-Core Processor
cpu 19: cpu_time=770 socket=0 core=3 Test Ryzen 9 3950X 16-Core Processor
Writing profile_threads.vcd

View File

@ -3,171 +3,256 @@ $timescale 1ns $end
$scope module gantt $end
$scope module Stats $end
$var wire 32 vb parallelism [31:0] $end
$var wire 32 vf measured_parallelism [31:0] $end
$var wire 32 v10 predicted_parallelism [31:0] $end
$upscope $end
$scope module cpus $end
$var wire 32 v1 cpu16_thread [31:0] $end
$var wire 32 v8 cpu1_thread [31:0] $end
$var wire 32 v5 cpu10_thread [31:0] $end
$var wire 32 v1 cpu19_thread [31:0] $end
$upscope $end
$scope module evals $end
$var wire 32 vb eval [31:0] $end
$var wire 32 vc eval_loop [31:0] $end
$upscope $end
$scope module measured_threads $end
$var wire 32 v0 thread1_mtask [31:0] $end
$var wire 32 v4 thread2_mtask [31:0] $end
$upscope $end
$scope module mtasks $end
$var wire 32 va mtask10_cpu [31:0] $end
$var wire 32 v6 mtask11_cpu [31:0] $end
$var wire 32 v2 mtask5_cpu [31:0] $end
$var wire 32 v9 mtask6_cpu [31:0] $end
$var wire 32 v3 mtask7_cpu [31:0] $end
$var wire 32 v4 mtask8_cpu [31:0] $end
$var wire 32 v5 mtask9_cpu [31:0] $end
$var wire 32 v3 mtask10_cpu [31:0] $end
$var wire 32 va mtask11_cpu [31:0] $end
$var wire 32 v6 mtask5_cpu [31:0] $end
$var wire 32 v2 mtask6_cpu [31:0] $end
$var wire 32 v7 mtask7_cpu [31:0] $end
$var wire 32 v8 mtask8_cpu [31:0] $end
$var wire 32 v9 mtask9_cpu [31:0] $end
$upscope $end
$scope module threads $end
$var wire 32 v7 thread1_mtask [31:0] $end
$var wire 32 v0 thread2_mtask [31:0] $end
$scope module predicted_threads $end
$var wire 32 vd predicted_thread1_mtask [31:0] $end
$var wire 32 ve predicted_thread2_mtask [31:0] $end
$upscope $end
$upscope $end
$enddefinitions $end
#0
bz v0
bz v1
bz v2
bz v3
bz v4
bz v5
bz v6
bz v7
bz v8
bz v9
bz va
bz vb
bz vc
bz vd
bz ve
b0 vf
b0 v10
#595
b1 vb
#945
b1 vc
b110 vd
b101 ve
b10 v10
#2429
bz vd
b111 ve
b1 v10
#2695
b110 v0
b1 v1
b10011 v2
b1 vf
#2905
bz v0
bz v1
bz va
b1 vb
bz v2
bz v3
b0 vf
#3914
b1000 ve
b1 v10
#5495
b101 v4
b10 v5
b1010 v6
b1 vf
#6090
bz v4
bz v5
bz v6
b110 v7
b1 v8
b1 v9
#735
b0 vb
bz v7
bz v8
bz v9
#5110
b101 v0
b10 v1
b1 vb
b10000 v2
#5530
bz v0
bz v1
b0 vb
bz v2
#5740
b111 v0
b10 v1
b1 vb
b10000 v3
#5845
bz v0
bz v1
b0 vb
bz v3
#6125
b1000 v0
b10 v1
b1 vb
b10000 v4
#7140
bz v0
bz v1
b0 vb
bz v4
#8120
b1 va
b1 vb
b0 vf
#6300
b111 v4
b10 v5
b1010 v7
b1 v8
#8260
bz va
b0 vb
bz v7
bz v8
#8820
b1001 v0
b10 v1
b1 vb
b10000 v5
#9380
bz v0
bz v1
b0 vb
bz v5
#9940
b1011 v0
b10 v1
b1 vb
b10000 v6
#10045
bz v0
bz v1
b0 vb
bz v6
#11970
b1 vb
b110 v7
b1 v8
b1 v9
#12075
b0 vb
bz v7
bz v8
bz v9
#14175
b101 v0
b10 v1
b1 vb
b10000 v2
#14595
bz v0
bz v1
b0 vb
bz v2
#15120
b111 v0
b10 v1
b1 vb
b10000 v3
#15190
bz v0
bz v1
b0 vb
bz v3
#15365
b1000 v0
b10 v1
b1 vb
b10000 v4
#15435
bz v0
bz v1
b0 vb
b1 vf
#6895
bz v4
#15680
b1001 v0
b10 v1
b1 vb
b10000 v5
#15750
bz v5
bz v7
b0 vf
#7490
b1000 v4
b10 v5
b1010 v8
b1 vf
#8540
bz v4
bz v5
bz v8
b0 vf
#9135
b1001 v4
b10 v5
b1010 v9
b1 vf
#9210
b1001 ve
b1 v10
#9695
b1010 v0
b1 v1
b10011 v3
b10 vf
#9730
bz v4
bz v5
bz v9
b1 vf
#9870
bz v0
bz v1
b0 vb
bz v3
b0 vf
#10255
b1011 v4
b10 v5
b1010 va
b1 vf
#10645
b1010 vd
b10 v10
#10695
b1011 ve
b10 v10
#11060
bz v4
bz v5
#15925
b1011 v0
b10 v1
b1 va
bz va
b0 vf
#12130
bz vd
b1 v10
#12180
bz vc
bz ve
b0 v10
#12250
bz vb
#13720
b10 vb
b10000 v6
b1010 v7
b1 v8
#15995
#14000
b10 vc
b110 vd
b101 ve
b10 v10
#15068
bz vd
b111 ve
b1 v10
#15610
b110 v0
b1 v1
b10011 v2
b1 vf
#15820
bz v0
bz v1
b1 vb
bz v2
b0 vf
#16137
b1000 ve
b1 v10
#18375
b101 v4
b10 v5
b1010 v6
b1 vf
#18970
bz v4
bz v5
bz v6
#16065
bz va
b0 vb
b0 vf
#19145
b111 v4
b10 v5
b1010 v7
b1 vf
#19320
bz v4
bz v5
bz v7
b0 vf
#19670
b1000 v4
b10 v5
b1010 v8
b1 vf
#19810
bz v4
bz v5
bz v8
b0 vf
#19947
b1001 ve
b1 v10
#20650
b1001 v4
b10 v5
b1010 v9
b1 vf
#20720
bz v4
bz v5
bz v9
b0 vf
#20980
b1010 vd
b10 v10
#21016
b1011 ve
b10 v10
#21140
b1011 v4
b10 v5
b1010 va
b1 vf
#21245
bz v4
bz v5
bz va
b0 vf
#21700
b1010 v0
b1 v1
b10011 v3
b1 vf
#21875
bz v0
bz v1
bz v3
b0 vf
#22049
bz vd
b1 v10
#22085
bz vc
bz ve
b0 v10
#22330
bz vb