forked from github/verilator
Verilator_gantt now shows the predicted mtask times, eval times, and additional statistics.
This commit is contained in:
parent
68f1432a68
commit
c2819923c5
3
Changes
3
Changes
@ -11,7 +11,8 @@ contributors that suggested a given feature are shown in []. Thanks!
|
|||||||
Verilator 4.213 devel
|
Verilator 4.213 devel
|
||||||
==========================
|
==========================
|
||||||
|
|
||||||
* Include processor information in verilator_gantt data file.
|
* Verilator_gantt now shows the predicted mtask times, eval times, and additional statistics.
|
||||||
|
* Verilator_gantt data files now include processor information, to allow later processing.
|
||||||
* Fix verilator_profcfunc profile accounting (#3115).
|
* Fix verilator_profcfunc profile accounting (#3115).
|
||||||
* Fix display has no time units on class function (#3116). [Damien Pretet]
|
* Fix display has no time units on class function (#3116). [Damien Pretet]
|
||||||
* Fix removing if statement with side effect in condition (#3131). [Alexander Grobman]
|
* Fix removing if statement with side effect in condition (#3131). [Alexander Grobman]
|
||||||
|
@ -11,6 +11,8 @@ import statistics
|
|||||||
|
|
||||||
Threads = collections.defaultdict(lambda: {})
|
Threads = collections.defaultdict(lambda: {})
|
||||||
Mtasks = collections.defaultdict(lambda: {})
|
Mtasks = collections.defaultdict(lambda: {})
|
||||||
|
Evals = collections.defaultdict(lambda: {})
|
||||||
|
EvalLoops = collections.defaultdict(lambda: {})
|
||||||
Global = {
|
Global = {
|
||||||
'args': {},
|
'args': {},
|
||||||
'cpuinfo': collections.defaultdict(lambda: {}),
|
'cpuinfo': collections.defaultdict(lambda: {}),
|
||||||
@ -29,8 +31,11 @@ def process(filename):
|
|||||||
def read_data(filename):
|
def read_data(filename):
|
||||||
with open(filename) as fh:
|
with open(filename) as fh:
|
||||||
re_prof = re.compile(
|
re_prof = re.compile(
|
||||||
r'^VLPROF mtask\s(\d+)\sstart\s(\d+)\send\s(\d+)\selapsed\s(\d+)\spredict_time\s(\d+)\scpu\s(\d+)\son thread (\d+)'
|
r'^VLPROF mtask\s(\d+)\sstart\s(\d+)\selapsed\s(\d+)\spredict_start\s(\d+)\spredict_cost\s(\d+)\scpu\s(\d+)\son thread (\d+)'
|
||||||
)
|
)
|
||||||
|
re_eval = re.compile(r'^VLPROF eval\sstart\s(\d+)\selapsed\s(\d+)')
|
||||||
|
re_loop = re.compile(
|
||||||
|
r'^VLPROF eval_loop\sstart\s(\d+)\selapsed\s(\d+)')
|
||||||
re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.])\s*')
|
re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.])\s*')
|
||||||
re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.])\s*$')
|
re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.])\s*$')
|
||||||
re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
|
re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
|
||||||
@ -44,9 +49,10 @@ def read_data(filename):
|
|||||||
match = re_prof.match(line)
|
match = re_prof.match(line)
|
||||||
mtask = int(match.group(1))
|
mtask = int(match.group(1))
|
||||||
start = int(match.group(2))
|
start = int(match.group(2))
|
||||||
end = int(match.group(3))
|
elapsed_time = int(match.group(3))
|
||||||
elapsed_time = int(match.group(4))
|
end = start + elapsed_time
|
||||||
predict_time = int(match.group(5))
|
predict_start = int(match.group(4))
|
||||||
|
predict_cost = int(match.group(5))
|
||||||
cpu = int(match.group(6))
|
cpu = int(match.group(6))
|
||||||
thread = int(match.group(7))
|
thread = int(match.group(7))
|
||||||
if start not in Threads[thread]:
|
if start not in Threads[thread]:
|
||||||
@ -54,12 +60,28 @@ def read_data(filename):
|
|||||||
Threads[thread][start]['mtask'] = mtask
|
Threads[thread][start]['mtask'] = mtask
|
||||||
Threads[thread][start]['end'] = end
|
Threads[thread][start]['end'] = end
|
||||||
Threads[thread][start]['cpu'] = cpu
|
Threads[thread][start]['cpu'] = cpu
|
||||||
|
Threads[thread][start]['predict_start'] = predict_start
|
||||||
|
Threads[thread][start]['predict_cost'] = predict_cost
|
||||||
|
|
||||||
if 'elapsed' not in Mtasks[mtask]:
|
if 'elapsed' not in Mtasks[mtask]:
|
||||||
Mtasks[mtask] = {'end': 0, 'elapsed': 0}
|
Mtasks[mtask] = {'end': 0, 'elapsed': 0}
|
||||||
|
Mtasks[mtask]['thread'] = thread
|
||||||
Mtasks[mtask]['elapsed'] += elapsed_time
|
Mtasks[mtask]['elapsed'] += elapsed_time
|
||||||
Mtasks[mtask]['predict_cost'] = predict_time
|
Mtasks[mtask]['predict_start'] = predict_start
|
||||||
|
Mtasks[mtask]['predict_cost'] = predict_cost
|
||||||
Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], end)
|
Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], end)
|
||||||
|
elif re_eval.match(line):
|
||||||
|
match = re_eval.match(line)
|
||||||
|
start = int(match.group(1))
|
||||||
|
elapsed_time = int(match.group(2))
|
||||||
|
Evals[start]['start'] = start
|
||||||
|
Evals[start]['end'] = start + elapsed_time
|
||||||
|
elif re_loop.match(line):
|
||||||
|
match = re_loop.match(line)
|
||||||
|
start = int(match.group(1))
|
||||||
|
elapsed_time = int(match.group(2))
|
||||||
|
EvalLoops[start]['start'] = start
|
||||||
|
EvalLoops[start]['end'] = start + elapsed_time
|
||||||
elif re.match(r'^VLPROFTHREAD', line):
|
elif re.match(r'^VLPROFTHREAD', line):
|
||||||
None # pylint: disable=pointless-statement
|
None # pylint: disable=pointless-statement
|
||||||
elif re_arg1.match(line):
|
elif re_arg1.match(line):
|
||||||
@ -120,14 +142,21 @@ def report():
|
|||||||
Global['cpus'][cpu] = {'cpu_time': 0}
|
Global['cpus'][cpu] = {'cpu_time': 0}
|
||||||
Global['cpus'][cpu]['cpu_time'] += elapsed
|
Global['cpus'][cpu]['cpu_time'] += elapsed
|
||||||
|
|
||||||
mt_mtask_time = 0
|
measured_mt_mtask_time = 0
|
||||||
|
predict_mt_mtask_time = 0
|
||||||
long_mtask_time = 0
|
long_mtask_time = 0
|
||||||
last_end = 0
|
measured_last_end = 0
|
||||||
|
predict_last_end = 0
|
||||||
for mtask in Mtasks:
|
for mtask in Mtasks:
|
||||||
mt_mtask_time += Mtasks[mtask]['elapsed']
|
measured_mt_mtask_time += Mtasks[mtask]['elapsed']
|
||||||
last_end = max(last_end, Mtasks[mtask]['end'])
|
predict_mt_mtask_time += Mtasks[mtask]['predict_cost']
|
||||||
|
measured_last_end = max(measured_last_end, Mtasks[mtask]['end'])
|
||||||
|
predict_last_end = max(
|
||||||
|
predict_last_end,
|
||||||
|
Mtasks[mtask]['predict_start'] + Mtasks[mtask]['predict_cost'])
|
||||||
long_mtask_time = max(long_mtask_time, Mtasks[mtask]['elapsed'])
|
long_mtask_time = max(long_mtask_time, Mtasks[mtask]['elapsed'])
|
||||||
Global['last_end'] = last_end
|
Global['measured_last_end'] = measured_last_end
|
||||||
|
Global['predict_last_end'] = predict_last_end
|
||||||
|
|
||||||
report_graph()
|
report_graph()
|
||||||
|
|
||||||
@ -145,19 +174,30 @@ def report():
|
|||||||
ncpus = len(Global['cpus'])
|
ncpus = len(Global['cpus'])
|
||||||
print(" Total cpus used = %d" % ncpus)
|
print(" Total cpus used = %d" % ncpus)
|
||||||
print(" Total yields = %d" % int(Global['stats']['yields']))
|
print(" Total yields = %d" % int(Global['stats']['yields']))
|
||||||
print(" Total eval time = %d rdtsc ticks" % Global['last_end'])
|
print(" Total eval time = %d rdtsc ticks" %
|
||||||
|
Global['measured_last_end'])
|
||||||
print(" Longest mtask time = %d rdtsc ticks" % long_mtask_time)
|
print(" Longest mtask time = %d rdtsc ticks" % long_mtask_time)
|
||||||
print(" All-thread mtask time = %d rdtsc ticks" % mt_mtask_time)
|
print(" All-thread mtask time = %d rdtsc ticks" %
|
||||||
long_efficiency = long_mtask_time / (Global.get('last_end', 1))
|
measured_mt_mtask_time)
|
||||||
|
long_efficiency = long_mtask_time / (Global.get('measured_last_end', 1) or 1)
|
||||||
print(" Longest-thread efficiency = %0.1f%%" % (long_efficiency * 100.0))
|
print(" Longest-thread efficiency = %0.1f%%" % (long_efficiency * 100.0))
|
||||||
mt_efficiency = mt_mtask_time / (Global.get('last_end', 1) * nthreads or 1)
|
mt_efficiency = measured_mt_mtask_time / (
|
||||||
|
Global.get('measured_last_end', 1) * nthreads or 1)
|
||||||
print(" All-thread efficiency = %0.1f%%" % (mt_efficiency * 100.0))
|
print(" All-thread efficiency = %0.1f%%" % (mt_efficiency * 100.0))
|
||||||
print(" All-thread speedup = %0.1f" % (mt_efficiency * nthreads))
|
print(" All-thread speedup = %0.1f" % (mt_efficiency * nthreads))
|
||||||
if Global['rdtsc_cycle_time'] > 0:
|
if Global['rdtsc_cycle_time'] > 0:
|
||||||
ut = mt_mtask_time / Global['rdtsc_cycle_time']
|
ut = measured_mt_mtask_time / Global['rdtsc_cycle_time']
|
||||||
print("tot_mtask_cpu=" + mt_mtask_time + " cyc=" +
|
print("tot_mtask_cpu=" + measured_mt_mtask_time + " cyc=" +
|
||||||
Global['rdtsc_cycle_time'] + " ut=" + ut)
|
Global['rdtsc_cycle_time'] + " ut=" + ut)
|
||||||
|
|
||||||
|
predict_mt_efficiency = predict_mt_mtask_time / (
|
||||||
|
Global.get('predict_last_end', 1) * nthreads or 1)
|
||||||
|
print("\nPrediction (what Verilator used for scheduling):")
|
||||||
|
print(" All-thread efficiency = %0.1f%%" %
|
||||||
|
(predict_mt_efficiency * 100.0))
|
||||||
|
print(" All-thread speedup = %0.1f" %
|
||||||
|
(predict_mt_efficiency * nthreads))
|
||||||
|
|
||||||
p2e_ratios = []
|
p2e_ratios = []
|
||||||
min_p2e = 1000000
|
min_p2e = 1000000
|
||||||
min_mtask = None
|
min_mtask = None
|
||||||
@ -253,7 +293,7 @@ def report_cpus():
|
|||||||
def report_graph():
|
def report_graph():
|
||||||
time_per = Args.scale
|
time_per = Args.scale
|
||||||
if time_per == 0:
|
if time_per == 0:
|
||||||
time_per = Global['last_end'] / 40 # Start with 40 columns
|
time_per = Global['measured_last_end'] / 40 # Start with 40 columns
|
||||||
while time_per > 10:
|
while time_per > 10:
|
||||||
(graph, conflicts) = _make_graph(time_per)
|
(graph, conflicts) = _make_graph(time_per)
|
||||||
if not conflicts:
|
if not conflicts:
|
||||||
@ -270,8 +310,9 @@ def report_graph():
|
|||||||
print(" Legend: One character width = %s rdtsc ticks" % time_per)
|
print(" Legend: One character width = %s rdtsc ticks" % time_per)
|
||||||
print(" Legend: '&' = multiple mtasks in this period (character width)")
|
print(" Legend: '&' = multiple mtasks in this period (character width)")
|
||||||
|
|
||||||
scale = " <-%d rdtsc total" % Global['last_end']
|
scale = " <-%d rdtsc total" % Global['measured_last_end']
|
||||||
for col in range(len(scale), int(0.99 + (Global['last_end'] / time_per))): # pylint: disable=unused-variable
|
for col in range(len(scale),
|
||||||
|
int(0.99 + (Global['measured_last_end'] / time_per))): # pylint: disable=unused-variable
|
||||||
scale += "-"
|
scale += "-"
|
||||||
print(" " + scale + "->")
|
print(" " + scale + "->")
|
||||||
|
|
||||||
@ -361,30 +402,39 @@ def write_vcd(filename):
|
|||||||
'values':
|
'values':
|
||||||
collections.defaultdict(lambda: {}), # {<time>}{<code>} = value
|
collections.defaultdict(lambda: {}), # {<time>}{<code>} = value
|
||||||
'sigs': {
|
'sigs': {
|
||||||
'threads': {},
|
'predicted_threads': {},
|
||||||
|
'measured_threads': {},
|
||||||
'cpus': {},
|
'cpus': {},
|
||||||
|
'evals': {},
|
||||||
'mtasks': {},
|
'mtasks': {},
|
||||||
'Stats': {}
|
'Stats': {}
|
||||||
} # {<module>}{<sig}} = code
|
} # {<module>}{<sig}} = code
|
||||||
}
|
}
|
||||||
code = 0
|
code = 0
|
||||||
|
|
||||||
parallelism = collections.defaultdict(lambda: 0)
|
parallelism = {
|
||||||
|
'measured': collections.defaultdict(lambda: 0),
|
||||||
|
'predicted': collections.defaultdict(lambda: 0)
|
||||||
|
}
|
||||||
|
parallelism['measured'][0] = 0
|
||||||
|
parallelism['predicted'][0] = 0
|
||||||
|
|
||||||
|
# Measured graph
|
||||||
for thread in sorted(Threads.keys()):
|
for thread in sorted(Threads.keys()):
|
||||||
sig = "thread%d_mtask" % thread
|
sig = "thread%d_mtask" % thread
|
||||||
if sig not in vcd['sigs']['threads']:
|
if sig not in vcd['sigs']['measured_threads']:
|
||||||
vcd['sigs']['threads'][sig] = code
|
vcd['sigs']['measured_threads'][sig] = code
|
||||||
code += 1
|
code += 1
|
||||||
mcode = vcd['sigs']['threads'][sig]
|
mcode = vcd['sigs']['measured_threads'][sig]
|
||||||
|
|
||||||
for start in sorted(Threads[thread]):
|
for start in sorted(Threads[thread]):
|
||||||
end = Threads[thread][start]['end']
|
|
||||||
mtask = Threads[thread][start]['mtask']
|
mtask = Threads[thread][start]['mtask']
|
||||||
|
end = Threads[thread][start]['end']
|
||||||
cpu = Threads[thread][start]['cpu']
|
cpu = Threads[thread][start]['cpu']
|
||||||
vcd['values'][start][mcode] = mtask
|
vcd['values'][start][mcode] = mtask
|
||||||
vcd['values'][end][mcode] = None
|
vcd['values'][end][mcode] = None
|
||||||
parallelism[start] += 1
|
parallelism['measured'][start] += 1
|
||||||
parallelism[end] -= 1
|
parallelism['measured'][end] -= 1
|
||||||
|
|
||||||
sig = "cpu%d_thread" % cpu
|
sig = "cpu%d_thread" % cpu
|
||||||
if sig not in vcd['sigs']['cpus']:
|
if sig not in vcd['sigs']['cpus']:
|
||||||
@ -402,16 +452,68 @@ def write_vcd(filename):
|
|||||||
vcd['values'][start][ccode] = cpu
|
vcd['values'][start][ccode] = cpu
|
||||||
vcd['values'][end][ccode] = None
|
vcd['values'][end][ccode] = None
|
||||||
|
|
||||||
# Parallelism graph
|
# Eval graph
|
||||||
vcd['sigs']['Stats']["parallelism"] = code
|
vcd['sigs']['evals']["eval"] = code
|
||||||
pcode = code
|
elcode = code
|
||||||
code += 1
|
code += 1
|
||||||
|
n = 0
|
||||||
|
for eval_start in Evals:
|
||||||
|
eval_end = Evals[eval_start]['end']
|
||||||
|
n += 1
|
||||||
|
vcd['values'][eval_start][elcode] = n
|
||||||
|
vcd['values'][eval_end][elcode] = None
|
||||||
|
|
||||||
value = 0
|
# Eval_loop graph
|
||||||
for time in sorted(parallelism.keys()):
|
vcd['sigs']['evals']["eval_loop"] = code
|
||||||
value += parallelism[time]
|
elcode = code
|
||||||
vcd['values'][time][pcode] = value
|
code += 1
|
||||||
|
n = 0
|
||||||
|
for eval_start in EvalLoops:
|
||||||
|
eval_end = EvalLoops[eval_start]['end']
|
||||||
|
n += 1
|
||||||
|
vcd['values'][eval_start][elcode] = n
|
||||||
|
vcd['values'][eval_end][elcode] = None
|
||||||
|
|
||||||
|
# Predicted graph
|
||||||
|
for eval_start in EvalLoops:
|
||||||
|
eval_end = EvalLoops[eval_start]['end']
|
||||||
|
# Compute scale so predicted graph is of same width as eval
|
||||||
|
measured_scaling = (eval_end -
|
||||||
|
eval_start) / Global['predict_last_end']
|
||||||
|
# Predict mtasks that fill the time the eval occupied
|
||||||
|
for mtask in Mtasks:
|
||||||
|
thread = Mtasks[mtask]['thread']
|
||||||
|
pred_scaled_start = eval_start + int(
|
||||||
|
Mtasks[mtask]['predict_start'] * measured_scaling)
|
||||||
|
pred_scaled_end = eval_start + int(
|
||||||
|
(Mtasks[mtask]['predict_start'] +
|
||||||
|
Mtasks[mtask]['predict_cost']) * measured_scaling)
|
||||||
|
if pred_scaled_start == pred_scaled_end:
|
||||||
|
continue
|
||||||
|
|
||||||
|
sig = "predicted_thread%d_mtask" % thread
|
||||||
|
if sig not in vcd['sigs']['predicted_threads']:
|
||||||
|
vcd['sigs']['predicted_threads'][sig] = code
|
||||||
|
code += 1
|
||||||
|
mcode = vcd['sigs']['predicted_threads'][sig]
|
||||||
|
|
||||||
|
vcd['values'][pred_scaled_start][mcode] = mtask
|
||||||
|
vcd['values'][pred_scaled_end][mcode] = None
|
||||||
|
|
||||||
|
parallelism['predicted'][pred_scaled_start] += 1
|
||||||
|
parallelism['predicted'][pred_scaled_end] -= 1
|
||||||
|
|
||||||
|
# Parallelism graph
|
||||||
|
for measpred in ('measured', 'predicted'):
|
||||||
|
vcd['sigs']['Stats']["%s_parallelism" % measpred] = code
|
||||||
|
pcode = code
|
||||||
|
code += 1
|
||||||
|
value = 0
|
||||||
|
for time in sorted(parallelism[measpred].keys()):
|
||||||
|
value += parallelism[measpred][time]
|
||||||
|
vcd['values'][time][pcode] = value
|
||||||
|
|
||||||
|
# Create output file
|
||||||
fh.write("$version Generated by verilator_gantt $end\n")
|
fh.write("$version Generated by verilator_gantt $end\n")
|
||||||
fh.write("$timescale 1ns $end\n")
|
fh.write("$timescale 1ns $end\n")
|
||||||
fh.write("\n")
|
fh.write("\n")
|
||||||
|
@ -27,25 +27,51 @@ indicates multiple mtasks started at that time position.
|
|||||||
Also creates a value change dump (VCD) format dump file which may be viewed
|
Also creates a value change dump (VCD) format dump file which may be viewed
|
||||||
in a waveform viewer (e.g. C<GTKWave>). See below.
|
in a waveform viewer (e.g. C<GTKWave>). See below.
|
||||||
|
|
||||||
|
.. figure:: figures/fig_gantt_min.png
|
||||||
|
|
||||||
|
Example verilator_gantt output, as viewed with GTKWave.
|
||||||
|
|
||||||
|
|
||||||
Gantt Chart VCD Signals
|
Gantt Chart VCD Signals
|
||||||
-----------------------
|
-----------------------
|
||||||
|
|
||||||
In waveforms there are the following signals. Most signals the "decimal"
|
In waveforms there are the following signals. In GTKWave, using a data
|
||||||
format will remove the leading zeros and make the traces easier to read.
|
format of "decimal" will remove the leading zeros and make the traces
|
||||||
|
easier to read.
|
||||||
|
|
||||||
parallelism
|
evals
|
||||||
|
Increments each time eval_step was measured to be active. This
|
||||||
|
allows visualization of how much time eval_step was active.
|
||||||
|
|
||||||
|
eval_loop
|
||||||
|
Increments each time the evaluation loop within eval_step was
|
||||||
|
measured to be active. For best performance there is only a single
|
||||||
|
evaluation loop within each eval_step call; that is, the eval_loop
|
||||||
|
waveform looks identical to the evals waveform.
|
||||||
|
|
||||||
|
measured_parallelism
|
||||||
The number of mtasks active at this time, for best performance this will
|
The number of mtasks active at this time, for best performance this will
|
||||||
match the thread count. You may want to use an "analog step" format to
|
match the thread count. In GTKWave, use a data format of "analog step" to
|
||||||
view this signal.
|
view this signal.
|
||||||
|
|
||||||
|
predicted_parallelism
|
||||||
|
The number of mtasks Verilator predicted would be active at this time,
|
||||||
|
for best performance this will match the thread count. In GTKWave, use a
|
||||||
|
data format of "analog step" to view this signal.
|
||||||
|
|
||||||
cpu#_thread
|
cpu#_thread
|
||||||
For the given CPU number, the thread number executing.
|
For the given CPU number, the thread number measured to be executing.
|
||||||
|
|
||||||
mtask#_cpu
|
mtask#_cpu
|
||||||
For the given mtask id, the CPU it is executing on.
|
For the given mtask id, the CPU it was measured to execute on.
|
||||||
|
|
||||||
thread#_mtask
|
thread#_mtask
|
||||||
For the given thread number, the mtask id executing.
|
For the given thread number, the mtask id it was measured to be executing.
|
||||||
|
|
||||||
|
predicted_thread#_mtask
|
||||||
|
For the given thread number, the mtask id Verilator predicted would be
|
||||||
|
executing.
|
||||||
|
|
||||||
|
|
||||||
verilator_gantt Arguments
|
verilator_gantt Arguments
|
||||||
-------------------------
|
-------------------------
|
||||||
|
Binary file not shown.
Before Width: | Height: | Size: 43 KiB After Width: | Height: | Size: 37 KiB |
@ -306,7 +306,7 @@ statistics.
|
|||||||
|
|
||||||
Example verilator_gantt output, as viewed with GTKWave.
|
Example verilator_gantt output, as viewed with GTKWave.
|
||||||
|
|
||||||
The parallelism shows the number of CPUs being used at a given moment.
|
The measured_parallelism shows the number of CPUs being used at a given moment.
|
||||||
|
|
||||||
The cpu_thread section shows which thread is executing on each of the physical CPUs.
|
The cpu_thread section shows which thread is executing on each of the physical CPUs.
|
||||||
|
|
||||||
|
@ -160,7 +160,7 @@ void VlThreadPool::profileDump(const char* filenamep, vluint64_t tickStart, vlui
|
|||||||
|
|
||||||
// TODO Perhaps merge with verilated_coverage output format, so can
|
// TODO Perhaps merge with verilated_coverage output format, so can
|
||||||
// have a common merging and reporting tool, etc.
|
// have a common merging and reporting tool, etc.
|
||||||
fprintf(fp, "VLPROFTHREAD 1.0 # Verilator thread profile dump version 1.0\n");
|
fprintf(fp, "VLPROFTHREAD 1.1 # Verilator thread profile dump version 1.1\n");
|
||||||
fprintf(fp, "VLPROF arg --threads %" VL_PRI64 "u\n", vluint64_t(m_workers.size() + 1));
|
fprintf(fp, "VLPROF arg --threads %" VL_PRI64 "u\n", vluint64_t(m_workers.size() + 1));
|
||||||
fprintf(fp, "VLPROF arg +verilator+prof+threads+start+%" VL_PRI64 "u\n",
|
fprintf(fp, "VLPROF arg +verilator+prof+threads+start+%" VL_PRI64 "u\n",
|
||||||
Verilated::threadContextp()->profThreadsStart());
|
Verilated::threadContextp()->profThreadsStart());
|
||||||
@ -188,14 +188,30 @@ void VlThreadPool::profileDump(const char* filenamep, vluint64_t tickStart, vlui
|
|||||||
case VlProfileRec::TYPE_BARRIER: //
|
case VlProfileRec::TYPE_BARRIER: //
|
||||||
printing = true;
|
printing = true;
|
||||||
break;
|
break;
|
||||||
|
case VlProfileRec::TYPE_EVAL:
|
||||||
|
if (!printing) break;
|
||||||
|
fprintf(fp,
|
||||||
|
"VLPROF eval start %" VL_PRI64 "u elapsed %" VL_PRI64 "u"
|
||||||
|
" cpu %u on thread %u\n",
|
||||||
|
ei.m_startTime - tickStart, (ei.m_endTime - ei.m_startTime), ei.m_cpu,
|
||||||
|
thread_id);
|
||||||
|
break;
|
||||||
|
case VlProfileRec::TYPE_EVAL_LOOP:
|
||||||
|
if (!printing) break;
|
||||||
|
fprintf(fp,
|
||||||
|
"VLPROF eval_loop start %" VL_PRI64 "u elapsed %" VL_PRI64 "u"
|
||||||
|
" cpu %u on thread %u\n",
|
||||||
|
ei.m_startTime - tickStart, (ei.m_endTime - ei.m_startTime), ei.m_cpu,
|
||||||
|
thread_id);
|
||||||
|
break;
|
||||||
case VlProfileRec::TYPE_MTASK_RUN:
|
case VlProfileRec::TYPE_MTASK_RUN:
|
||||||
if (!printing) break;
|
if (!printing) break;
|
||||||
fprintf(fp,
|
fprintf(fp,
|
||||||
"VLPROF mtask %d"
|
"VLPROF mtask %d"
|
||||||
" start %" VL_PRI64 "u end %" VL_PRI64 "u elapsed %" VL_PRI64 "u"
|
" start %" VL_PRI64 "u elapsed %" VL_PRI64 "u"
|
||||||
" predict_time %u cpu %u on thread %u\n",
|
" predict_start %u predict_cost %u cpu %u on thread %u\n",
|
||||||
ei.m_mtaskId, ei.m_startTime - tickStart, ei.m_endTime - tickStart,
|
ei.m_mtaskId, ei.m_startTime - tickStart, (ei.m_endTime - ei.m_startTime),
|
||||||
(ei.m_endTime - ei.m_startTime), ei.m_predictTime, ei.m_cpu, thread_id);
|
ei.m_predictStart, ei.m_predictCost, ei.m_cpu, thread_id);
|
||||||
break;
|
break;
|
||||||
default: assert(false); break; // LCOV_EXCL_LINE
|
default: assert(false); break; // LCOV_EXCL_LINE
|
||||||
}
|
}
|
||||||
|
@ -131,21 +131,36 @@ public:
|
|||||||
class VlProfileRec final {
|
class VlProfileRec final {
|
||||||
protected:
|
protected:
|
||||||
friend class VlThreadPool;
|
friend class VlThreadPool;
|
||||||
enum VlProfileE { TYPE_MTASK_RUN, TYPE_BARRIER };
|
enum VlProfileE { TYPE_MTASK_RUN, TYPE_EVAL, TYPE_EVAL_LOOP, TYPE_BARRIER };
|
||||||
VlProfileE m_type = TYPE_BARRIER; // Record type
|
// Layout below allows efficient packing.
|
||||||
vluint32_t m_mtaskId = 0; // Mtask we're logging
|
// Leave endTime first, so no math needed to calculate address in endRecord
|
||||||
vluint32_t m_predictTime = 0; // How long scheduler predicted would take
|
|
||||||
vluint64_t m_startTime = 0; // Tick at start of execution
|
|
||||||
vluint64_t m_endTime = 0; // Tick at end of execution
|
vluint64_t m_endTime = 0; // Tick at end of execution
|
||||||
|
vluint64_t m_startTime = 0; // Tick at start of execution
|
||||||
|
vluint32_t m_mtaskId = 0; // Mtask we're logging
|
||||||
|
vluint32_t m_predictStart = 0; // Time scheduler predicted would start
|
||||||
|
vluint32_t m_predictCost = 0; // How long scheduler predicted would take
|
||||||
|
VlProfileE m_type = TYPE_BARRIER; // Record type
|
||||||
unsigned m_cpu; // Execution CPU number (at start anyways)
|
unsigned m_cpu; // Execution CPU number (at start anyways)
|
||||||
public:
|
public:
|
||||||
class Barrier {};
|
class Barrier {};
|
||||||
VlProfileRec() = default;
|
VlProfileRec() = default;
|
||||||
explicit VlProfileRec(Barrier) { m_cpu = getcpu(); }
|
explicit VlProfileRec(Barrier) { m_cpu = getcpu(); }
|
||||||
void startRecord(vluint64_t time, uint32_t mtask, uint32_t predict) {
|
void startEval(vluint64_t time) {
|
||||||
|
m_type = VlProfileRec::TYPE_EVAL;
|
||||||
|
m_startTime = time;
|
||||||
|
m_cpu = getcpu();
|
||||||
|
}
|
||||||
|
void startEvalLoop(vluint64_t time) {
|
||||||
|
m_type = VlProfileRec::TYPE_EVAL_LOOP;
|
||||||
|
m_startTime = time;
|
||||||
|
m_cpu = getcpu();
|
||||||
|
}
|
||||||
|
void startRecord(vluint64_t time, vluint32_t mtask, vluint32_t predictStart,
|
||||||
|
vluint32_t predictCost) {
|
||||||
m_type = VlProfileRec::TYPE_MTASK_RUN;
|
m_type = VlProfileRec::TYPE_MTASK_RUN;
|
||||||
m_mtaskId = mtask;
|
m_mtaskId = mtask;
|
||||||
m_predictTime = predict;
|
m_predictStart = predictStart;
|
||||||
|
m_predictCost = predictCost;
|
||||||
m_startTime = time;
|
m_startTime = time;
|
||||||
m_cpu = getcpu();
|
m_cpu = getcpu();
|
||||||
}
|
}
|
||||||
|
@ -323,7 +323,24 @@ class EmitCModel final : public EmitCFunc {
|
|||||||
puts(" loop\\n\"););\n");
|
puts(" loop\\n\"););\n");
|
||||||
if (initial)
|
if (initial)
|
||||||
puts(topModNameProtected + "__" + protect("_eval_settle") + "(&(vlSymsp->TOP));\n");
|
puts(topModNameProtected + "__" + protect("_eval_settle") + "(&(vlSymsp->TOP));\n");
|
||||||
|
|
||||||
|
const string recName = "__Vprfloop";
|
||||||
|
if (v3Global.opt.profThreads() && !initial) {
|
||||||
|
puts("VlProfileRec* " + recName + " = nullptr;\n");
|
||||||
|
// Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
|
||||||
|
puts("if (VL_UNLIKELY(vlSymsp->__Vm_profile_cycle_start)) {\n");
|
||||||
|
// Eval start
|
||||||
|
puts(/**/ recName + " = vlSymsp->__Vm_threadPoolp->profileAppend();\n");
|
||||||
|
puts(/**/ recName + "->startEvalLoop(VL_RDTSC_Q());\n");
|
||||||
|
puts("}\n");
|
||||||
|
}
|
||||||
|
|
||||||
puts(topModNameProtected + "__" + protect("_eval") + "(&(vlSymsp->TOP));\n");
|
puts(topModNameProtected + "__" + protect("_eval") + "(&(vlSymsp->TOP));\n");
|
||||||
|
|
||||||
|
if (v3Global.opt.profThreads() && !initial) {
|
||||||
|
puts("if (VL_UNLIKELY(" + recName + ")) " + recName + "->endRecord(VL_RDTSC_Q());\n");
|
||||||
|
}
|
||||||
|
|
||||||
if (v3Global.rootp()->changeRequest()) {
|
if (v3Global.rootp()->changeRequest()) {
|
||||||
puts("if (VL_UNLIKELY(++__VclockLoop > " + cvtToStr(v3Global.opt.convergeLimit())
|
puts("if (VL_UNLIKELY(++__VclockLoop > " + cvtToStr(v3Global.opt.convergeLimit())
|
||||||
+ ")) {\n");
|
+ ")) {\n");
|
||||||
@ -354,7 +371,7 @@ class EmitCModel final : public EmitCFunc {
|
|||||||
+ ");\n");
|
+ ");\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
void emitStandardMethods(AstNodeModule* modp) {
|
void emitStandardMethods1(AstNodeModule* modp) {
|
||||||
UASSERT_OBJ(modp->isTop(), modp, "Attempting to emitWrapEval for non-top class");
|
UASSERT_OBJ(modp->isTop(), modp, "Attempting to emitWrapEval for non-top class");
|
||||||
|
|
||||||
const string topModNameProtected = prefixNameProtect(modp);
|
const string topModNameProtected = prefixNameProtect(modp);
|
||||||
@ -385,16 +402,21 @@ class EmitCModel final : public EmitCFunc {
|
|||||||
emitSettleLoop(modp, /* initial: */ true);
|
emitSettleLoop(modp, /* initial: */ true);
|
||||||
ensureNewLine();
|
ensureNewLine();
|
||||||
puts("}\n");
|
puts("}\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
void emitStandardMethods2(AstNodeModule* modp) {
|
||||||
|
const string topModNameProtected = prefixNameProtect(modp);
|
||||||
// ::eval_step
|
// ::eval_step
|
||||||
puts("\nvoid " + topClassName() + "::eval_step() {\n");
|
puts("\nvoid " + topClassName() + "::eval_step() {\n");
|
||||||
puts("VL_DEBUG_IF(VL_DBG_MSGF(\"+++++TOP Evaluate " + topClassName()
|
puts("VL_DEBUG_IF(VL_DBG_MSGF(\"+++++TOP Evaluate " + topClassName()
|
||||||
+ "::eval_step\\n\"); );\n");
|
+ "::eval_step\\n\"); );\n");
|
||||||
|
|
||||||
puts("#ifdef VL_DEBUG\n");
|
puts("#ifdef VL_DEBUG\n");
|
||||||
putsDecoration("// Debug assertions\n");
|
putsDecoration("// Debug assertions\n");
|
||||||
puts(topModNameProtected + "__" + protect("_eval_debug_assertions")
|
puts(topModNameProtected + "__" + protect("_eval_debug_assertions")
|
||||||
+ "(&(vlSymsp->TOP));\n");
|
+ "(&(vlSymsp->TOP));\n");
|
||||||
puts("#endif // VL_DEBUG\n");
|
puts("#endif // VL_DEBUG\n");
|
||||||
|
|
||||||
putsDecoration("// Initialize\n");
|
putsDecoration("// Initialize\n");
|
||||||
puts("if (VL_UNLIKELY(!vlSymsp->__Vm_didInit)) " + protect("_eval_initial_loop")
|
puts("if (VL_UNLIKELY(!vlSymsp->__Vm_didInit)) " + protect("_eval_initial_loop")
|
||||||
+ "(vlSymsp);\n");
|
+ "(vlSymsp);\n");
|
||||||
@ -406,13 +428,14 @@ class EmitCModel final : public EmitCFunc {
|
|||||||
puts("Verilated::mtaskId(" + cvtToStr(mtaskId) + ");\n");
|
puts("Verilated::mtaskId(" + cvtToStr(mtaskId) + ");\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (v3Global.opt.mtasks() && v3Global.opt.profThreads()) {
|
if (v3Global.opt.profThreads()) {
|
||||||
puts("if (VL_UNLIKELY((vlSymsp->_vm_contextp__->profThreadsStart()"
|
puts("if (VL_UNLIKELY((vlSymsp->_vm_contextp__->profThreadsStart() != "
|
||||||
" != vlSymsp->__Vm_profile_time_finished)\n");
|
"vlSymsp->__Vm_profile_time_finished)\n");
|
||||||
puts(" && (VL_TIME_Q() > vlSymsp->_vm_contextp__->profThreadsStart())\n");
|
puts(" && (VL_TIME_Q() > vlSymsp->_vm_contextp__->profThreadsStart())\n");
|
||||||
puts(" && (vlSymsp->_vm_contextp__->profThreadsWindow() >= 1))) {\n");
|
puts(" && (vlSymsp->_vm_contextp__->profThreadsWindow() >= 1))) {\n");
|
||||||
// Within a profile (either starting, middle, or end)
|
// Within a profile (either starting, middle, or end)
|
||||||
puts(/**/ "if (vlSymsp->__Vm_profile_window_ct == 0) {\n"); // Opening file?
|
puts(/**/ "if (vlSymsp->__Vm_profile_window_ct == 0) {\n"); // Opening file?
|
||||||
|
puts(/**/ "VL_DEBUG_IF(VL_DBG_MSGF(\"+ profile start warmup\\n\"););\n");
|
||||||
// Start profile on this cycle. We'll capture a window worth, then
|
// Start profile on this cycle. We'll capture a window worth, then
|
||||||
// only analyze the next window worth. The idea is that the first window
|
// only analyze the next window worth. The idea is that the first window
|
||||||
// capture will hit some cache-cold stuff (eg printf) but it'll be warm
|
// capture will hit some cache-cold stuff (eg printf) but it'll be warm
|
||||||
@ -430,12 +453,13 @@ class EmitCModel final : public EmitCFunc {
|
|||||||
// cache-warm-up cycles before the barrier from the actual profile
|
// cache-warm-up cycles before the barrier from the actual profile
|
||||||
// cycles afterward.
|
// cycles afterward.
|
||||||
puts(/****/ "vlSymsp->__Vm_threadPoolp->profileAppendAll(");
|
puts(/****/ "vlSymsp->__Vm_threadPoolp->profileAppendAll(");
|
||||||
puts(/****/ "VlProfileRec(VlProfileRec::Barrier()));\n");
|
puts(/****/ "VlProfileRec{VlProfileRec::Barrier{}});\n");
|
||||||
puts(/****/ "vlSymsp->__Vm_profile_cycle_start = VL_RDTSC_Q();\n");
|
puts(/****/ "vlSymsp->__Vm_profile_cycle_start = VL_RDTSC_Q();\n");
|
||||||
puts(/**/ "}\n");
|
puts(/**/ "}\n");
|
||||||
|
// Ending trace file?
|
||||||
puts(/**/ "else if (vlSymsp->__Vm_profile_window_ct == 0) {\n");
|
puts(/**/ "else if (vlSymsp->__Vm_profile_window_ct == 0) {\n");
|
||||||
// Ending file.
|
|
||||||
puts(/****/ "vluint64_t tick_end = VL_RDTSC_Q();\n");
|
puts(/****/ "vluint64_t tick_end = VL_RDTSC_Q();\n");
|
||||||
|
puts(/****/ "VL_DEBUG_IF(VL_DBG_MSGF(\"+ profile end\\n\"););\n");
|
||||||
puts(/****/ "vlSymsp->__Vm_threadPoolp->profileDump("
|
puts(/****/ "vlSymsp->__Vm_threadPoolp->profileDump("
|
||||||
"vlSymsp->_vm_contextp__->profThreadsFilename().c_str(), "
|
"vlSymsp->_vm_contextp__->profThreadsFilename().c_str(), "
|
||||||
"vlSymsp->__Vm_profile_cycle_start, "
|
"vlSymsp->__Vm_profile_cycle_start, "
|
||||||
@ -443,20 +467,41 @@ class EmitCModel final : public EmitCFunc {
|
|||||||
// This turns off the test to enter the profiling code, but still
|
// This turns off the test to enter the profiling code, but still
|
||||||
// allows the user to collect another profile by changing
|
// allows the user to collect another profile by changing
|
||||||
// profThreadsStart
|
// profThreadsStart
|
||||||
puts(/****/ "vlSymsp->__Vm_profile_time_finished"
|
puts(/****/ "vlSymsp->__Vm_profile_time_finished = "
|
||||||
" = vlSymsp->_vm_contextp__->profThreadsStart();\n");
|
"vlSymsp->_vm_contextp__->profThreadsStart();\n");
|
||||||
puts(/****/ "vlSymsp->__Vm_profile_cycle_start = 0;\n");
|
puts(/****/ "vlSymsp->__Vm_profile_cycle_start = 0;\n");
|
||||||
puts(/**/ "}\n");
|
puts(/**/ "}\n");
|
||||||
puts("}\n");
|
puts("}\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const string recName = "__Vprfeval";
|
||||||
|
if (v3Global.opt.profThreads()) {
|
||||||
|
puts("VlProfileRec* " + recName + " = nullptr;\n");
|
||||||
|
// Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
|
||||||
|
puts("if (VL_UNLIKELY(vlSymsp->__Vm_profile_cycle_start)) {\n");
|
||||||
|
// Eval start
|
||||||
|
puts(/**/ recName + " = vlSymsp->__Vm_threadPoolp->profileAppend();\n");
|
||||||
|
puts(/**/ recName + "->startEval(VL_RDTSC_Q());\n");
|
||||||
|
puts("}\n");
|
||||||
|
}
|
||||||
|
|
||||||
emitSettleLoop(modp, /* initial: */ false);
|
emitSettleLoop(modp, /* initial: */ false);
|
||||||
|
|
||||||
|
putsDecoration("// Evaluate cleanup\n");
|
||||||
if (v3Global.opt.threads() == 1) {
|
if (v3Global.opt.threads() == 1) {
|
||||||
puts("Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);\n");
|
puts("Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);\n");
|
||||||
}
|
}
|
||||||
if (v3Global.opt.threads()) puts("Verilated::endOfEval(vlSymsp->__Vm_evalMsgQp);\n");
|
if (v3Global.opt.threads()) puts("Verilated::endOfEval(vlSymsp->__Vm_evalMsgQp);\n");
|
||||||
puts("}\n");
|
|
||||||
|
|
||||||
|
if (v3Global.opt.profThreads()) {
|
||||||
|
// End eval record
|
||||||
|
puts("if (VL_UNLIKELY(" + recName + ")) " + recName + "->endRecord(VL_RDTSC_Q());\n");
|
||||||
|
}
|
||||||
|
puts("}\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
void emitStandardMethods3(AstNodeModule* modp) {
|
||||||
|
const string topModNameProtected = prefixNameProtect(modp);
|
||||||
// ::eval_end_step
|
// ::eval_end_step
|
||||||
if (v3Global.needTraceDumper() && !optSystemC()) {
|
if (v3Global.needTraceDumper() && !optSystemC()) {
|
||||||
puts("\nvoid " + topClassName() + "::eval_end_step() {\n");
|
puts("\nvoid " + topClassName() + "::eval_end_step() {\n");
|
||||||
@ -573,7 +618,9 @@ class EmitCModel final : public EmitCFunc {
|
|||||||
|
|
||||||
emitConstructorImplementation(modp);
|
emitConstructorImplementation(modp);
|
||||||
emitDestructorImplementation();
|
emitDestructorImplementation();
|
||||||
emitStandardMethods(modp);
|
emitStandardMethods1(modp);
|
||||||
|
emitStandardMethods2(modp);
|
||||||
|
emitStandardMethods3(modp);
|
||||||
if (v3Global.opt.trace()) { emitTraceMethods(modp); }
|
if (v3Global.opt.trace()) { emitTraceMethods(modp); }
|
||||||
if (v3Global.opt.savable()) { emitSerializationFunctions(); }
|
if (v3Global.opt.savable()) { emitSerializationFunctions(); }
|
||||||
|
|
||||||
|
@ -2237,11 +2237,11 @@ public:
|
|||||||
std::vector<uint32_t> busyUntil(m_nThreads, 0);
|
std::vector<uint32_t> busyUntil(m_nThreads, 0);
|
||||||
|
|
||||||
// MTasks ready to be assigned next. All their dependencies are already assigned.
|
// MTasks ready to be assigned next. All their dependencies are already assigned.
|
||||||
std::set<const ExecMTask*, MTaskCmp> readyMTasks;
|
std::set<ExecMTask*, MTaskCmp> readyMTasks;
|
||||||
|
|
||||||
// Build initial ready list
|
// Build initial ready list
|
||||||
for (V3GraphVertex* vxp = mtaskGraph.verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
|
for (V3GraphVertex* vxp = mtaskGraph.verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
|
||||||
const ExecMTask* const mtaskp = dynamic_cast<ExecMTask*>(vxp);
|
ExecMTask* const mtaskp = dynamic_cast<ExecMTask*>(vxp);
|
||||||
if (isReady(schedule, mtaskp)) readyMTasks.insert(mtaskp);
|
if (isReady(schedule, mtaskp)) readyMTasks.insert(mtaskp);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2250,9 +2250,9 @@ public:
|
|||||||
// on each thread (in that thread's local time frame.)
|
// on each thread (in that thread's local time frame.)
|
||||||
uint32_t bestTime = 0xffffffff;
|
uint32_t bestTime = 0xffffffff;
|
||||||
uint32_t bestThreadId = 0;
|
uint32_t bestThreadId = 0;
|
||||||
const ExecMTask* bestMtaskp = nullptr; // Todo: const ExecMTask*
|
ExecMTask* bestMtaskp = nullptr; // Todo: const ExecMTask*
|
||||||
for (uint32_t threadId = 0; threadId < m_nThreads; ++threadId) {
|
for (uint32_t threadId = 0; threadId < m_nThreads; ++threadId) {
|
||||||
for (const ExecMTask* const mtaskp : readyMTasks) {
|
for (ExecMTask* const mtaskp : readyMTasks) {
|
||||||
uint32_t timeBegin = busyUntil[threadId];
|
uint32_t timeBegin = busyUntil[threadId];
|
||||||
if (timeBegin > bestTime) {
|
if (timeBegin > bestTime) {
|
||||||
UINFO(6, "th " << threadId << " busy until " << timeBegin
|
UINFO(6, "th " << threadId << " busy until " << timeBegin
|
||||||
@ -2287,10 +2287,11 @@ public:
|
|||||||
std::vector<const ExecMTask*>& bestThread = schedule.threads[bestThreadId];
|
std::vector<const ExecMTask*>& bestThread = schedule.threads[bestThreadId];
|
||||||
|
|
||||||
// Update algorithm state
|
// Update algorithm state
|
||||||
|
bestMtaskp->predictStart(bestTime); // Only for gantt reporting
|
||||||
const uint32_t bestEndTime = bestTime + bestMtaskp->cost();
|
const uint32_t bestEndTime = bestTime + bestMtaskp->cost();
|
||||||
schedule.mtaskState[bestMtaskp].completionTime = bestEndTime;
|
schedule.mtaskState[bestMtaskp].completionTime = bestEndTime;
|
||||||
schedule.mtaskState[bestMtaskp].threadId = bestThreadId;
|
schedule.mtaskState[bestMtaskp].threadId = bestThreadId;
|
||||||
if (!bestThread.empty()) { schedule.mtaskState[bestThread.back()].nextp = bestMtaskp; }
|
if (!bestThread.empty()) schedule.mtaskState[bestThread.back()].nextp = bestMtaskp;
|
||||||
busyUntil[bestThreadId] = bestEndTime;
|
busyUntil[bestThreadId] = bestEndTime;
|
||||||
|
|
||||||
// Add the MTask to the schedule
|
// Add the MTask to the schedule
|
||||||
@ -2301,7 +2302,7 @@ public:
|
|||||||
UASSERT_OBJ(erased > 0, bestMtaskp, "Should have erased something?");
|
UASSERT_OBJ(erased > 0, bestMtaskp, "Should have erased something?");
|
||||||
for (V3GraphEdge* edgeOutp = bestMtaskp->outBeginp(); edgeOutp;
|
for (V3GraphEdge* edgeOutp = bestMtaskp->outBeginp(); edgeOutp;
|
||||||
edgeOutp = edgeOutp->outNextp()) {
|
edgeOutp = edgeOutp->outNextp()) {
|
||||||
const ExecMTask* const nextp = dynamic_cast<ExecMTask*>(edgeOutp->top());
|
ExecMTask* const nextp = dynamic_cast<ExecMTask*>(edgeOutp->top());
|
||||||
// Dependent MTask should not yet be assigned to a thread
|
// Dependent MTask should not yet be assigned to a thread
|
||||||
UASSERT(schedule.threadId(nextp) == ThreadSchedule::UNASSIGNED,
|
UASSERT(schedule.threadId(nextp) == ThreadSchedule::UNASSIGNED,
|
||||||
"Tasks after one being assigned should not be assigned yet");
|
"Tasks after one being assigned should not be assigned yet");
|
||||||
@ -2713,6 +2714,7 @@ static void addMTaskToFunction(const ThreadSchedule& schedule, const uint32_t th
|
|||||||
recName + " = vlSymsp->__Vm_threadPoolp->profileAppend();\n" + //
|
recName + " = vlSymsp->__Vm_threadPoolp->profileAppend();\n" + //
|
||||||
recName + "->startRecord(VL_RDTSC_Q()," + //
|
recName + "->startRecord(VL_RDTSC_Q()," + //
|
||||||
" " + cvtToStr(mtaskp->id()) + "," + //
|
" " + cvtToStr(mtaskp->id()) + "," + //
|
||||||
|
" " + cvtToStr(mtaskp->predictStart()) + "," + //
|
||||||
" " + cvtToStr(mtaskp->cost()) + ");\n" + //
|
" " + cvtToStr(mtaskp->cost()) + ");\n" + //
|
||||||
"}\n");
|
"}\n");
|
||||||
}
|
}
|
||||||
@ -2724,9 +2726,8 @@ static void addMTaskToFunction(const ThreadSchedule& schedule, const uint32_t th
|
|||||||
funcp->addStmtsp(mtaskp->bodyp()->unlinkFrBack());
|
funcp->addStmtsp(mtaskp->bodyp()->unlinkFrBack());
|
||||||
|
|
||||||
if (v3Global.opt.profThreads()) {
|
if (v3Global.opt.profThreads()) {
|
||||||
// Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
|
addStrStmt("if (VL_UNLIKELY(" + recName + ")) " //
|
||||||
addStrStmt("if (VL_UNLIKELY(" + recName + ")) {\n" + //
|
+ recName + "->endRecord(VL_RDTSC_Q());\n");
|
||||||
recName + "->endRecord(VL_RDTSC_Q());\n" + "}\n");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Flush message queue
|
// Flush message queue
|
||||||
|
@ -61,6 +61,7 @@ private:
|
|||||||
// mtask. In abstract time units.
|
// mtask. In abstract time units.
|
||||||
uint32_t m_cost = 0; // Predicted runtime of this mtask, in the same
|
uint32_t m_cost = 0; // Predicted runtime of this mtask, in the same
|
||||||
// abstract time units as priority().
|
// abstract time units as priority().
|
||||||
|
uint64_t m_predictStart = 0; // Predicted start time of task
|
||||||
VL_UNCOPYABLE(ExecMTask);
|
VL_UNCOPYABLE(ExecMTask);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
@ -74,6 +75,8 @@ public:
|
|||||||
void priority(uint32_t pri) { m_priority = pri; }
|
void priority(uint32_t pri) { m_priority = pri; }
|
||||||
virtual uint32_t cost() const override { return m_cost; }
|
virtual uint32_t cost() const override { return m_cost; }
|
||||||
void cost(uint32_t cost) { m_cost = cost; }
|
void cost(uint32_t cost) { m_cost = cost; }
|
||||||
|
void predictStart(vluint64_t time) { m_predictStart = time; }
|
||||||
|
vluint64_t predictStart() const { return m_predictStart; }
|
||||||
string cFuncName() const {
|
string cFuncName() const {
|
||||||
// If this MTask maps to a C function, this should be the name
|
// If this MTask maps to a C function, this should be the name
|
||||||
return string("__Vmtask") + "__" + cvtToStr(m_id);
|
return string("__Vmtask") + "__" + cvtToStr(m_id);
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -6,33 +6,37 @@ Argument settings:
|
|||||||
--threads 2
|
--threads 2
|
||||||
|
|
||||||
Thread gantt graph:
|
Thread gantt graph:
|
||||||
Legend: One character width = 100 rdtsc ticks
|
Legend: One character width = 136 rdtsc ticks
|
||||||
Legend: '&' = multiple mtasks in this period (character width)
|
Legend: '&' = multiple mtasks in this period (character width)
|
||||||
<-16065 rdtsc total-------------------------------------------------------------------------------------------------------------------------------------------->
|
<-21875 rdtsc total-------------------------------------------------------------------------------------------------------------------------------------------->
|
||||||
t: [1] [1] [1] [1]
|
t: [19] [19] [19] [19]
|
||||||
t: [16-] [16][16-------] [16--] [16] [16-] [1[] [16[xxx
|
t: [10-] [10-] [10----] [10-] [10---] [10-][10][10] [10][10]
|
||||||
|
|
||||||
Analysis:
|
Analysis:
|
||||||
Total threads = 2
|
Total threads = 2
|
||||||
Total mtasks = 7
|
Total mtasks = 7
|
||||||
Total cpus used = 2
|
Total cpus used = 2
|
||||||
Total yields = 1
|
Total yields = 0
|
||||||
Total eval time = 16065 rdtsc ticks
|
Total eval time = 21875 rdtsc ticks
|
||||||
Longest mtask time = 1085 rdtsc ticks
|
Longest mtask time = 1190 rdtsc ticks
|
||||||
All-thread mtask time = 3430 rdtsc ticks
|
All-thread mtask time = 5495 rdtsc ticks
|
||||||
Longest-thread efficiency = 6.8%
|
Longest-thread efficiency = 5.4%
|
||||||
All-thread efficiency = 10.7%
|
All-thread efficiency = 12.6%
|
||||||
All-thread speedup = 0.2
|
All-thread speedup = 0.3
|
||||||
|
|
||||||
|
Prediction (what Verilator used for scheduling):
|
||||||
|
All-thread efficiency = 63.2%
|
||||||
|
All-thread speedup = 1.3
|
||||||
|
|
||||||
Statistics:
|
Statistics:
|
||||||
min log(p2e) = -3.332 from mtask 5 (predict 30, elapsed 840)
|
min log(p2e) = -3.681 from mtask 5 (predict 30, elapsed 1190)
|
||||||
max log(p2e) = -1.764 from mtask 7 (predict 30, elapsed 175)
|
max log(p2e) = -2.409 from mtask 8 (predict 107, elapsed 1190)
|
||||||
mean = -2.365
|
mean = -2.992
|
||||||
stddev = 0.562
|
stddev = 0.459
|
||||||
e ^ stddev = 1.754
|
e ^ stddev = 1.583
|
||||||
|
|
||||||
CPUs:
|
CPUs:
|
||||||
cpu 1: cpu_time=525 socket=0 core=1 Test Ryzen 9 3950X 16-Core Processor
|
cpu 10: cpu_time=4725 socket=0 core=10 Test Ryzen 9 3950X 16-Core Processor
|
||||||
cpu 16: cpu_time=2905 socket=0 core=0 Test Ryzen 9 3950X 16-Core Processor
|
cpu 19: cpu_time=770 socket=0 core=3 Test Ryzen 9 3950X 16-Core Processor
|
||||||
|
|
||||||
Writing profile_threads.vcd
|
Writing profile_threads.vcd
|
||||||
|
@ -3,171 +3,256 @@ $timescale 1ns $end
|
|||||||
|
|
||||||
$scope module gantt $end
|
$scope module gantt $end
|
||||||
$scope module Stats $end
|
$scope module Stats $end
|
||||||
$var wire 32 vb parallelism [31:0] $end
|
$var wire 32 vf measured_parallelism [31:0] $end
|
||||||
|
$var wire 32 v10 predicted_parallelism [31:0] $end
|
||||||
$upscope $end
|
$upscope $end
|
||||||
$scope module cpus $end
|
$scope module cpus $end
|
||||||
$var wire 32 v1 cpu16_thread [31:0] $end
|
$var wire 32 v5 cpu10_thread [31:0] $end
|
||||||
$var wire 32 v8 cpu1_thread [31:0] $end
|
$var wire 32 v1 cpu19_thread [31:0] $end
|
||||||
|
$upscope $end
|
||||||
|
$scope module evals $end
|
||||||
|
$var wire 32 vb eval [31:0] $end
|
||||||
|
$var wire 32 vc eval_loop [31:0] $end
|
||||||
|
$upscope $end
|
||||||
|
$scope module measured_threads $end
|
||||||
|
$var wire 32 v0 thread1_mtask [31:0] $end
|
||||||
|
$var wire 32 v4 thread2_mtask [31:0] $end
|
||||||
$upscope $end
|
$upscope $end
|
||||||
$scope module mtasks $end
|
$scope module mtasks $end
|
||||||
$var wire 32 va mtask10_cpu [31:0] $end
|
$var wire 32 v3 mtask10_cpu [31:0] $end
|
||||||
$var wire 32 v6 mtask11_cpu [31:0] $end
|
$var wire 32 va mtask11_cpu [31:0] $end
|
||||||
$var wire 32 v2 mtask5_cpu [31:0] $end
|
$var wire 32 v6 mtask5_cpu [31:0] $end
|
||||||
$var wire 32 v9 mtask6_cpu [31:0] $end
|
$var wire 32 v2 mtask6_cpu [31:0] $end
|
||||||
$var wire 32 v3 mtask7_cpu [31:0] $end
|
$var wire 32 v7 mtask7_cpu [31:0] $end
|
||||||
$var wire 32 v4 mtask8_cpu [31:0] $end
|
$var wire 32 v8 mtask8_cpu [31:0] $end
|
||||||
$var wire 32 v5 mtask9_cpu [31:0] $end
|
$var wire 32 v9 mtask9_cpu [31:0] $end
|
||||||
$upscope $end
|
$upscope $end
|
||||||
$scope module threads $end
|
$scope module predicted_threads $end
|
||||||
$var wire 32 v7 thread1_mtask [31:0] $end
|
$var wire 32 vd predicted_thread1_mtask [31:0] $end
|
||||||
$var wire 32 v0 thread2_mtask [31:0] $end
|
$var wire 32 ve predicted_thread2_mtask [31:0] $end
|
||||||
$upscope $end
|
$upscope $end
|
||||||
$upscope $end
|
$upscope $end
|
||||||
$enddefinitions $end
|
$enddefinitions $end
|
||||||
|
|
||||||
|
#0
|
||||||
|
bz v0
|
||||||
|
bz v1
|
||||||
|
bz v2
|
||||||
|
bz v3
|
||||||
|
bz v4
|
||||||
|
bz v5
|
||||||
|
bz v6
|
||||||
|
bz v7
|
||||||
|
bz v8
|
||||||
|
bz v9
|
||||||
|
bz va
|
||||||
|
bz vb
|
||||||
|
bz vc
|
||||||
|
bz vd
|
||||||
|
bz ve
|
||||||
|
b0 vf
|
||||||
|
b0 v10
|
||||||
#595
|
#595
|
||||||
|
b1 vb
|
||||||
|
#945
|
||||||
|
b1 vc
|
||||||
|
b110 vd
|
||||||
|
b101 ve
|
||||||
|
b10 v10
|
||||||
|
#2429
|
||||||
|
bz vd
|
||||||
|
b111 ve
|
||||||
|
b1 v10
|
||||||
|
#2695
|
||||||
|
b110 v0
|
||||||
|
b1 v1
|
||||||
|
b10011 v2
|
||||||
|
b1 vf
|
||||||
|
#2905
|
||||||
bz v0
|
bz v0
|
||||||
bz v1
|
bz v1
|
||||||
bz va
|
|
||||||
b1 vb
|
|
||||||
bz v2
|
bz v2
|
||||||
bz v3
|
b0 vf
|
||||||
|
#3914
|
||||||
|
b1000 ve
|
||||||
|
b1 v10
|
||||||
|
#5495
|
||||||
|
b101 v4
|
||||||
|
b10 v5
|
||||||
|
b1010 v6
|
||||||
|
b1 vf
|
||||||
|
#6090
|
||||||
bz v4
|
bz v4
|
||||||
bz v5
|
bz v5
|
||||||
bz v6
|
bz v6
|
||||||
b110 v7
|
b0 vf
|
||||||
b1 v8
|
#6300
|
||||||
b1 v9
|
b111 v4
|
||||||
#735
|
b10 v5
|
||||||
b0 vb
|
|
||||||
bz v7
|
|
||||||
bz v8
|
|
||||||
bz v9
|
|
||||||
#5110
|
|
||||||
b101 v0
|
|
||||||
b10 v1
|
|
||||||
b1 vb
|
|
||||||
b10000 v2
|
|
||||||
#5530
|
|
||||||
bz v0
|
|
||||||
bz v1
|
|
||||||
b0 vb
|
|
||||||
bz v2
|
|
||||||
#5740
|
|
||||||
b111 v0
|
|
||||||
b10 v1
|
|
||||||
b1 vb
|
|
||||||
b10000 v3
|
|
||||||
#5845
|
|
||||||
bz v0
|
|
||||||
bz v1
|
|
||||||
b0 vb
|
|
||||||
bz v3
|
|
||||||
#6125
|
|
||||||
b1000 v0
|
|
||||||
b10 v1
|
|
||||||
b1 vb
|
|
||||||
b10000 v4
|
|
||||||
#7140
|
|
||||||
bz v0
|
|
||||||
bz v1
|
|
||||||
b0 vb
|
|
||||||
bz v4
|
|
||||||
#8120
|
|
||||||
b1 va
|
|
||||||
b1 vb
|
|
||||||
b1010 v7
|
b1010 v7
|
||||||
b1 v8
|
b1 vf
|
||||||
#8260
|
#6895
|
||||||
bz va
|
|
||||||
b0 vb
|
|
||||||
bz v7
|
|
||||||
bz v8
|
|
||||||
#8820
|
|
||||||
b1001 v0
|
|
||||||
b10 v1
|
|
||||||
b1 vb
|
|
||||||
b10000 v5
|
|
||||||
#9380
|
|
||||||
bz v0
|
|
||||||
bz v1
|
|
||||||
b0 vb
|
|
||||||
bz v5
|
|
||||||
#9940
|
|
||||||
b1011 v0
|
|
||||||
b10 v1
|
|
||||||
b1 vb
|
|
||||||
b10000 v6
|
|
||||||
#10045
|
|
||||||
bz v0
|
|
||||||
bz v1
|
|
||||||
b0 vb
|
|
||||||
bz v6
|
|
||||||
#11970
|
|
||||||
b1 vb
|
|
||||||
b110 v7
|
|
||||||
b1 v8
|
|
||||||
b1 v9
|
|
||||||
#12075
|
|
||||||
b0 vb
|
|
||||||
bz v7
|
|
||||||
bz v8
|
|
||||||
bz v9
|
|
||||||
#14175
|
|
||||||
b101 v0
|
|
||||||
b10 v1
|
|
||||||
b1 vb
|
|
||||||
b10000 v2
|
|
||||||
#14595
|
|
||||||
bz v0
|
|
||||||
bz v1
|
|
||||||
b0 vb
|
|
||||||
bz v2
|
|
||||||
#15120
|
|
||||||
b111 v0
|
|
||||||
b10 v1
|
|
||||||
b1 vb
|
|
||||||
b10000 v3
|
|
||||||
#15190
|
|
||||||
bz v0
|
|
||||||
bz v1
|
|
||||||
b0 vb
|
|
||||||
bz v3
|
|
||||||
#15365
|
|
||||||
b1000 v0
|
|
||||||
b10 v1
|
|
||||||
b1 vb
|
|
||||||
b10000 v4
|
|
||||||
#15435
|
|
||||||
bz v0
|
|
||||||
bz v1
|
|
||||||
b0 vb
|
|
||||||
bz v4
|
bz v4
|
||||||
#15680
|
bz v5
|
||||||
b1001 v0
|
bz v7
|
||||||
b10 v1
|
b0 vf
|
||||||
b1 vb
|
#7490
|
||||||
b10000 v5
|
b1000 v4
|
||||||
#15750
|
b10 v5
|
||||||
|
b1010 v8
|
||||||
|
b1 vf
|
||||||
|
#8540
|
||||||
|
bz v4
|
||||||
|
bz v5
|
||||||
|
bz v8
|
||||||
|
b0 vf
|
||||||
|
#9135
|
||||||
|
b1001 v4
|
||||||
|
b10 v5
|
||||||
|
b1010 v9
|
||||||
|
b1 vf
|
||||||
|
#9210
|
||||||
|
b1001 ve
|
||||||
|
b1 v10
|
||||||
|
#9695
|
||||||
|
b1010 v0
|
||||||
|
b1 v1
|
||||||
|
b10011 v3
|
||||||
|
b10 vf
|
||||||
|
#9730
|
||||||
|
bz v4
|
||||||
|
bz v5
|
||||||
|
bz v9
|
||||||
|
b1 vf
|
||||||
|
#9870
|
||||||
bz v0
|
bz v0
|
||||||
bz v1
|
bz v1
|
||||||
b0 vb
|
bz v3
|
||||||
|
b0 vf
|
||||||
|
#10255
|
||||||
|
b1011 v4
|
||||||
|
b10 v5
|
||||||
|
b1010 va
|
||||||
|
b1 vf
|
||||||
|
#10645
|
||||||
|
b1010 vd
|
||||||
|
b10 v10
|
||||||
|
#10695
|
||||||
|
b1011 ve
|
||||||
|
b10 v10
|
||||||
|
#11060
|
||||||
|
bz v4
|
||||||
bz v5
|
bz v5
|
||||||
#15925
|
bz va
|
||||||
b1011 v0
|
b0 vf
|
||||||
b10 v1
|
#12130
|
||||||
b1 va
|
bz vd
|
||||||
|
b1 v10
|
||||||
|
#12180
|
||||||
|
bz vc
|
||||||
|
bz ve
|
||||||
|
b0 v10
|
||||||
|
#12250
|
||||||
|
bz vb
|
||||||
|
#13720
|
||||||
b10 vb
|
b10 vb
|
||||||
b10000 v6
|
#14000
|
||||||
b1010 v7
|
b10 vc
|
||||||
b1 v8
|
b110 vd
|
||||||
#15995
|
b101 ve
|
||||||
|
b10 v10
|
||||||
|
#15068
|
||||||
|
bz vd
|
||||||
|
b111 ve
|
||||||
|
b1 v10
|
||||||
|
#15610
|
||||||
|
b110 v0
|
||||||
|
b1 v1
|
||||||
|
b10011 v2
|
||||||
|
b1 vf
|
||||||
|
#15820
|
||||||
bz v0
|
bz v0
|
||||||
bz v1
|
bz v1
|
||||||
b1 vb
|
bz v2
|
||||||
|
b0 vf
|
||||||
|
#16137
|
||||||
|
b1000 ve
|
||||||
|
b1 v10
|
||||||
|
#18375
|
||||||
|
b101 v4
|
||||||
|
b10 v5
|
||||||
|
b1010 v6
|
||||||
|
b1 vf
|
||||||
|
#18970
|
||||||
|
bz v4
|
||||||
|
bz v5
|
||||||
bz v6
|
bz v6
|
||||||
#16065
|
b0 vf
|
||||||
bz va
|
#19145
|
||||||
b0 vb
|
b111 v4
|
||||||
|
b10 v5
|
||||||
|
b1010 v7
|
||||||
|
b1 vf
|
||||||
|
#19320
|
||||||
|
bz v4
|
||||||
|
bz v5
|
||||||
bz v7
|
bz v7
|
||||||
|
b0 vf
|
||||||
|
#19670
|
||||||
|
b1000 v4
|
||||||
|
b10 v5
|
||||||
|
b1010 v8
|
||||||
|
b1 vf
|
||||||
|
#19810
|
||||||
|
bz v4
|
||||||
|
bz v5
|
||||||
bz v8
|
bz v8
|
||||||
|
b0 vf
|
||||||
|
#19947
|
||||||
|
b1001 ve
|
||||||
|
b1 v10
|
||||||
|
#20650
|
||||||
|
b1001 v4
|
||||||
|
b10 v5
|
||||||
|
b1010 v9
|
||||||
|
b1 vf
|
||||||
|
#20720
|
||||||
|
bz v4
|
||||||
|
bz v5
|
||||||
|
bz v9
|
||||||
|
b0 vf
|
||||||
|
#20980
|
||||||
|
b1010 vd
|
||||||
|
b10 v10
|
||||||
|
#21016
|
||||||
|
b1011 ve
|
||||||
|
b10 v10
|
||||||
|
#21140
|
||||||
|
b1011 v4
|
||||||
|
b10 v5
|
||||||
|
b1010 va
|
||||||
|
b1 vf
|
||||||
|
#21245
|
||||||
|
bz v4
|
||||||
|
bz v5
|
||||||
|
bz va
|
||||||
|
b0 vf
|
||||||
|
#21700
|
||||||
|
b1010 v0
|
||||||
|
b1 v1
|
||||||
|
b10011 v3
|
||||||
|
b1 vf
|
||||||
|
#21875
|
||||||
|
bz v0
|
||||||
|
bz v1
|
||||||
|
bz v3
|
||||||
|
b0 vf
|
||||||
|
#22049
|
||||||
|
bz vd
|
||||||
|
b1 v10
|
||||||
|
#22085
|
||||||
|
bz vc
|
||||||
|
bz ve
|
||||||
|
b0 v10
|
||||||
|
#22330
|
||||||
|
bz vb
|
||||||
|
Loading…
Reference in New Issue
Block a user