Improve run-time profiling

The --prof-threads option has been split into two independent options:
1. --prof-exec, for collecting verilator_gantt and other execution
related profiling data, and
2. --prof-pgo, for collecting data needed for PGO

The implementation of execution profiling has been extracted from
VlThreadPool into a separate class, VlExecutionProfiler. This means
--prof-exec can now be used for single-threaded models (though it does not
yet measure much there). For consistency, VerilatedProfiler is renamed
VlPgoProfiler. Both VlExecutionProfiler and VlPgoProfiler live in
verilated_profiler.{h/cpp}, but can be used completely independently.
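
To illustrate the independence, here is a minimal sketch of the VlPgoProfiler
interface (assumptions: compiled with VL_PROFILER defined; the counter index,
mtask name, model name, and filename are illustrative, not what generated code
uses):

#include "verilated_profiler.h"

// Hypothetical usage sketch; not the code Verilator emits.
static VlPgoProfiler<1> s_pgoProfiler;  // one counter slot

void pgoSetup() { s_pgoProfiler.addCounter(0, "mtask123"); }

void pgoTimeMtask() {
    s_pgoProfiler.startCounter(0);  // subtracts the current CPU tick count
    // ... mtask body would run here ...
    s_pgoProfiler.stopCounter(0);   // adds the current tick count, accumulating elapsed ticks
}

void pgoFinish() { s_pgoProfiler.write("Vtop", "profile.vlt"); }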

Also reworked the execution profile format so it now emits only events,
without holding onto any temporaries. This is in preparation for future
optimizations that would be hindered by introducing function locals via
AstText.
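
In the new scheme, generated code simply appends self-contained records via
the VL_EXEC_TRACE_ADD_RECORD macro; a sketch of the pattern (the mtask id and
prediction values are illustrative, and the exact emitted text may differ):

// Each call appends one trivially destructible record to the thread-local trace;
// nothing is held in a function local between the begin and end events.
VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskBegin(7, /* predictStart */ 120);
// ... mtask 7 body ...
VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskEnd(7, /* predictCost */ 95);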

Also removed the Barrier event. Clearing the profile buffers is not
notably more expensive as the profiling records are trivially
destructible.
Author: Geza Lore, 2022-03-25 19:46:50 +00:00
Parent: c7440b250f
Commit: b1b5b5dfe2
40 changed files with 900 additions and 712 deletions


@ -13,6 +13,7 @@ Verilator 4.221 devel
**Minor:**
* Split --prof-threads into --prof-exec and --prof-pgo (#3365). [Geza Lore, Shunyao CAD]
Verilator 4.220 2022-03-12


@ -370,7 +370,8 @@ detailed descriptions of these arguments.
--prefix <topname> Name of top level class
--prof-c Compile C++ code with profiling
--prof-cfuncs Name functions for profiling
--prof-threads Enable generating gantt chart data for threads
--prof-exec Enable generating execution profile for gantt chart
--prof-pgo Enable generating profiling data for PGO
--protect-key <key> Key for symbol protection
--protect-ids Hash identifier names for obscurity
--protect-lib <name> Create a DPI protected library
@ -445,10 +446,10 @@ description of these arguments.
+verilator+error+limit+<value> Set error limit
+verilator+help Display help
+verilator+noassert Disable assert checking
+verilator+prof+threads+file+<filename> Set profile filename
+verilator+prof+threads+start+<value> Set profile starting point
+verilator+prof+threads+window+<value> Set profile duration
+verilator+prof+vlt+file+<filename> Set profile guided filename
+verilator+prof+exec+file+<filename> Set execution profile filename
+verilator+prof+exec+start+<value> Set execution profile starting point
+verilator+prof+exec+window+<value> Set execution profile duration
+verilator+prof+vlt+file+<filename> Set PGO profile filename
+verilator+rand+reset+<value> Set random reset technique
+verilator+seed+<value> Set random seed
+verilator+V Verbose version and config


@ -9,7 +9,7 @@ import re
import statistics
# from pprint import pprint
Threads = collections.defaultdict(lambda: {})
Threads = collections.defaultdict(lambda: collections.defaultdict(lambda: {}))
Mtasks = collections.defaultdict(lambda: {})
Evals = collections.defaultdict(lambda: {})
EvalLoops = collections.defaultdict(lambda: {})
@ -30,12 +30,12 @@ def process(filename):
def read_data(filename):
with open(filename) as fh:
re_prof = re.compile(
r'^VLPROF mtask\s(\d+)\sstart\s(\d+)\selapsed\s(\d+)\spredict_start\s(\d+)\spredict_cost\s(\d+)\scpu\s(\d+)\son thread (\d+)'
)
re_eval = re.compile(r'^VLPROF eval\sstart\s(\d+)\selapsed\s(\d+)')
re_loop = re.compile(
r'^VLPROF eval_loop\sstart\s(\d+)\selapsed\s(\d+)')
re_thread = re.compile(r'^VLPROFTHREAD (\d+)$')
re_record = re.compile(r'^VLPROFEXEC (\S+) (\d+)(.*)$')
re_payload_mtaskBegin = re.compile(
r'id (\d+) predictStart (\d+) cpu (\d+)')
re_payload_mtaskEnd = re.compile(r'id (\d+) predictCost (\d+)')
re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.]*)\s*')
re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.]*)\s*$')
re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
@ -43,46 +43,59 @@ def read_data(filename):
re_proc_cpu = re.compile(r'VLPROFPROC processor\s*:\s*(\d+)\s*$')
re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$')
cpu = None
thread = None
lastEvalBeginTick = None
lastEvalLoopBeginTick = None
for line in fh:
if re_prof.match(line):
match = re_prof.match(line)
mtask = int(match.group(1))
start = int(match.group(2))
elapsed_time = int(match.group(3))
end = start + elapsed_time
predict_start = int(match.group(4))
predict_cost = int(match.group(5))
cpu = int(match.group(6))
thread = int(match.group(7))
if start not in Threads[thread]:
Threads[thread][start] = {}
Threads[thread][start]['mtask'] = mtask
Threads[thread][start]['end'] = end
Threads[thread][start]['cpu'] = cpu
Threads[thread][start]['predict_start'] = predict_start
Threads[thread][start]['predict_cost'] = predict_cost
if 'elapsed' not in Mtasks[mtask]:
Mtasks[mtask] = {'end': 0, 'elapsed': 0}
Mtasks[mtask]['thread'] = thread
Mtasks[mtask]['elapsed'] += elapsed_time
Mtasks[mtask]['predict_start'] = predict_start
Mtasks[mtask]['predict_cost'] = predict_cost
Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], end)
elif re_eval.match(line):
match = re_eval.match(line)
start = int(match.group(1))
elapsed_time = int(match.group(2))
Evals[start]['start'] = start
Evals[start]['end'] = start + elapsed_time
elif re_loop.match(line):
match = re_loop.match(line)
start = int(match.group(1))
elapsed_time = int(match.group(2))
EvalLoops[start]['start'] = start
EvalLoops[start]['end'] = start + elapsed_time
elif re.match(r'^VLPROFTHREAD', line):
recordMatch = re_record.match(line)
if recordMatch:
kind, tick, payload = recordMatch.groups()
tick = int(tick)
payload = payload.strip()
if kind == "EVAL_BEGIN":
Evals[tick]['start'] = tick
lastEvalBeginTick = tick
elif kind == "EVAL_END":
Evals[lastEvalBeginTick]['end'] = tick
lastEvalBeginTick = None
elif kind == "EVAL_LOOP_BEGIN":
EvalLoops[tick]['start'] = tick
lastEvalLoopBeginTick = tick
elif kind == "EVAL_LOOP_END":
EvalLoops[lastEvalLoopBeginTick]['end'] = tick
lastEvalLoopBeginTick = None
elif kind == "MTASK_BEGIN":
mtask, predict_start, ecpu = re_payload_mtaskBegin.match(
payload).groups()
mtask = int(mtask)
predict_start = int(predict_start)
ecpu = int(ecpu)
Threads[thread][tick]['mtask'] = mtask
Threads[thread][tick]['predict_start'] = predict_start
Threads[thread][tick]['cpu'] = ecpu
if 'elapsed' not in Mtasks[mtask]:
Mtasks[mtask] = {'end': 0, 'elapsed': 0}
Mtasks[mtask]['begin'] = tick
Mtasks[mtask]['thread'] = thread
Mtasks[mtask]['predict_start'] = predict_start
elif kind == "MTASK_END":
mtask, predict_cost = re_payload_mtaskEnd.match(
payload).groups()
mtask = int(mtask)
predict_cost = int(predict_cost)
begin = Mtasks[mtask]['begin']
Threads[thread][begin]['end'] = tick
Threads[thread][begin]['predict_cost'] = predict_cost
Mtasks[mtask]['elapsed'] += tick - begin
Mtasks[mtask]['predict_cost'] = predict_cost
Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], tick)
elif Args.debug:
print("-Unknown execution trace record: %s" % line)
elif re_thread.match(line):
thread = int(re_thread.match(line).group(1))
elif re.match(r'^VLPROF(THREAD|VERSION)', line):
pass
elif re_arg1.match(line):
match = re_arg1.match(line)
@ -131,11 +144,12 @@ def report():
plus = "+" if re.match(r'^\+', arg) else " "
print(" %s%s%s" % (arg, plus, Global['args'][arg]))
nthreads = len(Threads)
nthreads = int(Global['stats']['threads'])
Global['cpus'] = {}
for thread in Threads:
# Make potentially multiple characters per column
for start in Threads[thread]:
if not Threads[thread][start]: continue
cpu = Threads[thread][start]['cpu']
elapsed = Threads[thread][start]['end'] - start
if cpu not in Global['cpus']:
@ -169,74 +183,79 @@ def report():
print("\nAnalysis:")
print(" Total threads = %d" % nthreads)
print(" Total mtasks = %d" % len(Mtasks))
ncpus = len(Global['cpus'])
ncpus = max(len(Global['cpus']), 1)
print(" Total cpus used = %d" % ncpus)
print(" Total yields = %d" % int(Global['stats']['yields']))
print(" Total yields = %d" %
int(Global['stats'].get('yields', 0)))
print(" Total evals = %d" % len(Evals))
print(" Total eval loops = %d" % len(EvalLoops))
print(" Total eval time = %d rdtsc ticks" %
Global['measured_last_end'])
print(" Longest mtask time = %d rdtsc ticks" % long_mtask_time)
print(" All-thread mtask time = %d rdtsc ticks" %
measured_mt_mtask_time)
long_efficiency = long_mtask_time / (Global.get('measured_last_end', 1)
or 1)
print(" Longest-thread efficiency = %0.1f%%" % (long_efficiency * 100.0))
mt_efficiency = measured_mt_mtask_time / (
Global.get('measured_last_end', 1) * nthreads or 1)
print(" All-thread efficiency = %0.1f%%" % (mt_efficiency * 100.0))
print(" All-thread speedup = %0.1f" % (mt_efficiency * nthreads))
if Global['rdtsc_cycle_time'] > 0:
ut = measured_mt_mtask_time / Global['rdtsc_cycle_time']
print("tot_mtask_cpu=" + measured_mt_mtask_time + " cyc=" +
Global['rdtsc_cycle_time'] + " ut=" + ut)
if Mtasks:
print(" Total eval time = %d rdtsc ticks" %
Global['measured_last_end'])
print(" Longest mtask time = %d rdtsc ticks" % long_mtask_time)
print(" All-thread mtask time = %d rdtsc ticks" %
measured_mt_mtask_time)
long_efficiency = long_mtask_time / (Global.get(
'measured_last_end', 1) or 1)
print(" Longest-thread efficiency = %0.1f%%" %
(long_efficiency * 100.0))
mt_efficiency = measured_mt_mtask_time / (
Global.get('measured_last_end', 1) * nthreads or 1)
print(" All-thread efficiency = %0.1f%%" %
(mt_efficiency * 100.0))
print(" All-thread speedup = %0.1f" %
(mt_efficiency * nthreads))
if Global['rdtsc_cycle_time'] > 0:
ut = measured_mt_mtask_time / Global['rdtsc_cycle_time']
print("tot_mtask_cpu=" + measured_mt_mtask_time + " cyc=" +
Global['rdtsc_cycle_time'] + " ut=" + ut)
predict_mt_efficiency = predict_mt_mtask_time / (
Global.get('predict_last_end', 1) * nthreads or 1)
print("\nPrediction (what Verilator used for scheduling):")
print(" All-thread efficiency = %0.1f%%" %
(predict_mt_efficiency * 100.0))
print(" All-thread speedup = %0.1f" %
(predict_mt_efficiency * nthreads))
predict_mt_efficiency = predict_mt_mtask_time / (
Global.get('predict_last_end', 1) * nthreads or 1)
print("\nPrediction (what Verilator used for scheduling):")
print(" All-thread efficiency = %0.1f%%" %
(predict_mt_efficiency * 100.0))
print(" All-thread speedup = %0.1f" %
(predict_mt_efficiency * nthreads))
p2e_ratios = []
min_p2e = 1000000
min_mtask = None
max_p2e = -1000000
max_mtask = None
p2e_ratios = []
min_p2e = 1000000
min_mtask = None
max_p2e = -1000000
max_mtask = None
for mtask in sorted(Mtasks.keys()):
if Mtasks[mtask]['elapsed'] > 0:
if Mtasks[mtask]['predict_cost'] == 0:
Mtasks[mtask]['predict_cost'] = 1 # don't log(0) below
p2e_ratio = math.log(Mtasks[mtask]['predict_cost'] /
Mtasks[mtask]['elapsed'])
p2e_ratios.append(p2e_ratio)
for mtask in sorted(Mtasks.keys()):
if Mtasks[mtask]['elapsed'] > 0:
if Mtasks[mtask]['predict_cost'] == 0:
Mtasks[mtask]['predict_cost'] = 1 # don't log(0) below
p2e_ratio = math.log(Mtasks[mtask]['predict_cost'] /
Mtasks[mtask]['elapsed'])
p2e_ratios.append(p2e_ratio)
if p2e_ratio > max_p2e:
max_p2e = p2e_ratio
max_mtask = mtask
if p2e_ratio < min_p2e:
min_p2e = p2e_ratio
min_mtask = mtask
if p2e_ratio > max_p2e:
max_p2e = p2e_ratio
max_mtask = mtask
if p2e_ratio < min_p2e:
min_p2e = p2e_ratio
min_mtask = mtask
print("\nStatistics:")
print(" min log(p2e) = %0.3f" % min_p2e, end="")
print(" from mtask %d (predict %d," %
(min_mtask, Mtasks[min_mtask]['predict_cost']),
end="")
print(" elapsed %d)" % Mtasks[min_mtask]['elapsed'])
print(" max log(p2e) = %0.3f" % max_p2e, end="")
print(" from mtask %d (predict %d," %
(max_mtask, Mtasks[max_mtask]['predict_cost']),
end="")
print(" elapsed %d)" % Mtasks[max_mtask]['elapsed'])
print("\nMTask statistics:")
print(" min log(p2e) = %0.3f" % min_p2e, end="")
print(" from mtask %d (predict %d," %
(min_mtask, Mtasks[min_mtask]['predict_cost']),
end="")
print(" elapsed %d)" % Mtasks[min_mtask]['elapsed'])
print(" max log(p2e) = %0.3f" % max_p2e, end="")
print(" from mtask %d (predict %d," %
(max_mtask, Mtasks[max_mtask]['predict_cost']),
end="")
print(" elapsed %d)" % Mtasks[max_mtask]['elapsed'])
stddev = statistics.pstdev(p2e_ratios)
mean = statistics.mean(p2e_ratios)
print(" mean = %0.3f" % mean)
print(" stddev = %0.3f" % stddev)
print(" e ^ stddev = %0.3f" % math.exp(stddev))
stddev = statistics.pstdev(p2e_ratios)
mean = statistics.mean(p2e_ratios)
print(" mean = %0.3f" % mean)
print(" stddev = %0.3f" % stddev)
print(" e ^ stddev = %0.3f" % math.exp(stddev))
report_cpus()
@ -375,44 +394,45 @@ def write_vcd(filename):
vcd['values'][eval_start][elcode] = n
vcd['values'][eval_end][elcode] = None
# Predicted graph
for eval_start in EvalLoops:
eval_end = EvalLoops[eval_start]['end']
# Compute scale so predicted graph is of same width as eval
measured_scaling = (eval_end -
eval_start) / Global['predict_last_end']
# Predict mtasks that fill the time the eval occupied
for mtask in Mtasks:
thread = Mtasks[mtask]['thread']
pred_scaled_start = eval_start + int(
Mtasks[mtask]['predict_start'] * measured_scaling)
pred_scaled_end = eval_start + int(
(Mtasks[mtask]['predict_start'] +
Mtasks[mtask]['predict_cost']) * measured_scaling)
if pred_scaled_start == pred_scaled_end:
continue
if Mtasks:
# Predicted graph
for eval_start in EvalLoops:
eval_end = EvalLoops[eval_start]['end']
# Compute scale so predicted graph is of same width as eval
measured_scaling = (eval_end -
eval_start) / Global['predict_last_end']
# Predict mtasks that fill the time the eval occupied
for mtask in Mtasks:
thread = Mtasks[mtask]['thread']
pred_scaled_start = eval_start + int(
Mtasks[mtask]['predict_start'] * measured_scaling)
pred_scaled_end = eval_start + int(
(Mtasks[mtask]['predict_start'] +
Mtasks[mtask]['predict_cost']) * measured_scaling)
if pred_scaled_start == pred_scaled_end:
continue
sig = "predicted_thread%d_mtask" % thread
if sig not in vcd['sigs']['predicted_threads']:
vcd['sigs']['predicted_threads'][sig] = code
code += 1
mcode = vcd['sigs']['predicted_threads'][sig]
sig = "predicted_thread%d_mtask" % thread
if sig not in vcd['sigs']['predicted_threads']:
vcd['sigs']['predicted_threads'][sig] = code
code += 1
mcode = vcd['sigs']['predicted_threads'][sig]
vcd['values'][pred_scaled_start][mcode] = mtask
vcd['values'][pred_scaled_end][mcode] = None
vcd['values'][pred_scaled_start][mcode] = mtask
vcd['values'][pred_scaled_end][mcode] = None
parallelism['predicted'][pred_scaled_start] += 1
parallelism['predicted'][pred_scaled_end] -= 1
parallelism['predicted'][pred_scaled_start] += 1
parallelism['predicted'][pred_scaled_end] -= 1
# Parallelism graph
for measpred in ('measured', 'predicted'):
vcd['sigs']['Stats']["%s_parallelism" % measpred] = code
pcode = code
code += 1
value = 0
for time in sorted(parallelism[measpred].keys()):
value += parallelism[measpred][time]
vcd['values'][time][pcode] = value
# Parallelism graph
for measpred in ('measured', 'predicted'):
vcd['sigs']['Stats']["%s_parallelism" % measpred] = code
pcode = code
code += 1
value = 0
for time in sorted(parallelism[measpred].keys()):
value += parallelism[measpred][time]
vcd['values'][time][pcode] = value
# Create output file
fh.write("$version Generated by verilator_gantt $end\n")
@ -476,10 +496,10 @@ parser.add_argument('--no-vcd',
action='store_true')
parser.add_argument('--vcd',
help='filename for vcd output',
default='profile_threads.vcd')
default='profile_exec.vcd')
parser.add_argument('filename',
help='input profile_threads.dat filename to process',
default='profile_threads.dat')
help='input profile_exec.dat filename to process',
default='profile_exec.dat')
Args = parser.parse_args()


@ -19,3 +19,14 @@ Verilated_heavy.h
Option `--cdc`
The experimental `--cdc` option is believed to be generally unused and is
planned for removal no sooner than January 2023.
Option `--prof-threads`
The `--prof-threads` option has been superseded by the `--prof-exec` and
`--prof-pgo` options and is planned for removal no sooner than April 2023.
Verilated model options `+verilator+prof+threads+*`
The `+verilator+prof+threads+start`, `+verilator+prof+threads+window` and
`+verilator+prof+threads+file` options have been superseded by the
`+verilator+prof+exec+start`, `+verilator+prof+exec+window` and
`+verilator+prof+exec+file` options respectively and are planned for removal
no sooner than April 2023.


@ -38,33 +38,45 @@ Summary:
Display help and exit.
.. option:: +verilator+prof+threads+file+<filename>
.. option:: +verilator+prof+exec+file+<filename>
When a model was Verilated using :vlopt:`--prof-threads`, sets the
When a model was Verilated using :vlopt:`--prof-exec`, sets the
simulation runtime filename to dump to. Defaults to
:file:`profile_threads.dat`.
:file:`profile_exec.dat`.
.. option:: +verilator+prof+threads+start+<value>
.. option:: +verilator+prof+exec+start+<value>
When a model was Verilated using :vlopt:`--prof-threads`, the simulation
When a model was Verilated using :vlopt:`--prof-exec`, the simulation
runtime will wait until $time is at this value (expressed in units of
the time precision), then start the profiling warmup, then
capturing. Generally this should be set to some time that is well within
the normal operation of the simulation, i.e. outside of reset. If 0, the
dump is disabled. Defaults to 1.
.. option:: +verilator+prof+threads+window+<value>
.. option:: +verilator+prof+exec+window+<value>
When a model was Verilated using :vlopt:`--prof-threads`, after $time
reaches :vlopt:`+verilator+prof+threads+start+\<value\>`, Verilator will
When a model was Verilated using :vlopt:`--prof-exec`, after $time
reaches :vlopt:`+verilator+prof+exec+start+\<value\>`, Verilator will
warm up the profiling for this number of eval() calls, then will capture
the profiling of this number of eval() calls. Defaults to 2, which
makes sense for a single-clock-domain module where it's typical to want
to capture one posedge eval() and one negedge eval().
.. option:: +verilator+prof+threads+file+<filename>
Deprecated. Alias for :vlopt:`+verilator+prof+exec+file+\<filename\>`
.. option:: +verilator+prof+threads+start+<value>
Deprecated. Alias for :vlopt:`+verilator+prof+exec+start+\<value\>`
.. option:: +verilator+prof+threads+window+<value>
Deprecated. Alias for :vlopt:`+verilator+prof+exec+window+\<value\>`
.. option:: +verilator+prof+vlt+file+<filename>
When a model was Verilated using :vlopt:`--prof-threads`, sets the
When a model was Verilated using :vlopt:`--prof-pgo`, sets the
profile-guided optimization data runtime filename to dump to. Defaults
to :file:`profile.vlt`.


@ -845,10 +845,19 @@ Summary:
Using :vlopt:`--prof-cfuncs` also enables :vlopt:`--prof-c`.
.. option:: --prof-exec
Enable collection of an execution trace that can be converted into a gantt
chart with verilator_gantt. See :ref:`Execution Profiling`.
.. option:: --prof-pgo
Enable collection of profiling data for profile-guided verilation. Currently
this is only useful with :vlopt:`--threads`. See :ref:`Thread PGO`.
.. option:: --prof-threads
Enable gantt chart data collection for threaded builds. See :ref:`Thread
Profiling` and :ref:`Thread PGO`.
Deprecated. Same as --prof-exec and --prof-pgo together.
.. option:: --protect-key <key>


@ -72,7 +72,7 @@ verilator_gantt Arguments
.. option:: <filename>
The filename to read data from, defaults to "profile_threads.dat".
The filename to read data from, defaults to "profile_exec.dat".
.. option:: --help


@ -155,13 +155,13 @@ The Verilated executable may produce the following:
* - gmon.out
- GCC/clang code profiler output, often fed into :command:`verilator_profcfunc`
* - profile.vlt
- -profile data file for :ref:`Thread PGO`
* - profile_threads.dat
- -profile-threads data file for :command:`verilator_gantt`
- --prof-pgo data file for :ref:`Thread PGO`
* - profile_exec.dat
- --prof-exec data file for :command:`verilator_gantt`
Verilator_gantt may produce the following:
.. list-table::
* - profile_threads.vcd
* - profile_exec.vcd
- Gantt report waveform output


@ -279,26 +279,25 @@ To use profiling:
is being spent.
.. _Thread Profiling:
.. _Execution Profiling:
Thread Profiling
================
Execution Profiling
===================
When using multithreaded mode (:vlopt:`--threads`), it is useful to see
statistics and visualize how well the multiple CPUs are being utilized.
For performance optimization, it is useful to see statistics and visualize how
execution time is distributed in a verilated model.
With the :vlopt:`--prof-threads` option, Verilator will:
With the :vlopt:`--prof-exec` option, Verilator will:
* Add code to the Verilated model to record the start and end time of each
macro-task across a number of calls to eval. (What is a macro-task? See
the Verilator internals document (:file:`docs/internals.rst` in the
distribution.)
* Add code to the Verilated model to record execution flow.
* Add code to save profiling data in non-human-friendly form to the file
specified with :vlopt:`+verilator+prof+threads+file+\<filename\>`.
specified with :vlopt:`+verilator+prof+exec+file+\<filename\>`.
* Add code to save profiling data for thread profile-guided
optimization. See :ref:`Thread PGO`.
* In multi-threaded models, add code to record the start and end time of each
macro-task across a number of calls to eval. (What is a macro-task? See the
Verilator internals document (:file:`docs/internals.rst` in the
distribution.)
The :command:`verilator_gantt` program may then be run to transform the
saved profiling file into a nicer visual format and produce some related
@ -406,8 +405,8 @@ others as they prove beneficial.
Thread Profile-Guided Optimization
----------------------------------
Verilator supports thread profile-guided optimization (Thread PGO) to
improve multithreaded performance.
Verilator supports profile-guided optimization (verilation) of multi-threaded
models (Thread PGO) to improve performance.
When using multithreading, Verilator computes how long macro tasks take and
tries to balance those across threads. (What is a macro-task? See the
@ -417,13 +416,14 @@ balanced, leading to decreased performance. Thread PGO allows collecting
profiling data to replace the estimates and better optimize these
decisions.
To use Thread PGO, Verilate the model with the :vlopt:`--prof-threads`
option.
To use Thread PGO, Verilate the model with the :vlopt:`--prof-pgo` option. This
will add code to the Verilated model to save profiling data for profile-guided
optimization.
Run the model executable. When the executable exits, it will create a
profile.vlt file.
Rerun Verilator, optionally omitting the :vlopt:`--prof-threads` option,
Rerun Verilator, optionally omitting the :vlopt:`--prof-pgo` option,
and adding the profile.vlt generated earlier to the command line.
Note there is no Verilator equivalent to GCC's -fprofile-use. Verilator's


@ -265,7 +265,7 @@ This will limit memory to socket 0, and threads to cores 0, 1, 2, 3,
(presumably on socket 0) optimizing performance. Of course this must be
adjusted if you want another simulator using e.g. socket 1, or if you
Verilated with a different number of threads. To see what CPUs are
actually used, use :vlopt:`--prof-threads`.
actually used, use :vlopt:`--prof-exec`.
Multithreaded Verilog and Library Support


@ -301,7 +301,7 @@ prerequisites on other threads have finished.
The synchronization cost is cheap if the prereqs are done. If they're not,
fragmentation (idle CPU cores waiting) is possible. This is the major
source of overhead in this approach. The ``--prof-threads`` switch and the
source of overhead in this approach. The ``--prof-exec`` switch and the
``verilator_gantt`` script can visualize the time lost to such
fragmentation.


@ -2280,7 +2280,7 @@ VerilatedContext::VerilatedContext()
: m_impdatap{new VerilatedContextImpData} {
Verilated::lastContextp(this);
Verilated::threadContextp(this);
m_ns.m_profThreadsFilename = "profile_threads.dat";
m_ns.m_profExecFilename = "profile_exec.dat";
m_ns.m_profVltFilename = "profile.vlt";
m_fdps.resize(31);
std::fill(m_fdps.begin(), m_fdps.end(), static_cast<FILE*>(nullptr));
@ -2348,21 +2348,21 @@ void VerilatedContext::gotFinish(bool flag) VL_MT_SAFE {
const VerilatedLockGuard lock{m_mutex};
m_s.m_gotFinish = flag;
}
void VerilatedContext::profThreadsStart(vluint64_t flag) VL_MT_SAFE {
void VerilatedContext::profExecStart(vluint64_t flag) VL_MT_SAFE {
const VerilatedLockGuard lock{m_mutex};
m_ns.m_profThreadsStart = flag;
m_ns.m_profExecStart = flag;
}
void VerilatedContext::profThreadsWindow(vluint64_t flag) VL_MT_SAFE {
void VerilatedContext::profExecWindow(vluint64_t flag) VL_MT_SAFE {
const VerilatedLockGuard lock{m_mutex};
m_ns.m_profThreadsWindow = flag;
m_ns.m_profExecWindow = flag;
}
void VerilatedContext::profThreadsFilename(const std::string& flag) VL_MT_SAFE {
void VerilatedContext::profExecFilename(const std::string& flag) VL_MT_SAFE {
const VerilatedLockGuard lock{m_mutex};
m_ns.m_profThreadsFilename = flag;
m_ns.m_profExecFilename = flag;
}
std::string VerilatedContext::profThreadsFilename() const VL_MT_SAFE {
std::string VerilatedContext::profExecFilename() const VL_MT_SAFE {
const VerilatedLockGuard lock{m_mutex};
return m_ns.m_profThreadsFilename;
return m_ns.m_profExecFilename;
}
void VerilatedContext::profVltFilename(const std::string& flag) VL_MT_SAFE {
const VerilatedLockGuard lock{m_mutex};
@ -2524,12 +2524,15 @@ void VerilatedContextImp::commandArgVl(const std::string& arg) {
"Exiting due to command line argument (not an error)");
} else if (arg == "+verilator+noassert") {
assertOn(false);
} else if (commandArgVlUint64(arg, "+verilator+prof+threads+start+", u64)) {
profThreadsStart(u64);
} else if (commandArgVlUint64(arg, "+verilator+prof+threads+window+", u64, 1)) {
profThreadsWindow(u64);
} else if (commandArgVlString(arg, "+verilator+prof+threads+file+", str)) {
profThreadsFilename(str);
} else if (commandArgVlUint64(arg, "+verilator+prof+exec+start+", u64)
|| commandArgVlUint64(arg, "+verilator+prof+threads+start+", u64)) {
profExecStart(u64);
} else if (commandArgVlUint64(arg, "+verilator+prof+exec+window+", u64, 1)
|| commandArgVlUint64(arg, "+verilator+prof+threads+window+", u64, 1)) {
profExecWindow(u64);
} else if (commandArgVlString(arg, "+verilator+prof+exec+file+", str)
|| commandArgVlString(arg, "+verilator+prof+threads+file+", str)) {
profExecFilename(str);
} else if (commandArgVlString(arg, "+verilator+prof+vlt+file+", str)) {
profVltFilename(str);
} else if (commandArgVlUint64(arg, "+verilator+rand+reset+", u64, 0, 2)) {


@ -344,10 +344,10 @@ protected:
struct NonSerialized { // Non-serialized information
// These are reloaded from on command-line settings, so do not need to persist
// Fast path
vluint64_t m_profThreadsStart = 1; // +prof+threads starting time
vluint32_t m_profThreadsWindow = 2; // +prof+threads window size
vluint64_t m_profExecStart = 1; // +prof+exec+start time
vluint32_t m_profExecWindow = 2; // +prof+exec+window size
// Slow path
std::string m_profThreadsFilename; // +prof+threads filename
std::string m_profExecFilename; // +prof+exec+file filename
std::string m_profVltFilename; // +prof+vlt filename
} m_ns;
@ -518,13 +518,13 @@ public: // But for internal use only
std::string dumpfile() const VL_MT_SAFE_EXCLUDES(m_timeDumpMutex);
std::string dumpfileCheck() const VL_MT_SAFE_EXCLUDES(m_timeDumpMutex);
// Internal: --prof-threads related settings
void profThreadsStart(vluint64_t flag) VL_MT_SAFE;
vluint64_t profThreadsStart() const VL_MT_SAFE { return m_ns.m_profThreadsStart; }
void profThreadsWindow(vluint64_t flag) VL_MT_SAFE;
vluint32_t profThreadsWindow() const VL_MT_SAFE { return m_ns.m_profThreadsWindow; }
void profThreadsFilename(const std::string& flag) VL_MT_SAFE;
std::string profThreadsFilename() const VL_MT_SAFE;
// Internal: --prof-exec related settings
void profExecStart(vluint64_t flag) VL_MT_SAFE;
vluint64_t profExecStart() const VL_MT_SAFE { return m_ns.m_profExecStart; }
void profExecWindow(vluint64_t flag) VL_MT_SAFE;
vluint32_t profExecWindow() const VL_MT_SAFE { return m_ns.m_profExecWindow; }
void profExecFilename(const std::string& flag) VL_MT_SAFE;
std::string profExecFilename() const VL_MT_SAFE;
void profVltFilename(const std::string& flag) VL_MT_SAFE;
std::string profVltFilename() const VL_MT_SAFE;


@ -112,15 +112,6 @@ extern WDataOutP VL_RAND_RESET_W(int obits, WDataOutP outwp);
/// Zero reset a signal (slow - else use VL_ZERO_W)
extern WDataOutP VL_ZERO_RESET_W(int obits, WDataOutP outwp);
#if VL_THREADED
/// Return high-precision counter for profiling, or 0x0 if not available
inline QData VL_RDTSC_Q() {
vluint64_t val;
VL_RDTSC(val);
return val;
}
#endif
extern void VL_PRINTTIMESCALE(const char* namep, const char* timeunitp,
const VerilatedContext* contextp) VL_MT_SAFE;


@ -0,0 +1,191 @@
// -*- mode: C++; c-file-style: "cc-mode" -*-
//=============================================================================
//
// Code available from: https://verilator.org
//
// Copyright 2012-2022 by Wilson Snyder. This program is free software; you can
// redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License
// Version 2.0.
// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
//
//=============================================================================
///
/// \file
/// \brief Verilated run-time profiling implementation code
///
//=============================================================================
#include "verilatedos.h"
#include "verilated_profiler.h"
#if VL_THREADED
#include "verilated_threads.h"
#endif
#include <fstream>
#include <string>
//=============================================================================
// Globals
// Internal note: Globals may multi-construct, see verilated.cpp top.
VL_THREAD_LOCAL VlExecutionProfiler::ExecutionTrace VlExecutionProfiler::t_trace;
constexpr const char* const VlExecutionRecord::s_ascii[];
//=============================================================================
// VlPgoProfiler implementation
vluint16_t VlExecutionRecord::getcpu() {
#if defined(__linux)
return sched_getcpu(); // TODO: this is a system call. Not exactly cheap.
#elif defined(__APPLE__) && !defined(__arm64__)
vluint32_t info[4];
__cpuid_count(1, 0, info[0], info[1], info[2], info[3]);
// info[1] is EBX, bits 24-31 are APIC ID
if ((info[3] & (1 << 9)) == 0) {
return -1; // no APIC on chip
} else {
return (unsigned)info[1] >> 24;
}
#elif defined(_WIN32)
return GetCurrentProcessorNumber();
#else
return 0;
#endif
}
//=============================================================================
// VlExecutionProfiler implementation
template <size_t N> size_t roundUptoMultipleOf(size_t value) {
static_assert((N & (N - 1)) == 0, "'N' must be a power of 2");
size_t mask = N - 1;
return (value + mask) & ~mask;
}
VlExecutionProfiler::VlExecutionProfiler() {
// Setup profiling on main thread
setupThread(0);
}
void VlExecutionProfiler::configure(const VerilatedContext& context) {
if (VL_UNLIKELY(m_enabled)) {
--m_windowCount;
if (VL_UNLIKELY(m_windowCount == context.profExecWindow())) {
VL_DEBUG_IF(VL_DBG_MSGF("+ profile start collection\n"););
clear(); // Clear the profile after the cache warm-up cycles.
m_tickBegin = VL_CPU_TICK();
} else if (VL_UNLIKELY(m_windowCount == 0)) {
const vluint64_t tickEnd = VL_CPU_TICK();
VL_DEBUG_IF(VL_DBG_MSGF("+ profile end\n"););
const std::string& fileName = context.profExecFilename();
dump(fileName.c_str(), tickEnd);
m_enabled = false;
}
return;
}
const vluint64_t startReq = context.profExecStart() + 1; // + 1, so we can start at time 0
if (VL_UNLIKELY(m_lastStartReq < startReq && VL_TIME_Q() >= context.profExecStart())) {
VL_DEBUG_IF(VL_DBG_MSGF("+ profile start warmup\n"););
VL_DEBUG_IF(assert(m_windowCount == 0););
m_enabled = true;
m_windowCount = context.profExecWindow() * 2;
m_lastStartReq = startReq;
}
}
void VlExecutionProfiler::setupThread(uint32_t threadId) {
// Reserve some space in the thread-local profiling buffer, in order to try to avoid malloc
// while profiling.
t_trace.reserve(RESERVED_TRACE_CAPACITY);
// Register thread-local buffer in list of all buffers
{
const VerilatedLockGuard lock{m_mutex};
bool exists = !m_traceps.emplace(threadId, &t_trace).second;
assert(!exists);
}
}
void VlExecutionProfiler::clear() VL_MT_SAFE_EXCLUDES(m_mutex) {
const VerilatedLockGuard lock{m_mutex};
for (const auto& pair : m_traceps) {
ExecutionTrace* const tracep = pair.second;
const size_t reserve = roundUptoMultipleOf<RESERVED_TRACE_CAPACITY>(tracep->size());
tracep->clear();
tracep->reserve(reserve);
}
}
void VlExecutionProfiler::dump(const char* filenamep, vluint64_t tickEnd)
VL_MT_SAFE_EXCLUDES(m_mutex) {
const VerilatedLockGuard lock{m_mutex};
VL_DEBUG_IF(VL_DBG_MSGF("+prof+exec writing to '%s'\n", filenamep););
FILE* const fp = std::fopen(filenamep, "w");
if (VL_UNLIKELY(!fp)) { VL_FATAL_MT(filenamep, 0, "", "+prof+exec+file file not writable"); }
// TODO Perhaps merge with verilated_coverage output format, so can
// have a common merging and reporting tool, etc.
fprintf(fp, "VLPROFVERSION 2.0 # Verilator execution profile version 2.0\n");
fprintf(fp, "VLPROF arg +verilator+prof+exec+start+%" PRIu64 "\n",
Verilated::threadContextp()->profExecStart());
fprintf(fp, "VLPROF arg +verilator+prof+exec+window+%u\n",
Verilated::threadContextp()->profExecWindow());
const unsigned threads = static_cast<unsigned>(m_traceps.size());
fprintf(fp, "VLPROF stat threads %u\n", threads);
#ifdef VL_THREADED
fprintf(fp, "VLPROF stat yields %" PRIu64 "\n", VlMTaskVertex::yields());
#endif
// Copy /proc/cpuinfo into this output so verilator_gantt can be run on
// a different machine
{
const std::unique_ptr<std::ifstream> ifp{new std::ifstream("/proc/cpuinfo")};
if (!ifp->fail()) {
std::string line;
while (std::getline(*ifp, line)) { fprintf(fp, "VLPROFPROC %s\n", line.c_str()); }
}
}
for (const auto& pair : m_traceps) {
const uint32_t threadId = pair.first;
ExecutionTrace* const tracep = pair.second;
fprintf(fp, "VLPROFTHREAD %" PRIu32 "\n", threadId);
for (const VlExecutionRecord& er : *tracep) {
const char* const name = VlExecutionRecord::s_ascii[static_cast<uint8_t>(er.m_type)];
const vluint64_t time = er.m_tick - m_tickBegin;
fprintf(fp, "VLPROFEXEC %s %" PRIu64, name, time);
switch (er.m_type) {
case VlExecutionRecord::Type::EVAL_BEGIN:
case VlExecutionRecord::Type::EVAL_END:
case VlExecutionRecord::Type::EVAL_LOOP_BEGIN:
case VlExecutionRecord::Type::EVAL_LOOP_END:
// No payload
fprintf(fp, "\n");
break;
case VlExecutionRecord::Type::MTASK_BEGIN: {
const auto& payload = er.m_payload.mtaskBegin;
fprintf(fp, " id %u predictStart %u cpu %u\n", payload.m_id,
payload.m_predictStart, payload.m_cpu);
break;
}
case VlExecutionRecord::Type::MTASK_END: {
const auto& payload = er.m_payload.mtaskEnd;
fprintf(fp, " id %u predictCost %u\n", payload.m_id, payload.m_predictCost);
break;
}
default: abort(); // LCOV_EXCL_LINE
}
}
}
fprintf(fp, "VLPROF stat ticks %" PRIu64 "\n", tickEnd - m_tickBegin);
std::fclose(fp);
}
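
For reference, a dump produced by the code above looks roughly like the
following (the VLPROFPROC copy of /proc/cpuinfo is omitted, and all tick, id,
and prediction values are purely illustrative):

VLPROFVERSION 2.0 # Verilator execution profile version 2.0
VLPROF arg +verilator+prof+exec+start+1
VLPROF arg +verilator+prof+exec+window+2
VLPROF stat threads 2
VLPROF stat yields 0
VLPROFTHREAD 0
VLPROFEXEC EVAL_BEGIN 120
VLPROFEXEC EVAL_LOOP_BEGIN 150
VLPROFEXEC MTASK_BEGIN 180 id 7 predictStart 120 cpu 3
VLPROFEXEC MTASK_END 260 id 7 predictCost 95
VLPROFEXEC EVAL_LOOP_END 380
VLPROFEXEC EVAL_END 400
VLPROFTHREAD 1
VLPROFEXEC MTASK_BEGIN 190 id 8 predictStart 130 cpu 5
VLPROFEXEC MTASK_END 300 id 8 predictCost 110
VLPROF stat ticks 123456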


@ -12,7 +12,7 @@
//=============================================================================
///
/// \file
/// \brief Verilated general profiling header
/// \brief Verilated run-time profiling header
///
/// This file is not part of the Verilated public-facing API.
/// It is only for internal use by Verilated library routines.
@ -23,58 +23,204 @@
#define VERILATOR_VERILATED_PROFILER_H_
#include "verilatedos.h"
#include "verilated.h" // for VerilatedMutex and clang annotations
#include <deque>
#ifndef VL_PROFILER
#error "verilated_profiler.h/cpp expects VL_PROFILER (from --prof-{exec, pgo}"
#endif
#include "verilated.h"
#include <array>
#include <atomic>
#include <cassert>
#include <string>
#include <type_traits>
#include <vector>
class VlExecutionProfiler;
//=============================================================================
// Macros to simplify generated code
#define VL_EXEC_TRACE_ADD_RECORD(vlSymsp) \
if (VL_UNLIKELY((vlSymsp)->__Vm_executionProfiler.enabled())) \
(vlSymsp)->__Vm_executionProfiler.addRecord()
//=============================================================================
// Return high-precision counter for profiling, or 0x0 if not available
VL_ATTR_ALWINLINE
inline QData VL_CPU_TICK() {
vluint64_t val;
VL_GET_CPU_TICK(val);
return val;
}
//=============================================================================
// Private class used by VlExecutionProfiler
#define _VL_FOREACH_APPLY(macro, arg) macro(arg, #arg)
// clang-format off
#define FOREACH_VlExecutionRecord_TYPE(macro) \
_VL_FOREACH_APPLY(macro, EVAL_BEGIN) \
_VL_FOREACH_APPLY(macro, EVAL_END) \
_VL_FOREACH_APPLY(macro, EVAL_LOOP_BEGIN) \
_VL_FOREACH_APPLY(macro, EVAL_LOOP_END) \
_VL_FOREACH_APPLY(macro, MTASK_BEGIN) \
_VL_FOREACH_APPLY(macro, MTASK_END)
// clang-format on
class VlExecutionRecord final {
friend class VlExecutionProfiler;
// TYPES
enum class Type : uint8_t {
#define VL_FOREACH_MACRO(id, name) id,
FOREACH_VlExecutionRecord_TYPE(VL_FOREACH_MACRO)
#undef VL_FOREACH_MACRO
};
static constexpr const char* const s_ascii[] = {
#define VL_FOREACH_MACRO(id, name) name,
FOREACH_VlExecutionRecord_TYPE(VL_FOREACH_MACRO)
#undef VL_FOREACH_MACRO
};
union Payload {
struct {
vluint32_t m_id; // MTask id
vluint32_t m_predictStart; // Time scheduler predicted would start
vluint32_t m_cpu; // Executing CPU id
} mtaskBegin;
struct {
vluint32_t m_id; // MTask id
vluint32_t m_predictCost; // How long scheduler predicted would take
} mtaskEnd;
};
// STATE
// Layout below allows efficient packing.
const vluint64_t m_tick = VL_CPU_TICK(); // Tick at construction
Payload m_payload; // The record payload
Type m_type; // The record type
static_assert(alignof(vluint64_t) >= alignof(Payload), "Padding not allowed");
static_assert(alignof(Payload) >= alignof(Type), "Padding not allowed");
static vluint16_t getcpu(); // Return currently executing CPU id
// Profile record, private class used only by this header
class VerilatedProfilerRec final {
const std::string m_name; // Hashed name of mtask/etc
const size_t m_counterNumber = 0; // Which counter has data
public:
// CONSTRUCTOR
VlExecutionRecord() = default;
// METHODS
VerilatedProfilerRec(size_t counterNumber, const std::string& name)
: m_name{name}
, m_counterNumber{counterNumber} {}
VerilatedProfilerRec() = default;
size_t counterNumber() const { return m_counterNumber; }
std::string name() const { return m_name; }
void evalBegin() { m_type = Type::EVAL_BEGIN; }
void evalEnd() { m_type = Type::EVAL_END; }
void evalLoopBegin() { m_type = Type::EVAL_LOOP_BEGIN; }
void evalLoopEnd() { m_type = Type::EVAL_LOOP_END; }
void mtaskBegin(vluint32_t id, vluint32_t predictStart) {
m_payload.mtaskBegin.m_id = id;
m_payload.mtaskBegin.m_predictStart = predictStart;
m_payload.mtaskBegin.m_cpu = getcpu();
m_type = Type::MTASK_BEGIN;
}
void mtaskEnd(vluint32_t id, vluint32_t predictCost) {
m_payload.mtaskEnd.m_id = id;
m_payload.mtaskEnd.m_predictCost = predictCost;
m_type = Type::MTASK_END;
}
};
// Create some number of bucketed profilers
template <std::size_t T_Entries> class VerilatedProfiler final {
// Counters are stored packed, all together, versus in VerilatedProfilerRec to
// reduce cache effects
std::array<vluint64_t, T_Entries> m_counters{}; // Time spent on this record
std::deque<VerilatedProfilerRec> m_records; // Record information
static_assert(std::is_trivially_destructible<VlExecutionRecord>::value,
"VlExecutionRecord should be trivially destructible for fast buffer clearing");
//=============================================================================
// VlExecutionProfiler is for collecting profiling data about model execution
class VlExecutionProfiler final {
// CONSTANTS
// In order to try to avoid dynamic memory allocations during the actual profiling phase,
// trace buffers are pre-allocated to be able to hold [a multiple] of this many records.
static constexpr size_t RESERVED_TRACE_CAPACITY = 4096;
// TYPES
// Execution traces are recorded into thread local vectors. We can append records of profiling
// events to this vector with very low overhead, and then dump them out later. This prevents
// the overhead of printf/malloc/IO from corrupting the profiling data. It's super cheap to
// append a VlProfileRec struct on the end of a pre-allocated vector; this is the only cost we
// pay in real-time during a profiling cycle. Internal note: Globals may multi-construct, see
// verilated.cpp top.
using ExecutionTrace = std::vector<VlExecutionRecord>;
// STATE
static VL_THREAD_LOCAL ExecutionTrace t_trace; // thread-local trace buffers
VerilatedMutex m_mutex;
// Map from thread id to &t_trace of given thread
std::map<uint32_t, ExecutionTrace*> m_traceps VL_GUARDED_BY(m_mutex);
bool m_enabled = false; // Is profiling currently enabled
vluint64_t m_tickBegin = 0; // Sample time (rdtsc() on x86) at beginning of collection
vluint64_t m_lastStartReq = 0; // Last requested profiling start (in simulation time)
vluint32_t m_windowCount = 0; // Track our position in the cache warmup and profile window
public:
// CONSTRUCTOR
VlExecutionProfiler();
// METHODS
// Is profiling enabled
inline bool enabled() const { return m_enabled; }
// Append a trace record to the trace buffer of the current thread
inline VlExecutionRecord& addRecord() {
t_trace.emplace_back();
return t_trace.back();
}
// Configure profiler (called in beginning of 'eval')
void configure(const VerilatedContext&);
// Setup profiling on a particular thread;
void setupThread(uint32_t threadId);
// Clear all profiling data
void clear() VL_MT_SAFE_EXCLUDES(m_mutex);
// Write profiling data into file
void dump(const char* filenamep, vluint64_t tickEnd) VL_MT_SAFE_EXCLUDES(m_mutex);
};
//=============================================================================
// VlPgoProfiler is for collecting profiling data for PGO
template <std::size_t T_Entries> class VlPgoProfiler final {
// TYPES
struct Record final {
const std::string m_name; // Hashed name of mtask/etc
const size_t m_counterNumber = 0; // Which counter has data
};
// Counters are stored packed, all together to reduce cache effects
std::array<vluint64_t, T_Entries> m_counters; // Time spent on this record
std::vector<Record> m_records; // Record information
public:
// METHODS
VerilatedProfiler() = default;
~VerilatedProfiler() = default;
VlPgoProfiler() = default;
~VlPgoProfiler() = default;
void write(const char* modelp, const std::string& filename) VL_MT_SAFE;
void addCounter(size_t counter, const std::string& name) {
VL_DEBUG_IF(assert(counter < T_Entries););
m_records.emplace_back(VerilatedProfilerRec{counter, name});
m_records.emplace_back(Record{name, counter});
}
void startCounter(size_t counter) {
vluint64_t val;
VL_RDTSC(val);
// -= so when we add end time in stopCounter, we already subtracted
// out, without needing to hold another temporary
m_counters[counter] -= val;
}
void stopCounter(size_t counter) {
vluint64_t val;
VL_RDTSC(val);
m_counters[counter] += val;
// -= so when we add end time in stopCounter, the net effect is adding the difference,
// without needing to hold onto a temporary
m_counters[counter] -= VL_CPU_TICK();
}
void stopCounter(size_t counter) { m_counters[counter] += VL_CPU_TICK(); }
};
template <std::size_t T_Entries>
void VerilatedProfiler<T_Entries>::write(const char* modelp,
const std::string& filename) VL_MT_SAFE {
void VlPgoProfiler<T_Entries>::write(const char* modelp, const std::string& filename) VL_MT_SAFE {
static VerilatedMutex s_mutex;
const VerilatedLockGuard lock{s_mutex};
@ -88,14 +234,9 @@ void VerilatedProfiler<T_Entries>::write(const char* modelp,
VL_DEBUG_IF(VL_DBG_MSGF("+prof+vlt+file writing to '%s'\n", filename.c_str()););
FILE* fp = nullptr;
if (!s_firstCall) fp = std::fopen(filename.c_str(), "a");
if (VL_UNLIKELY(!fp))
fp = std::fopen(filename.c_str(), "w"); // firstCall, or doesn't exist yet
FILE* const fp = std::fopen(filename.c_str(), s_firstCall ? "w" : "a");
if (VL_UNLIKELY(!fp)) {
VL_FATAL_MT(filename.c_str(), 0, "", "+prof+vlt+file file not writable");
// cppcheck-suppress resourceLeak // bug, doesn't realize fp is nullptr
return; // LCOV_EXCL_LINE
}
s_firstCall = false;
@ -104,10 +245,9 @@ void VerilatedProfiler<T_Entries>::write(const char* modelp,
fprintf(fp, "// Verilated model profile-guided optimization data dump file\n");
fprintf(fp, "`verilator_config\n");
for (const auto& it : m_records) {
const std::string& name = it.name();
for (const Record& rec : m_records) {
fprintf(fp, "profile_data -model \"%s\" -mtask \"%s\" -cost 64'd%" PRIu64 "\n", modelp,
name.c_str(), m_counters[it.counterNumber()]);
rec.m_name.c_str(), m_counters[rec.m_counterNumber]);
}
std::fclose(fp);


@ -24,8 +24,11 @@
#include "verilatedos.h"
#include "verilated_threads.h"
#ifdef VL_PROFILER
#include "verilated_profiler.h"
#endif
#include <cstdio>
#include <fstream>
#include <memory>
#include <string>
@ -36,8 +39,6 @@
std::atomic<vluint64_t> VlMTaskVertex::s_yields;
VL_THREAD_LOCAL VlThreadPool::ProfileTrace* VlThreadPool::t_profilep = nullptr;
//=============================================================================
// VlMTaskVertex
@ -50,12 +51,11 @@ VlMTaskVertex::VlMTaskVertex(vluint32_t upstreamDepCount)
//=============================================================================
// VlWorkerThread
VlWorkerThread::VlWorkerThread(VlThreadPool* poolp, VerilatedContext* contextp, bool profiling)
VlWorkerThread::VlWorkerThread(uint32_t threadId, VerilatedContext* contextp,
VlExecutionProfiler* profilerp)
: m_ready_size{0}
, m_poolp{poolp}
, m_profiling{profiling} // Must init this last -- after setting up fields that it might read:
, m_exiting{false}
, m_cthread{startWorker, this}
, m_cthread{startWorker, this, threadId, profilerp}
, m_contextp{contextp} {}
VlWorkerThread::~VlWorkerThread() {
@ -66,8 +66,6 @@ VlWorkerThread::~VlWorkerThread() {
}
void VlWorkerThread::workerLoop() {
if (VL_UNLIKELY(m_profiling)) m_poolp->setupProfilingClientThread();
ExecRec work;
work.m_fnp = nullptr;
@ -82,143 +80,42 @@ void VlWorkerThread::workerLoop() {
work.m_fnp = nullptr;
}
}
if (VL_UNLIKELY(m_profiling)) m_poolp->tearDownProfilingClientThread();
}
void VlWorkerThread::startWorker(VlWorkerThread* workerp) {
void VlWorkerThread::startWorker(VlWorkerThread* workerp, uint32_t threadId,
VlExecutionProfiler* profilerp) {
Verilated::threadContextp(workerp->m_contextp);
#ifdef VL_PROFILER
// Note: setupThread is not defined without VL_PROFILER, hence the #ifdef. Still, we might
// not be profiling execution (e.g.: PGO only), so profilerp might still be nullptr.
if (profilerp) profilerp->setupThread(threadId);
#endif
workerp->workerLoop();
}
//=============================================================================
// VlThreadPool
VlThreadPool::VlThreadPool(VerilatedContext* contextp, int nThreads, bool profiling)
: m_profiling{profiling} {
VlThreadPool::VlThreadPool(VerilatedContext* contextp, int nThreads,
VlExecutionProfiler* profiler) {
// --threads N passes nThreads=N-1, as the "main" thread counts as 1
++nThreads;
const unsigned cpus = std::thread::hardware_concurrency();
if (cpus < nThreads + 1) {
if (cpus < nThreads) {
static int warnedOnce = 0;
if (!warnedOnce++) {
VL_PRINTF_MT("%%Warning: System has %u CPUs but model Verilated with"
" --threads %d; may run slow.\n",
cpus, nThreads + 1);
cpus, nThreads);
}
}
// Create'em
for (int i = 0; i < nThreads; ++i) {
m_workers.push_back(new VlWorkerThread{this, contextp, profiling});
// Create worker threads
for (uint32_t threadId = 1; threadId < nThreads; ++threadId) {
m_workers.push_back(new VlWorkerThread{threadId, contextp, profiler});
}
// Set up a profile buffer for the current thread too -- on the
// assumption that it's the same thread that calls eval and may be
// donated to run mtasks during the eval.
if (VL_UNLIKELY(m_profiling)) setupProfilingClientThread();
}
VlThreadPool::~VlThreadPool() {
// Each ~WorkerThread will wait for its thread to exit.
for (auto& i : m_workers) delete i;
if (VL_UNLIKELY(m_profiling)) tearDownProfilingClientThread();
}
void VlThreadPool::tearDownProfilingClientThread() {
assert(t_profilep);
delete t_profilep;
t_profilep = nullptr;
}
void VlThreadPool::setupProfilingClientThread() VL_MT_SAFE_EXCLUDES(m_mutex) {
assert(!t_profilep);
t_profilep = new ProfileTrace;
// Reserve some space in the thread-local profiling buffer;
// try not to malloc while collecting profiling.
t_profilep->reserve(4096);
{
const VerilatedLockGuard lock{m_mutex};
m_allProfiles.insert(t_profilep);
}
}
void VlThreadPool::profileAppendAll(const VlProfileRec& rec) VL_MT_SAFE_EXCLUDES(m_mutex) {
const VerilatedLockGuard lock{m_mutex};
for (const auto& profilep : m_allProfiles) {
// Every thread's profile trace gets a copy of rec.
profilep->emplace_back(rec);
}
}
void VlThreadPool::profileDump(const char* filenamep, vluint64_t tickStart, vluint64_t tickEnd)
VL_MT_SAFE_EXCLUDES(m_mutex) {
const VerilatedLockGuard lock{m_mutex};
VL_DEBUG_IF(VL_DBG_MSGF("+prof+threads writing to '%s'\n", filenamep););
FILE* const fp = std::fopen(filenamep, "w");
if (VL_UNLIKELY(!fp)) {
VL_FATAL_MT(filenamep, 0, "", "+prof+threads+file file not writable");
// cppcheck-suppress resourceLeak // bug, doesn't realize fp is nullptr
return; // LCOV_EXCL_LINE
}
// TODO Perhaps merge with verilated_coverage output format, so can
// have a common merging and reporting tool, etc.
fprintf(fp, "VLPROFTHREAD 1.1 # Verilator thread profile dump version 1.1\n");
fprintf(fp, "VLPROF arg --threads %" PRIu64 "\n", vluint64_t(m_workers.size() + 1));
fprintf(fp, "VLPROF arg +verilator+prof+threads+start+%" PRIu64 "\n",
Verilated::threadContextp()->profThreadsStart());
fprintf(fp, "VLPROF arg +verilator+prof+threads+window+%u\n",
Verilated::threadContextp()->profThreadsWindow());
fprintf(fp, "VLPROF stat yields %" PRIu64 "\n", VlMTaskVertex::yields());
// Copy /proc/cpuinfo into this output so verilator_gantt can be run on
// a different machine
{
const std::unique_ptr<std::ifstream> ifp{new std::ifstream("/proc/cpuinfo")};
if (!ifp->fail()) {
std::string line;
while (std::getline(*ifp, line)) { fprintf(fp, "VLPROFPROC %s\n", line.c_str()); }
}
}
vluint32_t thread_id = 0;
for (const auto& pi : m_allProfiles) {
++thread_id;
bool printing = false; // False while in warmup phase
for (const auto& ei : *pi) {
switch (ei.m_type) {
case VlProfileRec::TYPE_BARRIER: //
printing = true;
break;
case VlProfileRec::TYPE_EVAL:
if (!printing) break;
fprintf(fp,
"VLPROF eval start %" PRIu64 " elapsed %" PRIu64 " cpu %u on thread %u\n",
ei.m_startTime - tickStart, (ei.m_endTime - ei.m_startTime), ei.m_cpu,
thread_id);
break;
case VlProfileRec::TYPE_EVAL_LOOP:
if (!printing) break;
fprintf(fp,
"VLPROF eval_loop start %" PRIu64 " elapsed %" PRIu64
" cpu %u on thread %u\n",
ei.m_startTime - tickStart, (ei.m_endTime - ei.m_startTime), ei.m_cpu,
thread_id);
break;
case VlProfileRec::TYPE_MTASK_RUN:
if (!printing) break;
fprintf(fp,
"VLPROF mtask %d"
" start %" PRIu64 " elapsed %" PRIu64
" predict_start %u predict_cost %u cpu %u on thread %u\n",
ei.m_mtaskId, ei.m_startTime - tickStart, (ei.m_endTime - ei.m_startTime),
ei.m_predictStart, ei.m_predictCost, ei.m_cpu, thread_id);
break;
default: assert(false); break; // LCOV_EXCL_LINE
}
}
}
fprintf(fp, "VLPROF stat ticks %" PRIu64 "\n", tickEnd - tickStart);
std::fclose(fp);
}


@ -35,8 +35,10 @@
#error "verilated_threads.h/cpp expected VL_THREADED (from verilator --threads)"
#endif
#include <atomic>
#include <condition_variable>
#include <set>
#include <thread>
#include <vector>
// clang-format off
@ -127,64 +129,7 @@ public:
}
};
// Profiling support
class VlProfileRec final {
protected:
friend class VlThreadPool;
enum VlProfileE { TYPE_MTASK_RUN, TYPE_EVAL, TYPE_EVAL_LOOP, TYPE_BARRIER };
// Layout below allows efficient packing.
// Leave endTime first, so no math needed to calculate address in endRecord
vluint64_t m_endTime = 0; // Tick at end of execution
vluint64_t m_startTime = 0; // Tick at start of execution
vluint32_t m_mtaskId = 0; // Mtask we're logging
vluint32_t m_predictStart = 0; // Time scheduler predicted would start
vluint32_t m_predictCost = 0; // How long scheduler predicted would take
VlProfileE m_type = TYPE_BARRIER; // Record type
unsigned m_cpu; // Execution CPU number (at start anyways)
public:
class Barrier {};
VlProfileRec() = default;
explicit VlProfileRec(Barrier) { m_cpu = getcpu(); }
void startEval(vluint64_t time) {
m_type = VlProfileRec::TYPE_EVAL;
m_startTime = time;
m_cpu = getcpu();
}
void startEvalLoop(vluint64_t time) {
m_type = VlProfileRec::TYPE_EVAL_LOOP;
m_startTime = time;
m_cpu = getcpu();
}
void startRecord(vluint64_t time, vluint32_t mtask, vluint32_t predictStart,
vluint32_t predictCost) {
m_type = VlProfileRec::TYPE_MTASK_RUN;
m_mtaskId = mtask;
m_predictStart = predictStart;
m_predictCost = predictCost;
m_startTime = time;
m_cpu = getcpu();
}
void endRecord(vluint64_t time) { m_endTime = time; }
static int getcpu() { // Return current executing CPU
#if defined(__linux)
return sched_getcpu();
#elif defined(__APPLE__) && !defined(__arm64__)
vluint32_t info[4];
__cpuid_count(1, 0, info[0], info[1], info[2], info[3]);
// info[1] is EBX, bits 24-31 are APIC ID
if ((info[3] & (1 << 9)) == 0) {
return -1; // no APIC on chip
} else {
return (unsigned)info[1] >> 24;
}
#elif defined(_WIN32)
return GetCurrentProcessorNumber();
#else
return 0;
#endif
}
};
class VlExecutionProfiler;
class VlThreadPool;
class VlWorkerThread final {
@ -217,9 +162,6 @@ private:
// Store the size atomically, so we can spin wait
std::atomic<size_t> m_ready_size;
VlThreadPool* const m_poolp; // Our associated thread pool
const bool m_profiling; // Is profiling enabled?
std::atomic<bool> m_exiting; // Worker thread should exit
std::thread m_cthread; // Underlying C++ thread record
VerilatedContext* const m_contextp; // Context for spawned thread
@ -228,7 +170,8 @@ private:
public:
// CONSTRUCTORS
explicit VlWorkerThread(VlThreadPool* poolp, VerilatedContext* contextp, bool profiling);
explicit VlWorkerThread(uint32_t threadId, VerilatedContext* contextp,
VlExecutionProfiler* profilerp);
~VlWorkerThread();
// METHODS
@ -265,34 +208,20 @@ public:
if (notify) m_cv.notify_one();
}
void workerLoop();
static void startWorker(VlWorkerThread* workerp);
static void startWorker(VlWorkerThread* workerp, uint32_t threadId,
VlExecutionProfiler* profilerp);
};
class VlThreadPool final {
// TYPES
using ProfileTrace = std::vector<VlProfileRec>;
// MEMBERS
std::vector<VlWorkerThread*> m_workers; // our workers
const bool m_profiling; // is profiling enabled?
// Support profiling -- we can append records of profiling events
// to this vector with very low overhead, and then dump them out
// later. This prevents the overhead of printf/malloc/IO from
// corrupting the profiling data. It's super cheap to append
// a VlProfileRec struct on the end of a pre-allocated vector;
// this is the only cost we pay in real-time during a profiling cycle.
// Internal note: Globals may multi-construct, see verilated.cpp top.
static VL_THREAD_LOCAL ProfileTrace* t_profilep;
std::set<ProfileTrace*> m_allProfiles VL_GUARDED_BY(m_mutex);
VerilatedMutex m_mutex;
public:
// CONSTRUCTORS
// Construct a thread pool with 'nThreads' dedicated threads. The thread
// pool will create these threads and make them available to execute tasks
// via this->workerp(index)->addTask(...)
VlThreadPool(VerilatedContext* contextp, int nThreads, bool profiling);
VlThreadPool(VerilatedContext* contextp, int nThreads, VlExecutionProfiler* profilerp);
~VlThreadPool();
// METHODS
@ -302,17 +231,6 @@ public:
assert(index < m_workers.size());
return m_workers[index];
}
inline VlProfileRec* profileAppend() {
t_profilep->emplace_back();
return &(t_profilep->back());
}
void profileAppendAll(const VlProfileRec& rec) VL_MT_SAFE_EXCLUDES(m_mutex);
void profileDump(const char* filenamep, vluint64_t tickStart, vluint64_t tickEnd)
VL_MT_SAFE_EXCLUDES(m_mutex);
// In profiling mode, each executing thread must call
// this once to setup profiling state:
void setupProfilingClientThread() VL_MT_SAFE_EXCLUDES(m_mutex);
void tearDownProfilingClientThread();
private:
VL_UNCOPYABLE(VlThreadPool);


@ -438,7 +438,7 @@ using ssize_t = uint32_t; ///< signed size_t; returned from read()
#if defined(__i386__) || defined(__x86_64__)
// The vluint64_t argument is loaded with a high-performance counter for profiling
// or 0x0 if not implemented on this platform
#define VL_RDTSC(val) \
#define VL_GET_CPU_TICK(val) \
{ \
vluint32_t hi, lo; \
asm volatile("rdtsc" : "=a"(lo), "=d"(hi)); \
@ -446,14 +446,14 @@ using ssize_t = uint32_t; ///< signed size_t; returned from read()
}
#elif defined(__aarch64__)
// 1 GHz virtual system timer on SBSA level 5 compliant systems, else often 100 MHz
# define VL_RDTSC(val) \
# define VL_GET_CPU_TICK(val) \
{ \
asm volatile("isb" : : : "memory"); \
asm volatile("mrs %[rt],CNTVCT_EL0" : [rt] "=r"(val)); \
}
#else
// We just silently ignore unknown OSes, as this only leads to missing statistics
# define VL_RDTSC(val) (val) = 0;
# define VL_GET_CPU_TICK(val) (val) = 0;
#endif
//=========================================================================
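
For orientation, the renamed macro writes the current CPU tick counter into its lvalue argument (or 0 where the platform is not supported), so an interval is measured by sampling it twice. A minimal sketch, assuming only what the macro definition above shows:

#include "verilatedos.h"

// Sketch only: measure a region in raw CPU ticks via the renamed macro.
// On platforms where the macro is stubbed out, both samples are 0.
static vluint64_t exampleElapsedTicks() {
    vluint64_t tickBegin, tickEnd;
    VL_GET_CPU_TICK(tickBegin);
    // ... region being measured ...
    VL_GET_CPU_TICK(tickEnd);
    return tickEnd - tickBegin;
}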

View File

@ -63,7 +63,7 @@ class EmitCGatherDependencies final : VNVisitor {
UASSERT_OBJ(selfPointer.find("vlSymsp") != string::npos, nodep,
"Unknown self pointer: '" << selfPointer << "'");
// Dereferencing vlSymsp, so we need its definition...
m_dependencies.insert(EmitCBaseVisitor::symClassName());
addSymsDependency();
}
}
@ -117,9 +117,7 @@ class EmitCGatherDependencies final : VNVisitor {
iterateChildrenConst(nodep);
}
virtual void visit(AstNodeSimpleText* nodep) override {
if (nodep->text().find("vlSymsp") != string::npos) {
m_dependencies.insert(EmitCBaseVisitor::symClassName());
}
if (nodep->text().find("vlSymsp") != string::npos) addSymsDependency();
iterateChildrenConst(nodep);
}
virtual void visit(AstNode* nodep) override { iterateChildrenConst(nodep); }

View File

@ -178,6 +178,9 @@ class CMakeEmitter final {
if (v3Global.opt.mtasks()) {
global.emplace_back("${VERILATOR_ROOT}/include/verilated_threads.cpp");
}
if (v3Global.opt.usesProfiler()) {
global.emplace_back("${VERILATOR_ROOT}/include/verilated_profiler.cpp");
}
if (!v3Global.opt.libCreate().empty()) {
global.emplace_back(v3Global.opt.makeDir() + "/" + v3Global.opt.libCreate() + ".cpp");
}

View File

@ -330,21 +330,14 @@ class EmitCModel final : public EmitCFunc {
if (initial)
puts(topModNameProtected + "__" + protect("_eval_settle") + "(&(vlSymsp->TOP));\n");
const string recName = "__Vprfloop";
if (v3Global.opt.profThreads() && !initial) {
puts("VlProfileRec* " + recName + " = nullptr;\n");
// Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
puts("if (VL_UNLIKELY(vlSymsp->__Vm_profile_cycle_start)) {\n");
// Eval start
puts(/**/ recName + " = vlSymsp->__Vm_threadPoolp->profileAppend();\n");
puts(/**/ recName + "->startEvalLoop(VL_RDTSC_Q());\n");
puts("}\n");
if (v3Global.opt.profExec() && !initial) {
puts("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalLoopBegin();\n");
}
puts(topModNameProtected + "__" + protect("_eval") + "(&(vlSymsp->TOP));\n");
if (v3Global.opt.profThreads() && !initial) {
puts("if (VL_UNLIKELY(" + recName + ")) " + recName + "->endRecord(VL_RDTSC_Q());\n");
if (v3Global.opt.profExec() && !initial) {
puts("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalLoopEnd();\n");
}
if (v3Global.rootp()->changeRequest()) {
@ -434,61 +427,9 @@ class EmitCModel final : public EmitCFunc {
puts("Verilated::mtaskId(" + cvtToStr(mtaskId) + ");\n");
}
if (v3Global.opt.profThreads()) {
puts("if (VL_UNLIKELY((vlSymsp->_vm_contextp__->profThreadsStart() != "
"vlSymsp->__Vm_profile_time_finished)\n");
puts(" && (VL_TIME_Q() > vlSymsp->_vm_contextp__->profThreadsStart())\n");
puts(" && (vlSymsp->_vm_contextp__->profThreadsWindow() >= 1))) {\n");
// Within a profile (either starting, middle, or end)
puts(/**/ "if (vlSymsp->__Vm_profile_window_ct == 0) {\n"); // Opening file?
puts(/**/ "VL_DEBUG_IF(VL_DBG_MSGF(\"+ profile start warmup\\n\"););\n");
// Start profile on this cycle. We'll capture a window worth, then
// only analyze the next window worth. The idea is that the first window
// capture will hit some cache-cold stuff (eg printf) but it'll be warm
// by the time we hit the second window, we hope.
puts(/****/ "vlSymsp->__Vm_profile_cycle_start = VL_RDTSC_Q();\n");
// "* 2" as first half is warmup, second half is collection
puts(/****/ "vlSymsp->__Vm_profile_window_ct"
" = vlSymsp->_vm_contextp__->profThreadsWindow()"
" * 2 + 1;\n");
puts(/**/ "}\n");
puts(/**/ "--(vlSymsp->__Vm_profile_window_ct);\n");
puts(/**/ "if (vlSymsp->__Vm_profile_window_ct"
" == vlSymsp->_vm_contextp__->profThreadsWindow()) {\n");
// This barrier record in every threads' profile demarcates the
// cache-warm-up cycles before the barrier from the actual profile
// cycles afterward.
puts(/****/ "vlSymsp->__Vm_threadPoolp->profileAppendAll(");
puts(/****/ "VlProfileRec{VlProfileRec::Barrier{}});\n");
puts(/****/ "vlSymsp->__Vm_profile_cycle_start = VL_RDTSC_Q();\n");
puts(/**/ "}\n");
// Ending trace file?
puts(/**/ "else if (vlSymsp->__Vm_profile_window_ct == 0) {\n");
puts(/****/ "vluint64_t tick_end = VL_RDTSC_Q();\n");
puts(/****/ "VL_DEBUG_IF(VL_DBG_MSGF(\"+ profile end\\n\"););\n");
puts(/****/ "vlSymsp->__Vm_threadPoolp->profileDump("
"vlSymsp->_vm_contextp__->profThreadsFilename().c_str(), "
"vlSymsp->__Vm_profile_cycle_start, "
"tick_end);\n");
// This turns off the test to enter the profiling code, but still
// allows the user to collect another profile by changing
// profThreadsStart
puts(/****/ "vlSymsp->__Vm_profile_time_finished = "
"vlSymsp->_vm_contextp__->profThreadsStart();\n");
puts(/****/ "vlSymsp->__Vm_profile_cycle_start = 0;\n");
puts(/**/ "}\n");
puts("}\n");
}
const string recName = "__Vprfeval";
if (v3Global.opt.profThreads()) {
puts("VlProfileRec* " + recName + " = nullptr;\n");
// Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
puts("if (VL_UNLIKELY(vlSymsp->__Vm_profile_cycle_start)) {\n");
// Eval start
puts(/**/ recName + " = vlSymsp->__Vm_threadPoolp->profileAppend();\n");
puts(/**/ recName + "->startEval(VL_RDTSC_Q());\n");
puts("}\n");
if (v3Global.opt.profExec()) {
puts("vlSymsp->__Vm_executionProfiler.configure(*(vlSymsp->_vm_contextp__));\n");
puts("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalBegin();\n");
}
emitSettleLoop(modp, /* initial: */ false);
@ -499,10 +440,7 @@ class EmitCModel final : public EmitCFunc {
}
if (v3Global.opt.threads()) puts("Verilated::endOfEval(vlSymsp->__Vm_evalMsgQp);\n");
if (v3Global.opt.profThreads()) {
// End eval record
puts("if (VL_UNLIKELY(" + recName + ")) " + recName + "->endRecord(VL_RDTSC_Q());\n");
}
if (v3Global.opt.profExec()) puts("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalEnd();\n");
puts("}\n");
}
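
Taken together, the emitter changes above reduce the profiling code in the generated eval wrapper to a few event calls. A hedged sketch of the statement sequence the emitter now produces when --prof-exec is enabled (the mangled _eval name is illustrative, and the surrounding settle-loop structure is elided):

// Illustrative generated-code fragment only; follows the puts() strings above.
vlSymsp->__Vm_executionProfiler.configure(*(vlSymsp->_vm_contextp__));
VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalBegin();
// ... each pass of the settle loop is wrapped as:
VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalLoopBegin();
Vtop___024root___eval(&(vlSymsp->TOP));    // illustrative mangled name
VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalLoopEnd();
// ... after the settle loop completes:
VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalEnd();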

View File

@ -395,7 +395,7 @@ void EmitCSyms::emitSymHdr() {
if (v3Global.needTraceDumper()) {
puts("#include \"" + v3Global.opt.traceSourceLang() + ".h\"\n");
}
if (v3Global.opt.profThreads()) puts("#include \"verilated_profiler.h\"\n");
if (v3Global.opt.usesProfiler()) puts("#include \"verilated_profiler.h\"\n");
puts("\n// INCLUDE MODEL CLASS\n");
puts("\n#include \"" + topClassName() + ".h\"\n");
@ -445,18 +445,15 @@ void EmitCSyms::emitSymHdr() {
}
puts("bool __Vm_didInit = false;\n");
if (v3Global.opt.profExec()) {
puts("\n// EXECUTION PROFILING\n");
puts("VlExecutionProfiler __Vm_executionProfiler;\n");
}
if (v3Global.opt.mtasks()) {
puts("\n// MULTI-THREADING\n");
puts("VlThreadPool* const __Vm_threadPoolp;\n");
puts("bool __Vm_even_cycle = false;\n");
if (v3Global.opt.profThreads()) {
// rdtsc() at current cycle start
puts("vluint64_t __Vm_profile_cycle_start = 0;\n");
// Time we finished analysis
puts("vluint64_t __Vm_profile_time_finished = 0;\n");
// Track our position in the cache warmup and actual profile window
puts("vluint32_t __Vm_profile_window_ct = 0;\n");
}
}
puts("\n// MODULE INSTANCE STATE\n");
@ -477,8 +474,8 @@ void EmitCSyms::emitSymHdr() {
puts("];\n");
}
if (v3Global.opt.profThreads()) {
puts("\n// PROFILING\n");
if (v3Global.opt.profPgo()) {
puts("\n// PGO PROFILING\n");
vluint64_t maxProfilerId = 0;
if (v3Global.opt.mtasks()) {
for (const V3GraphVertex* vxp
@ -490,7 +487,7 @@ void EmitCSyms::emitSymHdr() {
}
}
++maxProfilerId; // As size must include 0
puts("VerilatedProfiler<" + cvtToStr(maxProfilerId) + "> _vm_profiler;\n");
puts("VlPgoProfiler<" + cvtToStr(maxProfilerId) + "> _vm_pgoProfiler;\n");
}
if (!m_scopeNames.empty()) { // Scope names
@ -682,8 +679,8 @@ void EmitCSyms::emitSymImp() {
puts("if (__Vm_dumping) _traceDumpClose();\n");
puts("#endif // VM_TRACE\n");
}
if (v3Global.opt.profThreads()) {
puts("_vm_profiler.write(\"" + topClassName()
if (v3Global.opt.profPgo()) {
puts("_vm_pgoProfiler.write(\"" + topClassName()
+ "\", _vm_contextp__->profVltFilename());\n");
}
if (v3Global.opt.mtasks()) puts("delete __Vm_threadPoolp;\n");
@ -719,8 +716,8 @@ void EmitCSyms::emitSymImp() {
// that calls eval() becomes the final Nth thread for the
// duration of the eval call.
puts(" , __Vm_threadPoolp{new VlThreadPool{_vm_contextp__, "
+ cvtToStr(v3Global.opt.threads() - 1) + ", " + cvtToStr(v3Global.opt.profThreads())
+ "}}\n");
+ cvtToStr(v3Global.opt.threads() - 1) + ", "
+ (v3Global.opt.profExec() ? "&__Vm_executionProfiler" : "nullptr") + "}}\n");
}
puts(" // Setup module instances\n");
@ -741,14 +738,14 @@ void EmitCSyms::emitSymImp() {
}
puts("{\n");
if (v3Global.opt.profThreads()) {
puts("// Configure profiling\n");
if (v3Global.opt.profPgo()) {
puts("// Configure profiling for PGO\n");
if (v3Global.opt.mtasks()) {
for (const V3GraphVertex* vxp
= v3Global.rootp()->execGraphp()->depGraphp()->verticesBeginp();
vxp; vxp = vxp->verticesNextp()) {
ExecMTask* const mtp = dynamic_cast<ExecMTask*>(const_cast<V3GraphVertex*>(vxp));
puts("_vm_profiler.addCounter(" + cvtToStr(mtp->profilerId()) + ", \""
puts("_vm_pgoProfiler.addCounter(" + cvtToStr(mtp->profilerId()) + ", \""
+ mtp->hashName() + "\");\n");
}
}
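
The loop above registers one PGO counter per mtask in the generated symbol table constructor, keyed by the mtask's hash name. A hedged sketch of the resulting generated code (the template size, counter ids, and hash names are illustrative):

// Illustrative generated-code fragment only.
// In the syms header, sized to the maximum profilerId + 1:
//   VlPgoProfiler<13> _vm_pgoProfiler;
// In the syms constructor body:
_vm_pgoProfiler.addCounter(11, "mtask11_hash");    // illustrative id and hash name
_vm_pgoProfiler.addCounter(12, "mtask12_hash");
// In the syms destructor, the counters are written out for later --prof-pgo use:
//   _vm_pgoProfiler.write("Vtop", _vm_contextp__->profVltFilename());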

View File

@ -112,6 +112,9 @@ public:
}
}
if (v3Global.opt.mtasks()) putMakeClassEntry(of, "verilated_threads.cpp");
if (v3Global.opt.usesProfiler()) {
putMakeClassEntry(of, "verilated_profiler.cpp");
}
} else if (support == 2 && slow) {
} else {
for (AstNodeFile* nodep = v3Global.rootp()->filesp(); nodep;
@ -189,6 +192,7 @@ public:
of.puts("# User CFLAGS (from -CFLAGS on Verilator command line)\n");
of.puts("VM_USER_CFLAGS = \\\n");
if (!v3Global.opt.libCreate().empty()) of.puts("\t-fPIC \\\n");
if (v3Global.opt.usesProfiler()) of.puts("\t-DVL_PROFILER \\\n");
const V3StringList& cFlags = v3Global.opt.cFlags();
for (const string& i : cFlags) of.puts("\t" + i + " \\\n");
of.puts("\n");

View File

@ -1236,7 +1236,13 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
DECL_OPTION("-prof-cfuncs", CbCall, [this]() { m_profC = m_profCFuncs = true; });
DECL_OPTION("-profile-cfuncs", CbCall,
[this]() { m_profC = m_profCFuncs = true; }); // Renamed
DECL_OPTION("-prof-threads", OnOff, &m_profThreads);
DECL_OPTION("-prof-exec", OnOff, &m_profExec);
DECL_OPTION("-prof-pgo", OnOff, &m_profPgo);
DECL_OPTION("-prof-threads", CbOnOff, [this, fl](bool flag) {
fl->v3warn(DEPRECATED, "Option --prof-threads is deprecated. "
"Use --prof-exec and --prof-pgo instead.");
m_profExec = m_profPgo = flag;
});
DECL_OPTION("-protect-ids", OnOff, &m_protectIds);
DECL_OPTION("-protect-key", Set, &m_protectKey);
DECL_OPTION("-protect-lib", CbVal, [this](const char* valp) {

View File

@ -255,7 +255,8 @@ private:
bool m_ppComments = false; // main switch: --pp-comments
bool m_profC = false; // main switch: --prof-c
bool m_profCFuncs = false; // main switch: --prof-cfuncs
bool m_profThreads = false; // main switch: --prof-threads
bool m_profExec = false; // main switch: --prof-exec
bool m_profPgo = false; // main switch: --prof-pgo
bool m_protectIds = false; // main switch: --protect-ids
bool m_public = false; // main switch: --public
bool m_publicFlatRW = false; // main switch: --public-flat-rw
@ -468,7 +469,9 @@ public:
bool ppComments() const { return m_ppComments; }
bool profC() const { return m_profC; }
bool profCFuncs() const { return m_profCFuncs; }
bool profThreads() const { return m_profThreads; }
bool profExec() const { return m_profExec; }
bool profPgo() const { return m_profPgo; }
bool usesProfiler() const { return profExec() || profPgo(); }
bool protectIds() const { return m_protectIds; }
bool allPublic() const { return m_public; }
bool publicFlatRW() const { return m_publicFlatRW; }

View File

@ -2918,43 +2918,39 @@ static void addMTaskToFunction(const ThreadSchedule& schedule, const uint32_t th
addStrStmt("vlSelf->" + name + +".waitUntilUpstreamDone(even_cycle);\n");
}
string recName;
if (v3Global.opt.profThreads()) {
recName = "__Vprfthr_" + cvtToStr(mtaskp->id());
addStrStmt("VlProfileRec* " + recName + " = nullptr;\n");
// Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
addStrStmt("if (VL_UNLIKELY(vlSymsp->__Vm_profile_cycle_start)) {\n" + //
recName + " = vlSymsp->__Vm_threadPoolp->profileAppend();\n" + //
recName + "->startRecord(VL_RDTSC_Q()," + //
" " + cvtToStr(mtaskp->id()) + "," + //
" " + cvtToStr(mtaskp->predictStart()) + "," + //
" " + cvtToStr(mtaskp->cost()) + ");\n" + //
"}\n");
if (v3Global.opt.profExec()) {
const string& id = cvtToStr(mtaskp->id());
const string& predictStart = cvtToStr(mtaskp->predictStart());
addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskBegin(" + id + ", " + predictStart
+ ");\n");
}
if (v3Global.opt.profThreads()) {
if (v3Global.opt.profPgo()) {
// No lock around startCounter, as counter numbers are unique per thread
addStrStmt("vlSymsp->_vm_profiler.startCounter(" + cvtToStr(mtaskp->profilerId())
addStrStmt("vlSymsp->_vm_pgoProfiler.startCounter(" + cvtToStr(mtaskp->profilerId())
+ ");\n");
}
//
addStrStmt("Verilated::mtaskId(" + cvtToStr(mtaskp->id()) + ");\n");
// Move the the actual body of calls to leaf functions into this function
// Move the actual body of calls to leaf functions into this function
funcp->addStmtsp(mtaskp->bodyp()->unlinkFrBack());
if (v3Global.opt.profThreads()) {
// No lock around stopCounter, as counter numbers are unique per thread
addStrStmt("vlSymsp->_vm_profiler.stopCounter(" + cvtToStr(mtaskp->profilerId()) + ");\n");
}
if (v3Global.opt.profThreads()) {
addStrStmt("if (VL_UNLIKELY(" + recName + ")) " //
+ recName + "->endRecord(VL_RDTSC_Q());\n");
}
// Flush message queue
addStrStmt("Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);\n");
if (v3Global.opt.profPgo()) {
// No lock around stopCounter, as counter numbers are unique per thread
addStrStmt("vlSymsp->_vm_pgoProfiler.stopCounter(" + cvtToStr(mtaskp->profilerId())
+ ");\n");
}
if (v3Global.opt.profExec()) {
const string& id = cvtToStr(mtaskp->id());
const string& predictConst = cvtToStr(mtaskp->cost());
addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskEnd(" + id + ", " + predictConst
+ ");\n");
}
// For any dependent mtask that's on another thread, signal one dependency completion.
for (V3GraphEdge* edgep = mtaskp->outBeginp(); edgep; edgep = edgep->outNextp()) {
const ExecMTask* const nextp = dynamic_cast<ExecMTask*>(edgep->top());
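
For reference, the statements that addMTaskToFunction now wraps around each mtask body give the emitted mtask functions roughly the following shape. A hedged sketch using mtask id 7 with predicted start 30 and predicted cost 30 (values taken from the golden profile data further below); the profilerId of 12 is illustrative, and the --prof-exec and --prof-pgo halves are emitted independently, so either may be absent:

// Illustrative generated-code fragment only.
VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskBegin(7, 30);     // --prof-exec: id, predictStart
vlSymsp->_vm_pgoProfiler.startCounter(12);               // --prof-pgo (profilerId illustrative)
Verilated::mtaskId(7);
// ... mtask body: the calls to leaf functions moved into this function ...
Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);    // flush message queue
vlSymsp->_vm_pgoProfiler.stopCounter(12);
VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskEnd(7, 30);       // --prof-exec: id, predictCost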

View File

@ -9,9 +9,8 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
# Test for bin/verilator_gantt,
#
# Only needed in multithreaded regression.
scenarios(vltmt => 1);
scenarios(vlt_all => 1);
# It doesn't really matter what test
# we use, so long as it runs several cycles,
@ -20,13 +19,13 @@ top_filename("t/t_gen_alw.v");
compile(
# Checks below care about thread count, so use 2 (minimum reasonable)
v_flags2 => ["--prof-threads --threads 2"]
v_flags2 => ["--prof-exec", ($Self->{vltmt} ? "--threads 2" : "")]
);
execute(
all_run_flags => ["+verilator+prof+threads+start+2",
" +verilator+prof+threads+window+2",
" +verilator+prof+threads+file+$Self->{obj_dir}/profile_threads.dat",
all_run_flags => ["+verilator+prof+exec+start+2",
" +verilator+prof+exec+window+2",
" +verilator+prof+exec+file+$Self->{obj_dir}/profile_exec.dat",
" +verilator+prof+vlt+file+$Self->{obj_dir}/profile.vlt",
],
check_finished => 1,
@ -37,17 +36,22 @@ execute(
# The profiling data still goes direct to the runtime's STDOUT
# (maybe that should go to a separate file - gantt.dat?)
run(cmd => ["$ENV{VERILATOR_ROOT}/bin/verilator_gantt",
"$Self->{obj_dir}/profile_threads.dat",
"--vcd $Self->{obj_dir}/profile_threads.vcd",
"$Self->{obj_dir}/profile_exec.dat",
"--vcd $Self->{obj_dir}/profile_exec.vcd",
"| tee $Self->{obj_dir}/gantt.log"],
);
file_grep("$Self->{obj_dir}/gantt.log", qr/Total threads += 2/i);
file_grep("$Self->{obj_dir}/gantt.log", qr/Total mtasks += 7/i);
if ($Self->{vltmt}) {
file_grep("$Self->{obj_dir}/gantt.log", qr/Total threads += 2/i);
file_grep("$Self->{obj_dir}/gantt.log", qr/Total mtasks += 7/i);
} else {
file_grep("$Self->{obj_dir}/gantt.log", qr/Total threads += 1/i);
file_grep("$Self->{obj_dir}/gantt.log", qr/Total mtasks += 0/i);
}
file_grep("$Self->{obj_dir}/gantt.log", qr/Total evals += 2/i);
# Diff to itself, just to check parsing
vcd_identical("$Self->{obj_dir}/profile_threads.vcd", "$Self->{obj_dir}/profile_threads.vcd");
vcd_identical("$Self->{obj_dir}/profile_exec.vcd", "$Self->{obj_dir}/profile_exec.vcd");
ok(1);
1;

View File

@ -1,8 +1,8 @@
VLPROFTHREAD 1.1 # Verilator thread profile dump version 1.1
VLPROF arg --threads 2
VLPROF arg +verilator+prof+threads+start+2
VLPROF arg +verilator+prof+threads+window+2
VLPROFVERSION 2.0
VLPROF arg +verilator+prof+exec+start+2
VLPROF arg +verilator+prof+exec+window+2
VLPROF stat yields 0
VLPROF stat threads 2
VLPROFPROC processor : 0
VLPROFPROC vendor_id : AuthenticTest
VLPROFPROC cpu family : 23
@ -899,22 +899,42 @@ VLPROFPROC cache_alignment : 64
VLPROFPROC address sizes : 43 bits physical, 48 bits virtual
VLPROFPROC power management: ts ttp tm hwpstate cpb eff_freq_ro [13] [14]
VLPROFPROC
VLPROF eval start 595 elapsed 11655 cpu 19 on thread 1
VLPROF eval_loop start 945 elapsed 11235 cpu 19 on thread 1
VLPROF mtask 6 start 2695 elapsed 210 predict_start 0 predict_cost 30 cpu 19 on thread 1
VLPROF mtask 10 start 9695 elapsed 175 predict_start 196 predict_cost 30 cpu 19 on thread 1
VLPROF eval start 13720 elapsed 8610 cpu 19 on thread 1
VLPROF eval_loop start 14000 elapsed 8085 cpu 19 on thread 1
VLPROF mtask 6 start 15610 elapsed 210 predict_start 0 predict_cost 30 cpu 19 on thread 1
VLPROF mtask 10 start 21700 elapsed 175 predict_start 196 predict_cost 30 cpu 19 on thread 1
VLPROF mtask 5 start 5495 elapsed 595 predict_start 0 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 7 start 6300 elapsed 595 predict_start 30 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 8 start 7490 elapsed 1050 predict_start 60 predict_cost 107 cpu 10 on thread 2
VLPROF mtask 9 start 9135 elapsed 595 predict_start 167 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 11 start 10255 elapsed 805 predict_start 197 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 5 start 18375 elapsed 595 predict_start 0 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 7 start 19145 elapsed 175 predict_start 30 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 8 start 19670 elapsed 140 predict_start 60 predict_cost 107 cpu 10 on thread 2
VLPROF mtask 9 start 20650 elapsed 70 predict_start 167 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 11 start 21140 elapsed 105 predict_start 197 predict_cost 30 cpu 10 on thread 2
VLPROFTHREAD 0
VLPROFEXEC EVAL_BEGIN 595
VLPROFEXEC EVAL_LOOP_BEGIN 945
VLPROFEXEC MTASK_BEGIN 2695 id 6 predictStart 0 cpu 19
VLPROFEXEC MTASK_END 2905 id 6 predictCost 30
VLPROFEXEC MTASK_BEGIN 9695 id 10 predictStart 196 cpu 19
VLPROFEXEC MTASK_END 9870 id 10 predictCost 30
VLPROFEXEC EVAL_LOOP_END 12180
VLPROFEXEC EVAL_END 12250
VLPROFEXEC EVAL_BEGIN 13720
VLPROFEXEC EVAL_LOOP_BEGIN 14000
VLPROFEXEC MTASK_BEGIN 15610 id 6 predictStart 0 cpu 19
VLPROFEXEC MTASK_END 15820 id 6 predictCost 30
VLPROFEXEC MTASK_BEGIN 21700 id 10 predictStart 196 cpu 19
VLPROFEXEC MTASK_END 21875 id 10 predictCost 30
VLPROFEXEC EVAL_LOOP_END 22085
VLPROFEXEC EVAL_END 22330
VLPROFTHREAD 1
VLPROFEXEC MTASK_BEGIN 5495 id 5 predictStart 0 cpu 10
VLPROFEXEC MTASK_END 6090 id 5 predictCost 30
VLPROFEXEC MTASK_BEGIN 6300 id 7 predictStart 30 cpu 10
VLPROFEXEC MTASK_END 6895 id 7 predictCost 30
VLPROFEXEC MTASK_BEGIN 7490 id 8 predictStart 60 cpu 10
VLPROFEXEC MTASK_END 8540 id 8 predictCost 107
VLPROFEXEC MTASK_BEGIN 9135 id 9 predictStart 167 cpu 10
VLPROFEXEC MTASK_END 9730 id 9 predictCost 30
VLPROFEXEC MTASK_BEGIN 10255 id 11 predictStart 197 cpu 10
VLPROFEXEC MTASK_END 11060 id 11 predictCost 30
VLPROFEXEC MTASK_BEGIN 18375 id 5 predictStart 0 cpu 10
VLPROFEXEC MTASK_END 18970 id 5 predictCost 30
VLPROFEXEC MTASK_BEGIN 19145 id 7 predictStart 30 cpu 10
VLPROFEXEC MTASK_END 19320 id 7 predictCost 30
VLPROFEXEC MTASK_BEGIN 19670 id 8 predictStart 60 cpu 10
VLPROFEXEC MTASK_END 19810 id 8 predictCost 107
VLPROFEXEC MTASK_BEGIN 20650 id 9 predictStart 167 cpu 10
VLPROFEXEC MTASK_END 20720 id 9 predictCost 30
VLPROFEXEC MTASK_BEGIN 21140 id 11 predictStart 197 cpu 10
VLPROFEXEC MTASK_END 21245 id 11 predictCost 30
VLPROF stat ticks 23415

View File

@ -1,9 +1,8 @@
Verilator Gantt report
Argument settings:
+verilator+prof+threads+start+2
+verilator+prof+threads+window+2
--threads 2
+verilator+prof+exec+start+2
+verilator+prof+exec+window+2
Analysis:
Total threads = 2
@ -23,7 +22,7 @@ Prediction (what Verilator used for scheduling):
All-thread efficiency = 63.2%
All-thread speedup = 1.3
Statistics:
MTask statistics:
min log(p2e) = -3.681 from mtask 5 (predict 30, elapsed 1190)
max log(p2e) = -2.409 from mtask 8 (predict 107, elapsed 1190)
mean = -2.992
@ -34,4 +33,4 @@ CPUs:
cpu 10: cpu_time=4725 socket=0 core=10 Test Ryzen 9 3950X 16-Core Processor
cpu 19: cpu_time=770 socket=0 core=3 Test Ryzen 9 3950X 16-Core Processor
Writing profile_threads.vcd
Writing profile_exec.vcd

View File

@ -16,7 +16,7 @@ run(cmd => ["cd $Self->{obj_dir} && $ENV{VERILATOR_ROOT}/bin/verilator_gantt"
files_identical("$Self->{obj_dir}/gantt.log", $Self->{golden_filename});
vcd_identical("$Self->{obj_dir}/profile_threads.vcd", "$Self->{t_dir}/$Self->{name}.vcd.out");
vcd_identical("$Self->{obj_dir}/profile_exec.vcd", "$Self->{t_dir}/$Self->{name}.vcd.out");
ok(1);
1;

View File

@ -15,8 +15,8 @@ $timescale 1ns $end
$var wire 32 vc eval_loop [31:0] $end
$upscope $end
$scope module measured_threads $end
$var wire 32 v0 thread1_mtask [31:0] $end
$var wire 32 v4 thread2_mtask [31:0] $end
$var wire 32 v0 thread0_mtask [31:0] $end
$var wire 32 v4 thread1_mtask [31:0] $end
$upscope $end
$scope module mtasks $end
$var wire 32 v3 mtask10_cpu [31:0] $end
@ -28,8 +28,8 @@ $timescale 1ns $end
$var wire 32 v9 mtask9_cpu [31:0] $end
$upscope $end
$scope module predicted_threads $end
$var wire 32 vd predicted_thread1_mtask [31:0] $end
$var wire 32 ve predicted_thread2_mtask [31:0] $end
$var wire 32 vd predicted_thread0_mtask [31:0] $end
$var wire 32 ve predicted_thread1_mtask [31:0] $end
$upscope $end
$upscope $end
$enddefinitions $end
@ -65,7 +65,7 @@ b111 ve
b1 v10
#2695
b110 v0
b1 v1
b0 v1
b10011 v2
b1 vf
#2905
@ -78,7 +78,7 @@ b1000 ve
b1 v10
#5495
b101 v4
b10 v5
b1 v5
b1010 v6
b1 vf
#6090
@ -88,7 +88,7 @@ bz v6
b0 vf
#6300
b111 v4
b10 v5
b1 v5
b1010 v7
b1 vf
#6895
@ -98,7 +98,7 @@ bz v7
b0 vf
#7490
b1000 v4
b10 v5
b1 v5
b1010 v8
b1 vf
#8540
@ -108,7 +108,7 @@ bz v8
b0 vf
#9135
b1001 v4
b10 v5
b1 v5
b1010 v9
b1 vf
#9210
@ -116,7 +116,7 @@ b1001 ve
b1 v10
#9695
b1010 v0
b1 v1
b0 v1
b10011 v3
b10 vf
#9730
@ -131,7 +131,7 @@ bz v3
b0 vf
#10255
b1011 v4
b10 v5
b1 v5
b1010 va
b1 vf
#10645
@ -167,7 +167,7 @@ b111 ve
b1 v10
#15610
b110 v0
b1 v1
b0 v1
b10011 v2
b1 vf
#15820
@ -180,7 +180,7 @@ b1000 ve
b1 v10
#18375
b101 v4
b10 v5
b1 v5
b1010 v6
b1 vf
#18970
@ -190,7 +190,7 @@ bz v6
b0 vf
#19145
b111 v4
b10 v5
b1 v5
b1010 v7
b1 vf
#19320
@ -200,7 +200,7 @@ bz v7
b0 vf
#19670
b1000 v4
b10 v5
b1 v5
b1010 v8
b1 vf
#19810
@ -213,7 +213,7 @@ b1001 ve
b1 v10
#20650
b1001 v4
b10 v5
b1 v5
b1010 v9
b1 vf
#20720
@ -229,7 +229,7 @@ b1011 ve
b10 v10
#21140
b1011 v4
b10 v5
b1 v5
b1010 va
b1 vf
#21245
@ -239,7 +239,7 @@ bz va
b0 vf
#21700
b1010 v0
b1 v1
b0 v1
b10011 v3
b1 vf
#21875

View File

@ -1,7 +1,7 @@
VLPROFTHREAD 1.1 # Verilator thread profile dump version 1.1
VLPROF arg --threads 4
VLPROF arg +verilator+prof+threads+start+1
VLPROF arg +verilator+prof+threads+window+2
VLPROFVERSION 2.0
VLPROF arg +verilator+prof+exec+start+1
VLPROF arg +verilator+prof+exec+window+2
VLPROF stat threads 2
VLPROF stat yields 51
VLPROFPROC processor : 0
VLPROFPROC model name : Phytium,FT-2500/128
@ -43,11 +43,20 @@ VLPROFPROC CPU variant : 0x1
VLPROFPROC CPU part : 0x663
VLPROFPROC CPU revision : 3
VLPROFPROC
VLPROF eval start 57709 elapsed 1745979 cpu 2 on thread 1
VLPROF eval_loop start 58532 elapsed 1744353 cpu 2 on thread 1
VLPROF mtask 85 start 90465 elapsed 64569 predict_start 14315 predict_cost 30533 cpu 2 on thread 1
VLPROF mtask 79 start 156555 elapsed 137754 predict_start 44848 predict_cost 48001 cpu 2 on thread 1
VLPROF mtask 90 start 77352 elapsed 1159 predict_start 14315 predict_cost 21592 cpu 3 on thread 2
VLPROF mtask 81 start 79799 elapsed 868 predict_start 35907 predict_cost 29215 cpu 3 on thread 2
VLPROF mtask 87 start 81746 elapsed 887 predict_start 65147 predict_cost 33809 cpu 3 on thread 2
VLPROFTHREAD 0
VLPROFEXEC EVAL_BEGIN 57709
VLPROFEXEC EVAL_LOOP_BEGIN 58532
VLPROFEXEC MTASK_BEGIN 90465 id 85 predictStart 14315 cpu 2
VLPROFEXEC MTASK_END 155034 id 85 predictCost 30533
VLPROFEXEC MTASK_BEGIN 156555 id 79 predictStart 44848 cpu 2
VLPROFEXEC MTASK_END 294309 id 79 predictCost 48001
VLPROFEXEC EVAL_LOOP_END 1802885
VLPROFEXEC EVAL_END 1803688
VLPROFTHREAD 1
VLPROFEXEC MTASK_BEGIN 77352 id 90 predictStart 14315 cpu 3
VLPROFEXEC MTASK_END 78511 id 90 predictCost 21592
VLPROFEXEC MTASK_BEGIN 79799 id 81 predictStart 35907 cpu 3
VLPROFEXEC MTASK_END 80667 id 81 predictCost 29215
VLPROFEXEC MTASK_BEGIN 81746 id 87 predictStart 65147 cpu 3
VLPROFEXEC MTASK_END 82633 id 87 predictCost 33809
VLPROF stat ticks 180832

View File

@ -1,9 +1,8 @@
Verilator Gantt report
Argument settings:
+verilator+prof+threads+start+1
+verilator+prof+threads+window+2
--threads 4
+verilator+prof+exec+start+1
+verilator+prof+exec+window+2
Analysis:
Total threads = 2
@ -23,7 +22,7 @@ Prediction (what Verilator used for scheduling):
All-thread efficiency = 82.4%
All-thread speedup = 1.6
Statistics:
MTask statistics:
min log(p2e) = -1.054 from mtask 79 (predict 48001, elapsed 137754)
max log(p2e) = 3.641 from mtask 87 (predict 33809, elapsed 887)
mean = 1.656
@ -34,4 +33,4 @@ CPUs:
cpu 2: cpu_time=202323 Phytium,FT-2500/128
cpu 3: cpu_time=2914 Phytium,FT-2500/128
Writing profile_threads.vcd
Writing profile_exec.vcd

View File

@ -1,24 +1,44 @@
VLPROFTHREAD 1.1 # Verilator thread profile dump version 1.1
VLPROF arg --threads 2
VLPROF arg +verilator+prof+threads+start+2
VLPROF arg +verilator+prof+threads+window+2
VLPROFVERSION 2.0
VLPROF arg +verilator+prof+exec+start+2
VLPROF arg +verilator+prof+exec+window+2
VLPROF stat threads 2
VLPROF stat yields 0
VLPROF eval start 595 elapsed 11655 cpu 19 on thread 1
VLPROF eval_loop start 945 elapsed 11235 cpu 19 on thread 1
VLPROF mtask 6 start 2695 elapsed 210 predict_start 0 predict_cost 30 cpu 19 on thread 1
VLPROF mtask 10 start 9695 elapsed 175 predict_start 196 predict_cost 30 cpu 19 on thread 1
VLPROF eval start 13720 elapsed 8610 cpu 19 on thread 1
VLPROF eval_loop start 14000 elapsed 8085 cpu 19 on thread 1
VLPROF mtask 6 start 15610 elapsed 210 predict_start 0 predict_cost 30 cpu 19 on thread 1
VLPROF mtask 10 start 21700 elapsed 175 predict_start 196 predict_cost 30 cpu 19 on thread 1
VLPROF mtask 5 start 5495 elapsed 595 predict_start 0 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 7 start 6300 elapsed 595 predict_start 30 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 8 start 7490 elapsed 1050 predict_start 60 predict_cost 107 cpu 10 on thread 2
VLPROF mtask 9 start 9135 elapsed 595 predict_start 167 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 11 start 10255 elapsed 805 predict_start 197 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 5 start 18375 elapsed 595 predict_start 0 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 7 start 19145 elapsed 175 predict_start 30 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 8 start 19670 elapsed 140 predict_start 60 predict_cost 107 cpu 10 on thread 2
VLPROF mtask 9 start 20650 elapsed 70 predict_start 167 predict_cost 30 cpu 10 on thread 2
VLPROF mtask 11 start 21140 elapsed 105 predict_start 197 predict_cost 30 cpu 10 on thread 2
VLPROFTHREAD 0
VLPROFEXEC EVAL_BEGIN 595
VLPROFEXEC EVAL_LOOP_BEGIN 945
VLPROFEXEC MTASK_BEGIN 2695 id 6 predictStart 0 cpu 19
VLPROFEXEC MTASK_END 2905 id 6 predictCost 30
VLPROFEXEC MTASK_BEGIN 9695 id 10 predictStart 196 cpu 19
VLPROFEXEC MTASK_END 9870 id 10 predictCost 30
VLPROFEXEC EVAL_LOOP_END 12180
VLPROFEXEC EVAL_END 12250
VLPROFEXEC EVAL_BEGIN 13720
VLPROFEXEC EVAL_LOOP_BEGIN 14000
VLPROFEXEC MTASK_BEGIN 15610 id 6 predictStart 0 cpu 19
VLPROFEXEC MTASK_END 15820 id 6 predictCost 30
VLPROFEXEC MTASK_BEGIN 21700 id 10 predictStart 196 cpu 19
VLPROFEXEC MTASK_END 21875 id 10 predictCost 30
VLPROFEXEC EVAL_LOOP_END 22085
VLPROFEXEC EVAL_END 22330
VLPROFTHREAD 1
VLPROFEXEC MTASK_BEGIN 5495 id 5 predictStart 0 cpu 10
VLPROFEXEC MTASK_END 6090 id 5 predictCost 30
VLPROFEXEC MTASK_BEGIN 6300 id 7 predictStart 30 cpu 10
VLPROFEXEC MTASK_END 6895 id 7 predictCost 30
VLPROFEXEC MTASK_BEGIN 7490 id 8 predictStart 60 cpu 10
VLPROFEXEC MTASK_END 8540 id 8 predictCost 107
VLPROFEXEC MTASK_BEGIN 9135 id 9 predictStart 167 cpu 10
VLPROFEXEC MTASK_END 9730 id 9 predictCost 30
VLPROFEXEC MTASK_BEGIN 10255 id 11 predictStart 197 cpu 10
VLPROFEXEC MTASK_END 11060 id 11 predictCost 30
VLPROFEXEC MTASK_BEGIN 18375 id 5 predictStart 0 cpu 10
VLPROFEXEC MTASK_END 18970 id 5 predictCost 30
VLPROFEXEC MTASK_BEGIN 19145 id 7 predictStart 30 cpu 10
VLPROFEXEC MTASK_END 19320 id 7 predictCost 30
VLPROFEXEC MTASK_BEGIN 19670 id 8 predictStart 60 cpu 10
VLPROFEXEC MTASK_END 19810 id 8 predictCost 107
VLPROFEXEC MTASK_BEGIN 20650 id 9 predictStart 167 cpu 10
VLPROFEXEC MTASK_END 20720 id 9 predictCost 30
VLPROFEXEC MTASK_BEGIN 21140 id 11 predictStart 197 cpu 10
VLPROFEXEC MTASK_END 21245 id 11 predictCost 30
VLPROF stat ticks 23415

View File

@ -1,9 +1,8 @@
Verilator Gantt report
Argument settings:
+verilator+prof+threads+start+2
+verilator+prof+threads+window+2
--threads 2
+verilator+prof+exec+start+2
+verilator+prof+exec+window+2
Analysis:
Total threads = 2
@ -23,7 +22,7 @@ Prediction (what Verilator used for scheduling):
All-thread efficiency = 63.2%
All-thread speedup = 1.3
Statistics:
MTask statistics:
min log(p2e) = -3.681 from mtask 5 (predict 30, elapsed 1190)
max log(p2e) = -2.409 from mtask 8 (predict 107, elapsed 1190)
mean = -2.992

View File

@ -118,9 +118,9 @@ compile(
);
execute(
all_run_flags => ["+verilator+prof+threads+start+100",
" +verilator+prof+threads+window+2",
" +verilator+prof+threads+file+$Self->{obj_dir}/profile_threads.dat",
all_run_flags => ["+verilator+prof+exec+start+100",
" +verilator+prof+exec+window+2",
" +verilator+prof+exec+file+$Self->{obj_dir}/profile_exec.dat",
" +verilator+prof+vlt+file+$Self->{obj_dir}/profile.vlt",
],
check_finished => 1,

View File

@ -14,12 +14,12 @@ scenarios(vltmt => 1);
top_filename("t/t_gen_alw.v");
compile(
v_flags2 => ["--prof-threads --threads 2"]
v_flags2 => ["--prof-pgo --threads 2"]
);
execute(
all_run_flags => ["+verilator+prof+threads+start+0",
" +verilator+prof+threads+file+/dev/null",
all_run_flags => ["+verilator+prof+exec+start+0",
" +verilator+prof+exec+file+/dev/null",
" +verilator+prof+vlt+file+$Self->{obj_dir}/profile.vlt",
],
check_finished => 1,
@ -28,8 +28,8 @@ execute(
file_grep("$Self->{obj_dir}/profile.vlt", qr/profile_data/i);
compile(
# Intentinally no --prof-threads here, so we make sure profile data
# can read in without it (that is no prof-thread effect on profile_data hash names)
# Intentionally no --prof-pgo here to make sure profile data can be read in
# without it (that is: --prof-pgo has no effect on profile_data hash names)
v_flags2 => ["--threads 2",
" $Self->{obj_dir}/profile.vlt"],
);

View File

@ -21,13 +21,12 @@ compile(
? "--threads 2 $root/include/verilated_threads.cpp" : ""),
($Self->cfg_with_threaded
? "--trace-threads 1" : ""),
($Self->cfg_with_threaded
? "--prof-threads" : ""),
"--prof-exec", "--prof-pgo",
"$root/include/verilated_save.cpp"],
);
execute(
all_run_flags => [" +verilator+prof+threads+file+/dev/null",
all_run_flags => [" +verilator+prof+exec+file+/dev/null",
" +verilator+prof+vlt+file+/dev/null",
],
check_finished => 1,

View File

@ -16,12 +16,12 @@ my $root = "..";
compile(
# Can't use --coverage and --savable together, so cheat and compile inline
verilator_flags2 => ["--cc --coverage-toggle --coverage-line --coverage-user --trace --vpi $root/include/verilated_save.cpp"],
verilator_flags2 => ["--cc --coverage-toggle --coverage-line --coverage-user --trace --prof-exec --prof-pgo --vpi $root/include/verilated_save.cpp"],
make_flags => 'DRIVER_STD=newest',
);
execute(
all_run_flags => [" +verilator+prof+threads+file+/dev/null",
all_run_flags => [" +verilator+prof+exec+file+/dev/null",
" +verilator+prof+vlt+file+/dev/null",
],
check_finished => 1,