forked from github/verilator
b1b5b5dfe2
The --prof-threads option has been split into two independent options: 1. --prof-exec, for collecting verilator_gantt and other execution related profiling data, and 2. --prof-pgo, for collecting data needed for PGO The implementation of execution profiling is extricated from VlThreadPool and is now a separate class VlExecutionProfiler. This means --prof-exec can now be used for single-threaded models (though it does not measure a lot of things just yet). For consistency VerilatedProfiler is renamed VlPgoProfiler. Both VlExecutionProfiler and VlPgoProfiler are in verilated_profiler.{h/cpp}, but can be used completely independently. Also re-worked the execution profile format so it now only emits events without holding onto any temporaries. This is in preparation for some future optimizations that would be hindered by the introduction of function locals via AstText. Also removed the Barrier event. Clearing the profile buffers is not notably more expensive as the profiling records are trivially destructible.
514 lines
20 KiB
Python
Executable File
514 lines
20 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# pylint: disable=C0103,C0114,C0116,C0301,R0914,R0912,R0915,W0511,eval-used
|
|
######################################################################
|
|
|
|
import argparse
|
|
import collections
|
|
import math
|
|
import re
|
|
import statistics
|
|
# from pprint import pprint
|
|
|
|
Threads = collections.defaultdict(lambda: collections.defaultdict(lambda: {}))
|
|
Mtasks = collections.defaultdict(lambda: {})
|
|
Evals = collections.defaultdict(lambda: {})
|
|
EvalLoops = collections.defaultdict(lambda: {})
|
|
Global = {
|
|
'args': {},
|
|
'cpuinfo': collections.defaultdict(lambda: {}),
|
|
'rdtsc_cycle_time': 0,
|
|
'stats': {}
|
|
}
|
|
|
|
######################################################################
|
|
|
|
|
|
def process(filename):
|
|
read_data(filename)
|
|
report()
|
|
|
|
|
|
def read_data(filename):
|
|
with open(filename) as fh:
|
|
re_thread = re.compile(r'^VLPROFTHREAD (\d+)$')
|
|
re_record = re.compile(r'^VLPROFEXEC (\S+) (\d+)(.*)$')
|
|
re_payload_mtaskBegin = re.compile(
|
|
r'id (\d+) predictStart (\d+) cpu (\d+)')
|
|
re_payload_mtaskEnd = re.compile(r'id (\d+) predictCost (\d+)')
|
|
|
|
re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.]*)\s*')
|
|
re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.]*)\s*$')
|
|
re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
|
|
re_time = re.compile(r'rdtsc time = (\d+) ticks')
|
|
re_proc_cpu = re.compile(r'VLPROFPROC processor\s*:\s*(\d+)\s*$')
|
|
re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$')
|
|
cpu = None
|
|
thread = None
|
|
|
|
lastEvalBeginTick = None
|
|
lastEvalLoopBeginTick = None
|
|
|
|
for line in fh:
|
|
recordMatch = re_record.match(line)
|
|
if recordMatch:
|
|
kind, tick, payload = recordMatch.groups()
|
|
tick = int(tick)
|
|
payload = payload.strip()
|
|
if kind == "EVAL_BEGIN":
|
|
Evals[tick]['start'] = tick
|
|
lastEvalBeginTick = tick
|
|
elif kind == "EVAL_END":
|
|
Evals[lastEvalBeginTick]['end'] = tick
|
|
lastEvalBeginTick = None
|
|
elif kind == "EVAL_LOOP_BEGIN":
|
|
EvalLoops[tick]['start'] = tick
|
|
lastEvalLoopBeginTick = tick
|
|
elif kind == "EVAL_LOOP_END":
|
|
EvalLoops[lastEvalLoopBeginTick]['end'] = tick
|
|
lastEvalLoopBeginTick = None
|
|
elif kind == "MTASK_BEGIN":
|
|
mtask, predict_start, ecpu = re_payload_mtaskBegin.match(
|
|
payload).groups()
|
|
mtask = int(mtask)
|
|
predict_start = int(predict_start)
|
|
ecpu = int(ecpu)
|
|
Threads[thread][tick]['mtask'] = mtask
|
|
Threads[thread][tick]['predict_start'] = predict_start
|
|
Threads[thread][tick]['cpu'] = ecpu
|
|
if 'elapsed' not in Mtasks[mtask]:
|
|
Mtasks[mtask] = {'end': 0, 'elapsed': 0}
|
|
Mtasks[mtask]['begin'] = tick
|
|
Mtasks[mtask]['thread'] = thread
|
|
Mtasks[mtask]['predict_start'] = predict_start
|
|
elif kind == "MTASK_END":
|
|
mtask, predict_cost = re_payload_mtaskEnd.match(
|
|
payload).groups()
|
|
mtask = int(mtask)
|
|
predict_cost = int(predict_cost)
|
|
begin = Mtasks[mtask]['begin']
|
|
Threads[thread][begin]['end'] = tick
|
|
Threads[thread][begin]['predict_cost'] = predict_cost
|
|
Mtasks[mtask]['elapsed'] += tick - begin
|
|
Mtasks[mtask]['predict_cost'] = predict_cost
|
|
Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], tick)
|
|
elif Args.debug:
|
|
print("-Unknown execution trace record: %s" % line)
|
|
elif re_thread.match(line):
|
|
thread = int(re_thread.match(line).group(1))
|
|
elif re.match(r'^VLPROF(THREAD|VERSION)', line):
|
|
pass
|
|
elif re_arg1.match(line):
|
|
match = re_arg1.match(line)
|
|
Global['args'][match.group(1)] = match.group(2)
|
|
elif re_arg2.match(line):
|
|
match = re_arg2.match(line)
|
|
Global['args'][match.group(1)] = match.group(2)
|
|
elif re_stat.match(line):
|
|
match = re_stat.match(line)
|
|
Global['stats'][match.group(1)] = match.group(2)
|
|
elif re_proc_cpu.match(line):
|
|
match = re_proc_cpu.match(line)
|
|
cpu = int(match.group(1))
|
|
elif cpu and re_proc_dat.match(line):
|
|
match = re_proc_dat.match(line)
|
|
term = match.group(1)
|
|
value = match.group(2)
|
|
term = re.sub(r'\s+$', '', term)
|
|
term = re.sub(r'\s+', '_', term)
|
|
value = re.sub(r'\s+$', '', value)
|
|
Global['cpuinfo'][cpu][term] = value
|
|
elif re.match(r'^#', line):
|
|
pass
|
|
elif Args.debug:
|
|
print("-Unk: %s" % line)
|
|
# TODO -- this is parsing text printed by a client.
|
|
# Really, verilator proper should generate this
|
|
# if it's useful...
|
|
if re_time.match(line):
|
|
Global['rdtsc_cycle_time'] = re_time.group(1)
|
|
|
|
|
|
def re_match_result(regexp, line, result_to):
|
|
result_to = re.match(regexp, line)
|
|
return result_to
|
|
|
|
|
|
######################################################################
|
|
|
|
|
|
def report():
|
|
print("Verilator Gantt report")
|
|
|
|
print("\nArgument settings:")
|
|
for arg in sorted(Global['args'].keys()):
|
|
plus = "+" if re.match(r'^\+', arg) else " "
|
|
print(" %s%s%s" % (arg, plus, Global['args'][arg]))
|
|
|
|
nthreads = int(Global['stats']['threads'])
|
|
Global['cpus'] = {}
|
|
for thread in Threads:
|
|
# Make potentially multiple characters per column
|
|
for start in Threads[thread]:
|
|
if not Threads[thread][start]: continue
|
|
cpu = Threads[thread][start]['cpu']
|
|
elapsed = Threads[thread][start]['end'] - start
|
|
if cpu not in Global['cpus']:
|
|
Global['cpus'][cpu] = {'cpu_time': 0}
|
|
Global['cpus'][cpu]['cpu_time'] += elapsed
|
|
|
|
measured_mt_mtask_time = 0
|
|
predict_mt_mtask_time = 0
|
|
long_mtask_time = 0
|
|
measured_last_end = 0
|
|
predict_last_end = 0
|
|
for mtask in Mtasks:
|
|
measured_mt_mtask_time += Mtasks[mtask]['elapsed']
|
|
predict_mt_mtask_time += Mtasks[mtask]['predict_cost']
|
|
measured_last_end = max(measured_last_end, Mtasks[mtask]['end'])
|
|
predict_last_end = max(
|
|
predict_last_end,
|
|
Mtasks[mtask]['predict_start'] + Mtasks[mtask]['predict_cost'])
|
|
long_mtask_time = max(long_mtask_time, Mtasks[mtask]['elapsed'])
|
|
Global['measured_last_end'] = measured_last_end
|
|
Global['predict_last_end'] = predict_last_end
|
|
|
|
# If we know cycle time in the same (rdtsc) units,
|
|
# this will give us an actual utilization number,
|
|
# (how effectively we keep the cores busy.)
|
|
#
|
|
# It also gives us a number we can compare against
|
|
# serial mode, to estimate the overhead of data sharing,
|
|
# which will show up in the total elapsed time. (Overhead
|
|
# of synchronization and scheduling should not.)
|
|
print("\nAnalysis:")
|
|
print(" Total threads = %d" % nthreads)
|
|
print(" Total mtasks = %d" % len(Mtasks))
|
|
ncpus = max(len(Global['cpus']), 1)
|
|
print(" Total cpus used = %d" % ncpus)
|
|
print(" Total yields = %d" %
|
|
int(Global['stats'].get('yields', 0)))
|
|
print(" Total evals = %d" % len(Evals))
|
|
print(" Total eval loops = %d" % len(EvalLoops))
|
|
if Mtasks:
|
|
print(" Total eval time = %d rdtsc ticks" %
|
|
Global['measured_last_end'])
|
|
print(" Longest mtask time = %d rdtsc ticks" % long_mtask_time)
|
|
print(" All-thread mtask time = %d rdtsc ticks" %
|
|
measured_mt_mtask_time)
|
|
long_efficiency = long_mtask_time / (Global.get(
|
|
'measured_last_end', 1) or 1)
|
|
print(" Longest-thread efficiency = %0.1f%%" %
|
|
(long_efficiency * 100.0))
|
|
mt_efficiency = measured_mt_mtask_time / (
|
|
Global.get('measured_last_end', 1) * nthreads or 1)
|
|
print(" All-thread efficiency = %0.1f%%" %
|
|
(mt_efficiency * 100.0))
|
|
print(" All-thread speedup = %0.1f" %
|
|
(mt_efficiency * nthreads))
|
|
if Global['rdtsc_cycle_time'] > 0:
|
|
ut = measured_mt_mtask_time / Global['rdtsc_cycle_time']
|
|
print("tot_mtask_cpu=" + measured_mt_mtask_time + " cyc=" +
|
|
Global['rdtsc_cycle_time'] + " ut=" + ut)
|
|
|
|
predict_mt_efficiency = predict_mt_mtask_time / (
|
|
Global.get('predict_last_end', 1) * nthreads or 1)
|
|
print("\nPrediction (what Verilator used for scheduling):")
|
|
print(" All-thread efficiency = %0.1f%%" %
|
|
(predict_mt_efficiency * 100.0))
|
|
print(" All-thread speedup = %0.1f" %
|
|
(predict_mt_efficiency * nthreads))
|
|
|
|
p2e_ratios = []
|
|
min_p2e = 1000000
|
|
min_mtask = None
|
|
max_p2e = -1000000
|
|
max_mtask = None
|
|
|
|
for mtask in sorted(Mtasks.keys()):
|
|
if Mtasks[mtask]['elapsed'] > 0:
|
|
if Mtasks[mtask]['predict_cost'] == 0:
|
|
Mtasks[mtask]['predict_cost'] = 1 # don't log(0) below
|
|
p2e_ratio = math.log(Mtasks[mtask]['predict_cost'] /
|
|
Mtasks[mtask]['elapsed'])
|
|
p2e_ratios.append(p2e_ratio)
|
|
|
|
if p2e_ratio > max_p2e:
|
|
max_p2e = p2e_ratio
|
|
max_mtask = mtask
|
|
if p2e_ratio < min_p2e:
|
|
min_p2e = p2e_ratio
|
|
min_mtask = mtask
|
|
|
|
print("\nMTask statistics:")
|
|
print(" min log(p2e) = %0.3f" % min_p2e, end="")
|
|
print(" from mtask %d (predict %d," %
|
|
(min_mtask, Mtasks[min_mtask]['predict_cost']),
|
|
end="")
|
|
print(" elapsed %d)" % Mtasks[min_mtask]['elapsed'])
|
|
print(" max log(p2e) = %0.3f" % max_p2e, end="")
|
|
print(" from mtask %d (predict %d," %
|
|
(max_mtask, Mtasks[max_mtask]['predict_cost']),
|
|
end="")
|
|
print(" elapsed %d)" % Mtasks[max_mtask]['elapsed'])
|
|
|
|
stddev = statistics.pstdev(p2e_ratios)
|
|
mean = statistics.mean(p2e_ratios)
|
|
print(" mean = %0.3f" % mean)
|
|
print(" stddev = %0.3f" % stddev)
|
|
print(" e ^ stddev = %0.3f" % math.exp(stddev))
|
|
|
|
report_cpus()
|
|
|
|
if nthreads > ncpus:
|
|
print()
|
|
print("%%Warning: There were fewer CPUs (%d) then threads (%d)." %
|
|
(ncpus, nthreads))
|
|
print(" : See docs on use of numactl.")
|
|
else:
|
|
if 'cpu_socket_cores_warning' in Global:
|
|
print()
|
|
print(
|
|
"%Warning: Multiple threads scheduled on same hyperthreaded core."
|
|
)
|
|
print(" : See docs on use of numactl.")
|
|
if 'cpu_sockets_warning' in Global:
|
|
print()
|
|
print("%Warning: Threads scheduled on multiple sockets.")
|
|
print(" : See docs on use of numactl.")
|
|
print()
|
|
|
|
|
|
def report_cpus():
|
|
print("\nCPUs:")
|
|
|
|
Global['cpu_sockets'] = collections.defaultdict(lambda: 0)
|
|
Global['cpu_socket_cores'] = collections.defaultdict(lambda: 0)
|
|
|
|
for cpu in sorted(Global['cpus'].keys()):
|
|
print(" cpu %d: " % cpu, end='')
|
|
print("cpu_time=%d" % Global['cpus'][cpu]['cpu_time'], end='')
|
|
|
|
socket = None
|
|
if cpu in Global['cpuinfo']:
|
|
cpuinfo = Global['cpuinfo'][cpu]
|
|
if 'physical_id' in cpuinfo and 'core_id' in cpuinfo:
|
|
socket = int(cpuinfo['physical_id'])
|
|
Global['cpu_sockets'][socket] += 1
|
|
print(" socket=%d" % socket, end='')
|
|
|
|
core = int(cpuinfo['core_id'])
|
|
Global['cpu_socket_cores'][str(socket) + "__" + str(core)] += 1
|
|
print(" core=%d" % core, end='')
|
|
|
|
if 'model_name' in cpuinfo:
|
|
model = cpuinfo['model_name']
|
|
print(" %s" % model, end='')
|
|
print()
|
|
|
|
if len(Global['cpu_sockets']) > 1:
|
|
Global['cpu_sockets_warning'] = True
|
|
for scn in Global['cpu_socket_cores'].values():
|
|
if scn > 1:
|
|
Global['cpu_socket_cores_warning'] = True
|
|
|
|
|
|
######################################################################
|
|
|
|
|
|
def write_vcd(filename):
|
|
print("Writing %s" % filename)
|
|
with open(filename, "w") as fh:
|
|
vcd = {
|
|
'values':
|
|
collections.defaultdict(lambda: {}), # {<time>}{<code>} = value
|
|
'sigs': {
|
|
'predicted_threads': {},
|
|
'measured_threads': {},
|
|
'cpus': {},
|
|
'evals': {},
|
|
'mtasks': {},
|
|
'Stats': {}
|
|
} # {<module>}{<sig}} = code
|
|
}
|
|
code = 0
|
|
|
|
parallelism = {
|
|
'measured': collections.defaultdict(lambda: 0),
|
|
'predicted': collections.defaultdict(lambda: 0)
|
|
}
|
|
parallelism['measured'][0] = 0
|
|
parallelism['predicted'][0] = 0
|
|
|
|
# Measured graph
|
|
for thread in sorted(Threads.keys()):
|
|
sig = "thread%d_mtask" % thread
|
|
if sig not in vcd['sigs']['measured_threads']:
|
|
vcd['sigs']['measured_threads'][sig] = code
|
|
code += 1
|
|
mcode = vcd['sigs']['measured_threads'][sig]
|
|
|
|
for start in sorted(Threads[thread]):
|
|
mtask = Threads[thread][start]['mtask']
|
|
end = Threads[thread][start]['end']
|
|
cpu = Threads[thread][start]['cpu']
|
|
vcd['values'][start][mcode] = mtask
|
|
vcd['values'][end][mcode] = None
|
|
parallelism['measured'][start] += 1
|
|
parallelism['measured'][end] -= 1
|
|
|
|
sig = "cpu%d_thread" % cpu
|
|
if sig not in vcd['sigs']['cpus']:
|
|
vcd['sigs']['cpus'][sig] = code
|
|
code += 1
|
|
ccode = vcd['sigs']['cpus'][sig]
|
|
vcd['values'][start][ccode] = thread
|
|
vcd['values'][end][ccode] = None
|
|
|
|
sig = "mtask%d_cpu" % mtask
|
|
if sig not in vcd['sigs']['mtasks']:
|
|
vcd['sigs']['mtasks'][sig] = code
|
|
code += 1
|
|
ccode = vcd['sigs']['mtasks'][sig]
|
|
vcd['values'][start][ccode] = cpu
|
|
vcd['values'][end][ccode] = None
|
|
|
|
# Eval graph
|
|
vcd['sigs']['evals']["eval"] = code
|
|
elcode = code
|
|
code += 1
|
|
n = 0
|
|
for eval_start in Evals:
|
|
eval_end = Evals[eval_start]['end']
|
|
n += 1
|
|
vcd['values'][eval_start][elcode] = n
|
|
vcd['values'][eval_end][elcode] = None
|
|
|
|
# Eval_loop graph
|
|
vcd['sigs']['evals']["eval_loop"] = code
|
|
elcode = code
|
|
code += 1
|
|
n = 0
|
|
for eval_start in EvalLoops:
|
|
eval_end = EvalLoops[eval_start]['end']
|
|
n += 1
|
|
vcd['values'][eval_start][elcode] = n
|
|
vcd['values'][eval_end][elcode] = None
|
|
|
|
if Mtasks:
|
|
# Predicted graph
|
|
for eval_start in EvalLoops:
|
|
eval_end = EvalLoops[eval_start]['end']
|
|
# Compute scale so predicted graph is of same width as eval
|
|
measured_scaling = (eval_end -
|
|
eval_start) / Global['predict_last_end']
|
|
# Predict mtasks that fill the time the eval occupied
|
|
for mtask in Mtasks:
|
|
thread = Mtasks[mtask]['thread']
|
|
pred_scaled_start = eval_start + int(
|
|
Mtasks[mtask]['predict_start'] * measured_scaling)
|
|
pred_scaled_end = eval_start + int(
|
|
(Mtasks[mtask]['predict_start'] +
|
|
Mtasks[mtask]['predict_cost']) * measured_scaling)
|
|
if pred_scaled_start == pred_scaled_end:
|
|
continue
|
|
|
|
sig = "predicted_thread%d_mtask" % thread
|
|
if sig not in vcd['sigs']['predicted_threads']:
|
|
vcd['sigs']['predicted_threads'][sig] = code
|
|
code += 1
|
|
mcode = vcd['sigs']['predicted_threads'][sig]
|
|
|
|
vcd['values'][pred_scaled_start][mcode] = mtask
|
|
vcd['values'][pred_scaled_end][mcode] = None
|
|
|
|
parallelism['predicted'][pred_scaled_start] += 1
|
|
parallelism['predicted'][pred_scaled_end] -= 1
|
|
|
|
# Parallelism graph
|
|
for measpred in ('measured', 'predicted'):
|
|
vcd['sigs']['Stats']["%s_parallelism" % measpred] = code
|
|
pcode = code
|
|
code += 1
|
|
value = 0
|
|
for time in sorted(parallelism[measpred].keys()):
|
|
value += parallelism[measpred][time]
|
|
vcd['values'][time][pcode] = value
|
|
|
|
# Create output file
|
|
fh.write("$version Generated by verilator_gantt $end\n")
|
|
fh.write("$timescale 1ns $end\n")
|
|
fh.write("\n")
|
|
|
|
all_codes = {}
|
|
fh.write(" $scope module gantt $end\n")
|
|
for module in sorted(vcd['sigs'].keys()):
|
|
fh.write(" $scope module %s $end\n" % module)
|
|
for sig in sorted(vcd['sigs'][module].keys()):
|
|
code = vcd['sigs'][module][sig]
|
|
fh.write(" $var wire 32 v%x %s [31:0] $end\n" % (code, sig))
|
|
all_codes[code] = 1
|
|
fh.write(" $upscope $end\n")
|
|
fh.write(" $upscope $end\n")
|
|
fh.write("$enddefinitions $end\n")
|
|
fh.write("\n")
|
|
|
|
first = True
|
|
for time in sorted(vcd['values']):
|
|
if first:
|
|
first = False
|
|
# Start with Z for any signals without time zero data
|
|
for code in sorted(all_codes.keys()):
|
|
if code not in vcd['values'][time]:
|
|
vcd['values'][time][code] = None
|
|
fh.write("#%d\n" % time)
|
|
for code in sorted(vcd['values'][time].keys()):
|
|
value = vcd['values'][time][code]
|
|
if value is None:
|
|
fh.write("bz v%x\n" % code)
|
|
else:
|
|
fh.write("b%s v%x\n" % (format(value, 'b'), code))
|
|
|
|
|
|
######################################################################
|
|
|
|
parser = argparse.ArgumentParser(
|
|
allow_abbrev=False,
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
description="""Create Gantt chart of multi-threaded execution""",
|
|
epilog=
|
|
"""Verilator_gantt creates a visual representation to help analyze Verilator
|
|
#xmultithreaded simulation performance, by showing when each macro-task
|
|
#xstarts and ends, and showing when each thread is busy or idle.
|
|
|
|
For documentation see
|
|
https://verilator.org/guide/latest/exe_verilator_gantt.html
|
|
|
|
Copyright 2018-2022 by Wilson Snyder. This program is free software; you
|
|
can redistribute it and/or modify it under the terms of either the GNU
|
|
Lesser General Public License Version 3 or the Perl Artistic License
|
|
Version 2.0.
|
|
|
|
SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0""")
|
|
|
|
parser.add_argument('--debug', action='store_true', help='enable debug')
|
|
parser.add_argument('--no-vcd',
|
|
help='disable creating vcd',
|
|
action='store_true')
|
|
parser.add_argument('--vcd',
|
|
help='filename for vcd outpue',
|
|
default='profile_exec.vcd')
|
|
parser.add_argument('filename',
|
|
help='input profile_exec.dat filename to process',
|
|
default='profile_exec.dat')
|
|
|
|
Args = parser.parse_args()
|
|
|
|
process(Args.filename)
|
|
if not Args.no_vcd:
|
|
write_vcd(Args.vcd)
|
|
|
|
######################################################################
|
|
# Local Variables:
|
|
# compile-command: "./verilator_gantt ../test_regress/t/t_gantt_io.dat"
|
|
# End:
|