forked from github/verilator
515 lines
20 KiB
Python
Executable File
515 lines
20 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# pylint: disable=C0103,C0114,C0116,C0209,C0301,R0914,R0912,R0915,W0511,eval-used
|
|
######################################################################
|
|
|
|
import argparse
|
|
import collections
|
|
import math
|
|
import re
|
|
import statistics
|
|
# from pprint import pprint
|
|
|
|
Threads = collections.defaultdict(lambda: collections.defaultdict(lambda: {}))
|
|
Mtasks = collections.defaultdict(lambda: {})
|
|
Evals = collections.defaultdict(lambda: {})
|
|
EvalLoops = collections.defaultdict(lambda: {})
|
|
Global = {
|
|
'args': {},
|
|
'cpuinfo': collections.defaultdict(lambda: {}),
|
|
'rdtsc_cycle_time': 0,
|
|
'stats': {}
|
|
}
|
|
|
|
######################################################################
|
|
|
|
|
|
def process(filename):
|
|
read_data(filename)
|
|
report()
|
|
|
|
|
|
def read_data(filename):
|
|
with open(filename, "r", encoding="utf8") as fh:
|
|
re_thread = re.compile(r'^VLPROFTHREAD (\d+)$')
|
|
re_record = re.compile(r'^VLPROFEXEC (\S+) (\d+)(.*)$')
|
|
re_payload_mtaskBegin = re.compile(
|
|
r'id (\d+) predictStart (\d+) cpu (\d+)')
|
|
re_payload_mtaskEnd = re.compile(r'id (\d+) predictCost (\d+)')
|
|
|
|
re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.]*)\s*')
|
|
re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.]*)\s*$')
|
|
re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
|
|
re_time = re.compile(r'rdtsc time = (\d+) ticks')
|
|
re_proc_cpu = re.compile(r'VLPROFPROC processor\s*:\s*(\d+)\s*$')
|
|
re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$')
|
|
cpu = None
|
|
thread = None
|
|
|
|
lastEvalBeginTick = None
|
|
lastEvalLoopBeginTick = None
|
|
|
|
for line in fh:
|
|
recordMatch = re_record.match(line)
|
|
if recordMatch:
|
|
kind, tick, payload = recordMatch.groups()
|
|
tick = int(tick)
|
|
payload = payload.strip()
|
|
if kind == "EVAL_BEGIN":
|
|
Evals[tick]['start'] = tick
|
|
lastEvalBeginTick = tick
|
|
elif kind == "EVAL_END":
|
|
Evals[lastEvalBeginTick]['end'] = tick
|
|
lastEvalBeginTick = None
|
|
elif kind == "EVAL_LOOP_BEGIN":
|
|
EvalLoops[tick]['start'] = tick
|
|
lastEvalLoopBeginTick = tick
|
|
elif kind == "EVAL_LOOP_END":
|
|
EvalLoops[lastEvalLoopBeginTick]['end'] = tick
|
|
lastEvalLoopBeginTick = None
|
|
elif kind == "MTASK_BEGIN":
|
|
mtask, predict_start, ecpu = re_payload_mtaskBegin.match(
|
|
payload).groups()
|
|
mtask = int(mtask)
|
|
predict_start = int(predict_start)
|
|
ecpu = int(ecpu)
|
|
Threads[thread][tick]['mtask'] = mtask
|
|
Threads[thread][tick]['predict_start'] = predict_start
|
|
Threads[thread][tick]['cpu'] = ecpu
|
|
if 'elapsed' not in Mtasks[mtask]:
|
|
Mtasks[mtask] = {'end': 0, 'elapsed': 0}
|
|
Mtasks[mtask]['begin'] = tick
|
|
Mtasks[mtask]['thread'] = thread
|
|
Mtasks[mtask]['predict_start'] = predict_start
|
|
elif kind == "MTASK_END":
|
|
mtask, predict_cost = re_payload_mtaskEnd.match(
|
|
payload).groups()
|
|
mtask = int(mtask)
|
|
predict_cost = int(predict_cost)
|
|
begin = Mtasks[mtask]['begin']
|
|
Threads[thread][begin]['end'] = tick
|
|
Threads[thread][begin]['predict_cost'] = predict_cost
|
|
Mtasks[mtask]['elapsed'] += tick - begin
|
|
Mtasks[mtask]['predict_cost'] = predict_cost
|
|
Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], tick)
|
|
elif Args.debug:
|
|
print("-Unknown execution trace record: %s" % line)
|
|
elif re_thread.match(line):
|
|
thread = int(re_thread.match(line).group(1))
|
|
elif re.match(r'^VLPROF(THREAD|VERSION)', line):
|
|
pass
|
|
elif re_arg1.match(line):
|
|
match = re_arg1.match(line)
|
|
Global['args'][match.group(1)] = match.group(2)
|
|
elif re_arg2.match(line):
|
|
match = re_arg2.match(line)
|
|
Global['args'][match.group(1)] = match.group(2)
|
|
elif re_stat.match(line):
|
|
match = re_stat.match(line)
|
|
Global['stats'][match.group(1)] = match.group(2)
|
|
elif re_proc_cpu.match(line):
|
|
match = re_proc_cpu.match(line)
|
|
cpu = int(match.group(1))
|
|
elif cpu and re_proc_dat.match(line):
|
|
match = re_proc_dat.match(line)
|
|
term = match.group(1)
|
|
value = match.group(2)
|
|
term = re.sub(r'\s+$', '', term)
|
|
term = re.sub(r'\s+', '_', term)
|
|
value = re.sub(r'\s+$', '', value)
|
|
Global['cpuinfo'][cpu][term] = value
|
|
elif re.match(r'^#', line):
|
|
pass
|
|
elif Args.debug:
|
|
print("-Unk: %s" % line)
|
|
# TODO -- this is parsing text printed by a client.
|
|
# Really, verilator proper should generate this
|
|
# if it's useful...
|
|
if re_time.match(line):
|
|
Global['rdtsc_cycle_time'] = re_time.group(1)
|
|
|
|
|
|
def re_match_result(regexp, line, result_to):
|
|
result_to = re.match(regexp, line)
|
|
return result_to
|
|
|
|
|
|
######################################################################
|
|
|
|
|
|
def report():
|
|
print("Verilator Gantt report")
|
|
|
|
print("\nArgument settings:")
|
|
for arg in sorted(Global['args'].keys()):
|
|
plus = "+" if re.match(r'^\+', arg) else " "
|
|
print(" %s%s%s" % (arg, plus, Global['args'][arg]))
|
|
|
|
nthreads = int(Global['stats']['threads'])
|
|
Global['cpus'] = {}
|
|
for thread in Threads:
|
|
# Make potentially multiple characters per column
|
|
for start in Threads[thread]:
|
|
if not Threads[thread][start]:
|
|
continue
|
|
cpu = Threads[thread][start]['cpu']
|
|
elapsed = Threads[thread][start]['end'] - start
|
|
if cpu not in Global['cpus']:
|
|
Global['cpus'][cpu] = {'cpu_time': 0}
|
|
Global['cpus'][cpu]['cpu_time'] += elapsed
|
|
|
|
measured_mt_mtask_time = 0
|
|
predict_mt_mtask_time = 0
|
|
long_mtask_time = 0
|
|
measured_last_end = 0
|
|
predict_last_end = 0
|
|
for mtask in Mtasks:
|
|
measured_mt_mtask_time += Mtasks[mtask]['elapsed']
|
|
predict_mt_mtask_time += Mtasks[mtask]['predict_cost']
|
|
measured_last_end = max(measured_last_end, Mtasks[mtask]['end'])
|
|
predict_last_end = max(
|
|
predict_last_end,
|
|
Mtasks[mtask]['predict_start'] + Mtasks[mtask]['predict_cost'])
|
|
long_mtask_time = max(long_mtask_time, Mtasks[mtask]['elapsed'])
|
|
Global['measured_last_end'] = measured_last_end
|
|
Global['predict_last_end'] = predict_last_end
|
|
|
|
# If we know cycle time in the same (rdtsc) units,
|
|
# this will give us an actual utilization number,
|
|
# (how effectively we keep the cores busy.)
|
|
#
|
|
# It also gives us a number we can compare against
|
|
# serial mode, to estimate the overhead of data sharing,
|
|
# which will show up in the total elapsed time. (Overhead
|
|
# of synchronization and scheduling should not.)
|
|
print("\nAnalysis:")
|
|
print(" Total threads = %d" % nthreads)
|
|
print(" Total mtasks = %d" % len(Mtasks))
|
|
ncpus = max(len(Global['cpus']), 1)
|
|
print(" Total cpus used = %d" % ncpus)
|
|
print(" Total yields = %d" %
|
|
int(Global['stats'].get('yields', 0)))
|
|
print(" Total evals = %d" % len(Evals))
|
|
print(" Total eval loops = %d" % len(EvalLoops))
|
|
if Mtasks:
|
|
print(" Total eval time = %d rdtsc ticks" %
|
|
Global['measured_last_end'])
|
|
print(" Longest mtask time = %d rdtsc ticks" % long_mtask_time)
|
|
print(" All-thread mtask time = %d rdtsc ticks" %
|
|
measured_mt_mtask_time)
|
|
long_efficiency = long_mtask_time / (Global.get(
|
|
'measured_last_end', 1) or 1)
|
|
print(" Longest-thread efficiency = %0.1f%%" %
|
|
(long_efficiency * 100.0))
|
|
mt_efficiency = measured_mt_mtask_time / (
|
|
Global.get('measured_last_end', 1) * nthreads or 1)
|
|
print(" All-thread efficiency = %0.1f%%" %
|
|
(mt_efficiency * 100.0))
|
|
print(" All-thread speedup = %0.1f" %
|
|
(mt_efficiency * nthreads))
|
|
if Global['rdtsc_cycle_time'] > 0:
|
|
ut = measured_mt_mtask_time / Global['rdtsc_cycle_time']
|
|
print("tot_mtask_cpu=" + measured_mt_mtask_time + " cyc=" +
|
|
Global['rdtsc_cycle_time'] + " ut=" + ut)
|
|
|
|
predict_mt_efficiency = predict_mt_mtask_time / (
|
|
Global.get('predict_last_end', 1) * nthreads or 1)
|
|
print("\nPrediction (what Verilator used for scheduling):")
|
|
print(" All-thread efficiency = %0.1f%%" %
|
|
(predict_mt_efficiency * 100.0))
|
|
print(" All-thread speedup = %0.1f" %
|
|
(predict_mt_efficiency * nthreads))
|
|
|
|
p2e_ratios = []
|
|
min_p2e = 1000000
|
|
min_mtask = None
|
|
max_p2e = -1000000
|
|
max_mtask = None
|
|
|
|
for mtask in sorted(Mtasks.keys()):
|
|
if Mtasks[mtask]['elapsed'] > 0:
|
|
if Mtasks[mtask]['predict_cost'] == 0:
|
|
Mtasks[mtask]['predict_cost'] = 1 # don't log(0) below
|
|
p2e_ratio = math.log(Mtasks[mtask]['predict_cost'] /
|
|
Mtasks[mtask]['elapsed'])
|
|
p2e_ratios.append(p2e_ratio)
|
|
|
|
if p2e_ratio > max_p2e:
|
|
max_p2e = p2e_ratio
|
|
max_mtask = mtask
|
|
if p2e_ratio < min_p2e:
|
|
min_p2e = p2e_ratio
|
|
min_mtask = mtask
|
|
|
|
print("\nMTask statistics:")
|
|
print(" min log(p2e) = %0.3f" % min_p2e, end="")
|
|
print(" from mtask %d (predict %d," %
|
|
(min_mtask, Mtasks[min_mtask]['predict_cost']),
|
|
end="")
|
|
print(" elapsed %d)" % Mtasks[min_mtask]['elapsed'])
|
|
print(" max log(p2e) = %0.3f" % max_p2e, end="")
|
|
print(" from mtask %d (predict %d," %
|
|
(max_mtask, Mtasks[max_mtask]['predict_cost']),
|
|
end="")
|
|
print(" elapsed %d)" % Mtasks[max_mtask]['elapsed'])
|
|
|
|
stddev = statistics.pstdev(p2e_ratios)
|
|
mean = statistics.mean(p2e_ratios)
|
|
print(" mean = %0.3f" % mean)
|
|
print(" stddev = %0.3f" % stddev)
|
|
print(" e ^ stddev = %0.3f" % math.exp(stddev))
|
|
|
|
report_cpus()
|
|
|
|
if nthreads > ncpus:
|
|
print()
|
|
print("%%Warning: There were fewer CPUs (%d) then threads (%d)." %
|
|
(ncpus, nthreads))
|
|
print(" : See docs on use of numactl.")
|
|
else:
|
|
if 'cpu_socket_cores_warning' in Global:
|
|
print()
|
|
print(
|
|
"%Warning: Multiple threads scheduled on same hyperthreaded core."
|
|
)
|
|
print(" : See docs on use of numactl.")
|
|
if 'cpu_sockets_warning' in Global:
|
|
print()
|
|
print("%Warning: Threads scheduled on multiple sockets.")
|
|
print(" : See docs on use of numactl.")
|
|
print()
|
|
|
|
|
|
def report_cpus():
|
|
print("\nCPUs:")
|
|
|
|
Global['cpu_sockets'] = collections.defaultdict(lambda: 0)
|
|
Global['cpu_socket_cores'] = collections.defaultdict(lambda: 0)
|
|
|
|
for cpu in sorted(Global['cpus'].keys()):
|
|
print(" cpu %d: " % cpu, end='')
|
|
print("cpu_time=%d" % Global['cpus'][cpu]['cpu_time'], end='')
|
|
|
|
socket = None
|
|
if cpu in Global['cpuinfo']:
|
|
cpuinfo = Global['cpuinfo'][cpu]
|
|
if 'physical_id' in cpuinfo and 'core_id' in cpuinfo:
|
|
socket = int(cpuinfo['physical_id'])
|
|
Global['cpu_sockets'][socket] += 1
|
|
print(" socket=%d" % socket, end='')
|
|
|
|
core = int(cpuinfo['core_id'])
|
|
Global['cpu_socket_cores'][str(socket) + "__" + str(core)] += 1
|
|
print(" core=%d" % core, end='')
|
|
|
|
if 'model_name' in cpuinfo:
|
|
model = cpuinfo['model_name']
|
|
print(" %s" % model, end='')
|
|
print()
|
|
|
|
if len(Global['cpu_sockets']) > 1:
|
|
Global['cpu_sockets_warning'] = True
|
|
for scn in Global['cpu_socket_cores'].values():
|
|
if scn > 1:
|
|
Global['cpu_socket_cores_warning'] = True
|
|
|
|
|
|
######################################################################
|
|
|
|
|
|
def write_vcd(filename):
|
|
print("Writing %s" % filename)
|
|
with open(filename, "w", encoding="utf8") as fh:
|
|
vcd = {
|
|
'values':
|
|
collections.defaultdict(lambda: {}), # {<time>}{<code>} = value
|
|
'sigs': {
|
|
'predicted_threads': {},
|
|
'measured_threads': {},
|
|
'cpus': {},
|
|
'evals': {},
|
|
'mtasks': {},
|
|
'Stats': {}
|
|
} # {<module>}{<sig}} = code
|
|
}
|
|
code = 0
|
|
|
|
parallelism = {
|
|
'measured': collections.defaultdict(lambda: 0),
|
|
'predicted': collections.defaultdict(lambda: 0)
|
|
}
|
|
parallelism['measured'][0] = 0
|
|
parallelism['predicted'][0] = 0
|
|
|
|
# Measured graph
|
|
for thread in sorted(Threads.keys()):
|
|
sig = "thread%d_mtask" % thread
|
|
if sig not in vcd['sigs']['measured_threads']:
|
|
vcd['sigs']['measured_threads'][sig] = code
|
|
code += 1
|
|
mcode = vcd['sigs']['measured_threads'][sig]
|
|
|
|
for start in sorted(Threads[thread]):
|
|
mtask = Threads[thread][start]['mtask']
|
|
end = Threads[thread][start]['end']
|
|
cpu = Threads[thread][start]['cpu']
|
|
vcd['values'][start][mcode] = mtask
|
|
vcd['values'][end][mcode] = None
|
|
parallelism['measured'][start] += 1
|
|
parallelism['measured'][end] -= 1
|
|
|
|
sig = "cpu%d_thread" % cpu
|
|
if sig not in vcd['sigs']['cpus']:
|
|
vcd['sigs']['cpus'][sig] = code
|
|
code += 1
|
|
ccode = vcd['sigs']['cpus'][sig]
|
|
vcd['values'][start][ccode] = thread
|
|
vcd['values'][end][ccode] = None
|
|
|
|
sig = "mtask%d_cpu" % mtask
|
|
if sig not in vcd['sigs']['mtasks']:
|
|
vcd['sigs']['mtasks'][sig] = code
|
|
code += 1
|
|
ccode = vcd['sigs']['mtasks'][sig]
|
|
vcd['values'][start][ccode] = cpu
|
|
vcd['values'][end][ccode] = None
|
|
|
|
# Eval graph
|
|
vcd['sigs']['evals']["eval"] = code
|
|
elcode = code
|
|
code += 1
|
|
n = 0
|
|
for eval_start in Evals:
|
|
eval_end = Evals[eval_start]['end']
|
|
n += 1
|
|
vcd['values'][eval_start][elcode] = n
|
|
vcd['values'][eval_end][elcode] = None
|
|
|
|
# Eval_loop graph
|
|
vcd['sigs']['evals']["eval_loop"] = code
|
|
elcode = code
|
|
code += 1
|
|
n = 0
|
|
for eval_start in EvalLoops:
|
|
eval_end = EvalLoops[eval_start]['end']
|
|
n += 1
|
|
vcd['values'][eval_start][elcode] = n
|
|
vcd['values'][eval_end][elcode] = None
|
|
|
|
if Mtasks:
|
|
# Predicted graph
|
|
for eval_start in EvalLoops:
|
|
eval_end = EvalLoops[eval_start]['end']
|
|
# Compute scale so predicted graph is of same width as eval
|
|
measured_scaling = (eval_end -
|
|
eval_start) / Global['predict_last_end']
|
|
# Predict mtasks that fill the time the eval occupied
|
|
for mtask in Mtasks:
|
|
thread = Mtasks[mtask]['thread']
|
|
pred_scaled_start = eval_start + int(
|
|
Mtasks[mtask]['predict_start'] * measured_scaling)
|
|
pred_scaled_end = eval_start + int(
|
|
(Mtasks[mtask]['predict_start'] +
|
|
Mtasks[mtask]['predict_cost']) * measured_scaling)
|
|
if pred_scaled_start == pred_scaled_end:
|
|
continue
|
|
|
|
sig = "predicted_thread%d_mtask" % thread
|
|
if sig not in vcd['sigs']['predicted_threads']:
|
|
vcd['sigs']['predicted_threads'][sig] = code
|
|
code += 1
|
|
mcode = vcd['sigs']['predicted_threads'][sig]
|
|
|
|
vcd['values'][pred_scaled_start][mcode] = mtask
|
|
vcd['values'][pred_scaled_end][mcode] = None
|
|
|
|
parallelism['predicted'][pred_scaled_start] += 1
|
|
parallelism['predicted'][pred_scaled_end] -= 1
|
|
|
|
# Parallelism graph
|
|
for measpred in ('measured', 'predicted'):
|
|
vcd['sigs']['Stats']["%s_parallelism" % measpred] = code
|
|
pcode = code
|
|
code += 1
|
|
value = 0
|
|
for time in sorted(parallelism[measpred].keys()):
|
|
value += parallelism[measpred][time]
|
|
vcd['values'][time][pcode] = value
|
|
|
|
# Create output file
|
|
fh.write("$version Generated by verilator_gantt $end\n")
|
|
fh.write("$timescale 1ns $end\n")
|
|
fh.write("\n")
|
|
|
|
all_codes = {}
|
|
fh.write(" $scope module gantt $end\n")
|
|
for module in sorted(vcd['sigs'].keys()):
|
|
fh.write(" $scope module %s $end\n" % module)
|
|
for sig in sorted(vcd['sigs'][module].keys()):
|
|
code = vcd['sigs'][module][sig]
|
|
fh.write(" $var wire 32 v%x %s [31:0] $end\n" % (code, sig))
|
|
all_codes[code] = 1
|
|
fh.write(" $upscope $end\n")
|
|
fh.write(" $upscope $end\n")
|
|
fh.write("$enddefinitions $end\n")
|
|
fh.write("\n")
|
|
|
|
first = True
|
|
for time in sorted(vcd['values']):
|
|
if first:
|
|
first = False
|
|
# Start with Z for any signals without time zero data
|
|
for code in sorted(all_codes.keys()):
|
|
if code not in vcd['values'][time]:
|
|
vcd['values'][time][code] = None
|
|
fh.write("#%d\n" % time)
|
|
for code in sorted(vcd['values'][time].keys()):
|
|
value = vcd['values'][time][code]
|
|
if value is None:
|
|
fh.write("bz v%x\n" % code)
|
|
else:
|
|
fh.write("b%s v%x\n" % (format(value, 'b'), code))
|
|
|
|
|
|
######################################################################
|
|
|
|
parser = argparse.ArgumentParser(
|
|
allow_abbrev=False,
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
description="""Create Gantt chart of multi-threaded execution.
|
|
|
|
Verilator_gantt creates a visual representation to help analyze Verilator
|
|
#xmultithreaded simulation performance, by showing when each macro-task
|
|
#xstarts and ends, and showing when each thread is busy or idle.
|
|
|
|
For documentation see
|
|
https://verilator.org/guide/latest/exe_verilator_gantt.html""",
|
|
epilog=
|
|
"""Copyright 2018-2023 by Wilson Snyder. This program is free software; you
|
|
can redistribute it and/or modify it under the terms of either the GNU
|
|
Lesser General Public License Version 3 or the Perl Artistic License
|
|
Version 2.0.
|
|
|
|
SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0""")
|
|
|
|
parser.add_argument('--debug', action='store_true', help='enable debug')
|
|
parser.add_argument('--no-vcd',
|
|
help='disable creating vcd',
|
|
action='store_true')
|
|
parser.add_argument('--vcd',
|
|
help='filename for vcd outpue',
|
|
default='profile_exec.vcd')
|
|
parser.add_argument('filename',
|
|
help='input profile_exec.dat filename to process',
|
|
default='profile_exec.dat')
|
|
|
|
Args = parser.parse_args()
|
|
|
|
process(Args.filename)
|
|
if not Args.no_vcd:
|
|
write_vcd(Args.vcd)
|
|
|
|
######################################################################
|
|
# Local Variables:
|
|
# compile-command: "./verilator_gantt ../test_regress/t/t_gantt_io.dat"
|
|
# End:
|