verilator/bin/verilator_gantt

597 lines
22 KiB
Python
Executable File

#!/usr/bin/env python3
# pylint: disable=C0103,C0114,C0116,C0301,R0914,R0912,R0915,W0511,eval-used
######################################################################
import argparse
import collections
import math
import re
import statistics
# from pprint import pprint
Threads = collections.defaultdict(lambda: {})
Mtasks = collections.defaultdict(lambda: {})
Evals = collections.defaultdict(lambda: {})
EvalLoops = collections.defaultdict(lambda: {})
Global = {
'args': {},
'cpuinfo': collections.defaultdict(lambda: {}),
'rdtsc_cycle_time': 0,
'stats': {}
}
######################################################################
def process(filename):
read_data(filename)
report()
def read_data(filename):
with open(filename) as fh:
re_prof = re.compile(
r'^VLPROF mtask\s(\d+)\sstart\s(\d+)\selapsed\s(\d+)\spredict_start\s(\d+)\spredict_cost\s(\d+)\scpu\s(\d+)\son thread (\d+)'
)
re_eval = re.compile(r'^VLPROF eval\sstart\s(\d+)\selapsed\s(\d+)')
re_loop = re.compile(
r'^VLPROF eval_loop\sstart\s(\d+)\selapsed\s(\d+)')
re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.])\s*')
re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.])\s*$')
re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
re_time = re.compile(r'rdtsc time = (\d+) ticks')
re_proc_cpu = re.compile(r'VLPROFPROC processor\s*:\s*(\d+)\s*$')
re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$')
cpu = None
for line in fh:
if re_prof.match(line):
match = re_prof.match(line)
mtask = int(match.group(1))
start = int(match.group(2))
elapsed_time = int(match.group(3))
end = start + elapsed_time
predict_start = int(match.group(4))
predict_cost = int(match.group(5))
cpu = int(match.group(6))
thread = int(match.group(7))
if start not in Threads[thread]:
Threads[thread][start] = {}
Threads[thread][start]['mtask'] = mtask
Threads[thread][start]['end'] = end
Threads[thread][start]['cpu'] = cpu
Threads[thread][start]['predict_start'] = predict_start
Threads[thread][start]['predict_cost'] = predict_cost
if 'elapsed' not in Mtasks[mtask]:
Mtasks[mtask] = {'end': 0, 'elapsed': 0}
Mtasks[mtask]['thread'] = thread
Mtasks[mtask]['elapsed'] += elapsed_time
Mtasks[mtask]['predict_start'] = predict_start
Mtasks[mtask]['predict_cost'] = predict_cost
Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], end)
elif re_eval.match(line):
match = re_eval.match(line)
start = int(match.group(1))
elapsed_time = int(match.group(2))
Evals[start]['start'] = start
Evals[start]['end'] = start + elapsed_time
elif re_loop.match(line):
match = re_loop.match(line)
start = int(match.group(1))
elapsed_time = int(match.group(2))
EvalLoops[start]['start'] = start
EvalLoops[start]['end'] = start + elapsed_time
elif re.match(r'^VLPROFTHREAD', line):
None # pylint: disable=pointless-statement
elif re_arg1.match(line):
match = re_arg1.match(line)
Global['args'][match.group(1)] = match.group(2)
elif re_arg2.match(line):
match = re_arg2.match(line)
Global['args'][match.group(1)] = match.group(2)
elif re_stat.match(line):
match = re_stat.match(line)
Global['stats'][match.group(1)] = match.group(2)
elif re_proc_cpu.match(line):
match = re_proc_cpu.match(line)
cpu = int(match.group(1))
elif cpu and re_proc_dat.match(line):
match = re_proc_dat.match(line)
term = match.group(1)
value = match.group(2)
term = re.sub(r'\s+$', '', term)
term = re.sub(r'\s+', '_', term)
value = re.sub(r'\s+$', '', value)
Global['cpuinfo'][cpu][term] = value
elif re.match(r'^#', line):
None # pylint: disable=pointless-statement
elif Args.debug:
print("-Unk: %s" % line)
# TODO -- this is parsing text printed by a client.
# Really, verilator proper should generate this
# if it's useful...
if re_time.match(line):
Global['rdtsc_cycle_time'] = re_time.group(1)
def re_match_result(regexp, line, result_to):
result_to = re.match(regexp, line)
return result_to
######################################################################
def report():
print("Verilator Gantt report")
print("\nArgument settings:")
for arg in sorted(Global['args'].keys()):
plus = "+" if re.match(r'^\+', arg) else " "
print(" %s%s%s" % (arg, plus, Global['args'][arg]))
nthreads = len(Threads)
Global['cpus'] = {}
for thread in Threads:
# Make potentially multiple characters per column
for start in Threads[thread]:
cpu = Threads[thread][start]['cpu']
elapsed = Threads[thread][start]['end'] - start
if cpu not in Global['cpus']:
Global['cpus'][cpu] = {'cpu_time': 0}
Global['cpus'][cpu]['cpu_time'] += elapsed
measured_mt_mtask_time = 0
predict_mt_mtask_time = 0
long_mtask_time = 0
measured_last_end = 0
predict_last_end = 0
for mtask in Mtasks:
measured_mt_mtask_time += Mtasks[mtask]['elapsed']
predict_mt_mtask_time += Mtasks[mtask]['predict_cost']
measured_last_end = max(measured_last_end, Mtasks[mtask]['end'])
predict_last_end = max(
predict_last_end,
Mtasks[mtask]['predict_start'] + Mtasks[mtask]['predict_cost'])
long_mtask_time = max(long_mtask_time, Mtasks[mtask]['elapsed'])
Global['measured_last_end'] = measured_last_end
Global['predict_last_end'] = predict_last_end
report_graph()
# If we know cycle time in the same (rdtsc) units,
# this will give us an actual utilization number,
# (how effectively we keep the cores busy.)
#
# It also gives us a number we can compare against
# serial mode, to estimate the overhead of data sharing,
# which will show up in the total elapsed time. (Overhead
# of synchronization and scheduling should not.)
print("\nAnalysis:")
print(" Total threads = %d" % nthreads)
print(" Total mtasks = %d" % len(Mtasks))
ncpus = len(Global['cpus'])
print(" Total cpus used = %d" % ncpus)
print(" Total yields = %d" % int(Global['stats']['yields']))
print(" Total eval time = %d rdtsc ticks" %
Global['measured_last_end'])
print(" Longest mtask time = %d rdtsc ticks" % long_mtask_time)
print(" All-thread mtask time = %d rdtsc ticks" %
measured_mt_mtask_time)
long_efficiency = long_mtask_time / (Global.get('measured_last_end', 1) or 1)
print(" Longest-thread efficiency = %0.1f%%" % (long_efficiency * 100.0))
mt_efficiency = measured_mt_mtask_time / (
Global.get('measured_last_end', 1) * nthreads or 1)
print(" All-thread efficiency = %0.1f%%" % (mt_efficiency * 100.0))
print(" All-thread speedup = %0.1f" % (mt_efficiency * nthreads))
if Global['rdtsc_cycle_time'] > 0:
ut = measured_mt_mtask_time / Global['rdtsc_cycle_time']
print("tot_mtask_cpu=" + measured_mt_mtask_time + " cyc=" +
Global['rdtsc_cycle_time'] + " ut=" + ut)
predict_mt_efficiency = predict_mt_mtask_time / (
Global.get('predict_last_end', 1) * nthreads or 1)
print("\nPrediction (what Verilator used for scheduling):")
print(" All-thread efficiency = %0.1f%%" %
(predict_mt_efficiency * 100.0))
print(" All-thread speedup = %0.1f" %
(predict_mt_efficiency * nthreads))
p2e_ratios = []
min_p2e = 1000000
min_mtask = None
max_p2e = -1000000
max_mtask = None
for mtask in sorted(Mtasks.keys()):
if Mtasks[mtask]['elapsed'] > 0:
if Mtasks[mtask]['predict_cost'] == 0:
Mtasks[mtask]['predict_cost'] = 1 # don't log(0) below
p2e_ratio = math.log(Mtasks[mtask]['predict_cost'] /
Mtasks[mtask]['elapsed'])
p2e_ratios.append(p2e_ratio)
if p2e_ratio > max_p2e:
max_p2e = p2e_ratio
max_mtask = mtask
if p2e_ratio < min_p2e:
min_p2e = p2e_ratio
min_mtask = mtask
print("\nStatistics:")
print(" min log(p2e) = %0.3f" % min_p2e, end="")
print(" from mtask %d (predict %d," %
(min_mtask, Mtasks[min_mtask]['predict_cost']),
end="")
print(" elapsed %d)" % Mtasks[min_mtask]['elapsed'])
print(" max log(p2e) = %0.3f" % max_p2e, end="")
print(" from mtask %d (predict %d," %
(max_mtask, Mtasks[max_mtask]['predict_cost']),
end="")
print(" elapsed %d)" % Mtasks[max_mtask]['elapsed'])
stddev = statistics.pstdev(p2e_ratios)
mean = statistics.mean(p2e_ratios)
print(" mean = %0.3f" % mean)
print(" stddev = %0.3f" % stddev)
print(" e ^ stddev = %0.3f" % math.exp(stddev))
report_cpus()
if nthreads > ncpus:
print()
print("%%Warning: There were fewer CPUs (%d) then threads (%d)." %
(ncpus, nthreads))
print(" : See docs on use of numactl.")
else:
if 'cpu_socket_cores_warning' in Global:
print()
print(
"%Warning: Multiple threads scheduled on same hyperthreaded core."
)
print(" : See docs on use of numactl.")
if 'cpu_sockets_warning' in Global:
print()
print("%Warning: Threads scheduled on multiple sockets.")
print(" : See docs on use of numactl.")
print()
def report_cpus():
print("\nCPUs:")
Global['cpu_sockets'] = collections.defaultdict(lambda: 0)
Global['cpu_socket_cores'] = collections.defaultdict(lambda: 0)
for cpu in sorted(Global['cpus'].keys()):
print(" cpu %d: " % cpu, end='')
print("cpu_time=%d" % Global['cpus'][cpu]['cpu_time'], end='')
socket = None
if cpu in Global['cpuinfo']:
socket = int(Global['cpuinfo'][cpu]['physical_id'])
Global['cpu_sockets'][socket] += 1
print(" socket=%d" % socket, end='')
core = int(Global['cpuinfo'][cpu]['core_id'])
Global['cpu_socket_cores'][str(socket) + "__" + str(core)] += 1
print(" core=%d" % core, end='')
model = Global['cpuinfo'][cpu]['model_name']
if model:
print(" %s" % model, end='')
print()
if len(Global['cpu_sockets']) > 1:
Global['cpu_sockets_warning'] = True
for scn in Global['cpu_socket_cores'].values():
if scn > 1:
Global['cpu_socket_cores_warning'] = True
def report_graph():
time_per = Args.scale
if time_per == 0:
time_per = Global['measured_last_end'] / 40 # Start with 40 columns
while time_per > 10:
(graph, conflicts) = _make_graph(time_per)
if not conflicts:
break
time_per = int(time_per / 2)
# One more step so we can fit more labels
time_per = int(time_per / 2)
if time_per <= 0:
time_per = 1
(graph, conflicts) = _make_graph(time_per)
print("\nThread gantt graph:")
print(" Legend: One character width = %s rdtsc ticks" % time_per)
print(" Legend: '&' = multiple mtasks in this period (character width)")
scale = " <-%d rdtsc total" % Global['measured_last_end']
for col in range(len(scale),
int(0.99 + (Global['measured_last_end'] / time_per))): # pylint: disable=unused-variable
scale += "-"
print(" " + scale + "->")
for thread in sorted(graph.keys()):
print(" t: ", end="")
_print_graph_line(graph[thread], '')
def _make_graph(time_per):
# [thread][column] = char or #
graph = collections.defaultdict(
lambda: collections.defaultdict(lambda: ''))
conflicts = 0
for thread in Threads:
# Make potentially multiple characters per column
multi_at_col = collections.defaultdict(lambda: '')
for start in sorted(Threads[thread].keys()):
end = Threads[thread][start]['end']
# mtask = Threads[thread][start]['mtask']
cpu = Threads[thread][start]['cpu']
startcol = _time_col(time_per, start)
endcol = _time_col(time_per, end)
label = "["
label += str(cpu) # Maybe make optional in future
width = endcol - startcol + 1
while len(label) < (width - 1): # -1 for ']'
label += "-"
label += "]"
multi_at_col[startcol] += label
# Expand line to one char per column
for col in multi_at_col:
chars = multi_at_col[col]
ok = True
for coladd in range(0, len(chars)):
if col + coladd in graph[thread]:
ok = False
break
if not ok:
if re.search(r'\[.*\[', chars): # Two begins or more
conflicts += 1
graph[thread][col] = "&"
else:
graph[thread][col] = "["
for coladd in range(1, len(chars)):
if col + coladd in graph[thread]:
break
graph[thread][col + coladd] = 'x'
else:
coladd = 0
for char in chars:
graph[thread][col + coladd] = char
coladd += 1
if Args.debug:
print("# Conflicts %d" % conflicts)
return (graph, conflicts)
def _print_graph_line(graph_thread, sep):
at = 0
for col in sorted(graph_thread.keys()):
while at < col:
print(' ', end="")
at += 1
c = graph_thread[col]
print(c + sep, end="")
at += len(c)
print()
def _time_col(time_per, time):
return int(time / time_per)
######################################################################
def write_vcd(filename):
print("Writing %s" % filename)
with open(filename, "w") as fh:
vcd = {
'values':
collections.defaultdict(lambda: {}), # {<time>}{<code>} = value
'sigs': {
'predicted_threads': {},
'measured_threads': {},
'cpus': {},
'evals': {},
'mtasks': {},
'Stats': {}
} # {<module>}{<sig}} = code
}
code = 0
parallelism = {
'measured': collections.defaultdict(lambda: 0),
'predicted': collections.defaultdict(lambda: 0)
}
parallelism['measured'][0] = 0
parallelism['predicted'][0] = 0
# Measured graph
for thread in sorted(Threads.keys()):
sig = "thread%d_mtask" % thread
if sig not in vcd['sigs']['measured_threads']:
vcd['sigs']['measured_threads'][sig] = code
code += 1
mcode = vcd['sigs']['measured_threads'][sig]
for start in sorted(Threads[thread]):
mtask = Threads[thread][start]['mtask']
end = Threads[thread][start]['end']
cpu = Threads[thread][start]['cpu']
vcd['values'][start][mcode] = mtask
vcd['values'][end][mcode] = None
parallelism['measured'][start] += 1
parallelism['measured'][end] -= 1
sig = "cpu%d_thread" % cpu
if sig not in vcd['sigs']['cpus']:
vcd['sigs']['cpus'][sig] = code
code += 1
ccode = vcd['sigs']['cpus'][sig]
vcd['values'][start][ccode] = thread
vcd['values'][end][ccode] = None
sig = "mtask%d_cpu" % mtask
if sig not in vcd['sigs']['mtasks']:
vcd['sigs']['mtasks'][sig] = code
code += 1
ccode = vcd['sigs']['mtasks'][sig]
vcd['values'][start][ccode] = cpu
vcd['values'][end][ccode] = None
# Eval graph
vcd['sigs']['evals']["eval"] = code
elcode = code
code += 1
n = 0
for eval_start in Evals:
eval_end = Evals[eval_start]['end']
n += 1
vcd['values'][eval_start][elcode] = n
vcd['values'][eval_end][elcode] = None
# Eval_loop graph
vcd['sigs']['evals']["eval_loop"] = code
elcode = code
code += 1
n = 0
for eval_start in EvalLoops:
eval_end = EvalLoops[eval_start]['end']
n += 1
vcd['values'][eval_start][elcode] = n
vcd['values'][eval_end][elcode] = None
# Predicted graph
for eval_start in EvalLoops:
eval_end = EvalLoops[eval_start]['end']
# Compute scale so predicted graph is of same width as eval
measured_scaling = (eval_end -
eval_start) / Global['predict_last_end']
# Predict mtasks that fill the time the eval occupied
for mtask in Mtasks:
thread = Mtasks[mtask]['thread']
pred_scaled_start = eval_start + int(
Mtasks[mtask]['predict_start'] * measured_scaling)
pred_scaled_end = eval_start + int(
(Mtasks[mtask]['predict_start'] +
Mtasks[mtask]['predict_cost']) * measured_scaling)
if pred_scaled_start == pred_scaled_end:
continue
sig = "predicted_thread%d_mtask" % thread
if sig not in vcd['sigs']['predicted_threads']:
vcd['sigs']['predicted_threads'][sig] = code
code += 1
mcode = vcd['sigs']['predicted_threads'][sig]
vcd['values'][pred_scaled_start][mcode] = mtask
vcd['values'][pred_scaled_end][mcode] = None
parallelism['predicted'][pred_scaled_start] += 1
parallelism['predicted'][pred_scaled_end] -= 1
# Parallelism graph
for measpred in ('measured', 'predicted'):
vcd['sigs']['Stats']["%s_parallelism" % measpred] = code
pcode = code
code += 1
value = 0
for time in sorted(parallelism[measpred].keys()):
value += parallelism[measpred][time]
vcd['values'][time][pcode] = value
# Create output file
fh.write("$version Generated by verilator_gantt $end\n")
fh.write("$timescale 1ns $end\n")
fh.write("\n")
all_codes = {}
fh.write(" $scope module gantt $end\n")
for module in sorted(vcd['sigs'].keys()):
fh.write(" $scope module %s $end\n" % module)
for sig in sorted(vcd['sigs'][module].keys()):
code = vcd['sigs'][module][sig]
fh.write(" $var wire 32 v%x %s [31:0] $end\n" % (code, sig))
all_codes[code] = 1
fh.write(" $upscope $end\n")
fh.write(" $upscope $end\n")
fh.write("$enddefinitions $end\n")
fh.write("\n")
first = True
for time in sorted(vcd['values']):
if first:
first = False
# Start with Z for any signals without time zero data
for code in sorted(all_codes.keys()):
if code not in vcd['values'][time]:
vcd['values'][time][code] = None
fh.write("#%d\n" % time)
for code in sorted(vcd['values'][time].keys()):
value = vcd['values'][time][code]
if value is None:
fh.write("bz v%x\n" % code)
else:
fh.write("b%s v%x\n" % (format(value, 'b'), code))
######################################################################
parser = argparse.ArgumentParser(
allow_abbrev=False,
formatter_class=argparse.RawDescriptionHelpFormatter,
description="""Create Gantt chart of multi-threaded execution""",
epilog=
"""Verilator_gantt creates a visual representation to help analyze Verilator
#xmultithreaded simulation performance, by showing when each macro-task
#xstarts and ends, and showing when each thread is busy or idle.
For documentation see
https://verilator.org/guide/latest/exe_verilator_gantt.html
Copyright 2018-2021 by Wilson Snyder. This program is free software; you
can redistribute it and/or modify it under the terms of either the GNU
Lesser General Public License Version 3 or the Perl Artistic License
Version 2.0.
SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0""")
parser.add_argument('--debug', action='store_true', help='enable debug')
parser.add_argument('--scale',
help='number of time units per character in graph',
type=int,
default=0)
parser.add_argument('--no-vcd',
help='disable creating vcd',
action='store_true')
parser.add_argument('--vcd',
help='filename for vcd outpue',
default='profile_threads.vcd')
parser.add_argument('filename',
help='input profile_threads.dat filename to process',
default='profile_threads.dat')
Args = parser.parse_args()
process(Args.filename)
if not Args.no_vcd:
write_vcd(Args.vcd)
######################################################################
# Local Variables:
# compile-command: "./verilator_gantt ../test_regress/t/t_gantt_io.dat"
# End: