forked from github/verilator
495 lines
18 KiB
Python
Executable File
495 lines
18 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# pylint: disable=C0103,C0114,C0116,C0301,R0914,R0912,R0915,W0511,eval-used
|
|
######################################################################
|
|
|
|
import argparse
|
|
import collections
|
|
import math
|
|
import re
|
|
import statistics
|
|
# from pprint import pprint
|
|
|
|
Threads = collections.defaultdict(lambda: {})
|
|
Mtasks = collections.defaultdict(lambda: {})
|
|
Global = {
|
|
'args': {},
|
|
'cpuinfo': collections.defaultdict(lambda: {}),
|
|
'rdtsc_cycle_time': 0,
|
|
'stats': {}
|
|
}
|
|
|
|
######################################################################
|
|
|
|
|
|
def process(filename):
|
|
read_data(filename)
|
|
report()
|
|
|
|
|
|
def read_data(filename):
|
|
with open(filename) as fh:
|
|
re_prof = re.compile(
|
|
r'^VLPROF mtask\s(\d+)\sstart\s(\d+)\send\s(\d+)\selapsed\s(\d+)\spredict_time\s(\d+)\scpu\s(\d+)\son thread (\d+)'
|
|
)
|
|
re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.])\s*')
|
|
re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.])\s*$')
|
|
re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
|
|
re_time = re.compile(r'rdtsc time = (\d+) ticks')
|
|
re_proc_cpu = re.compile(r'VLPROFPROC processor\s*:\s*(\d+)\s*$')
|
|
re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$')
|
|
cpu = None
|
|
|
|
for line in fh:
|
|
if re_prof.match(line):
|
|
match = re_prof.match(line)
|
|
mtask = int(match.group(1))
|
|
start = int(match.group(2))
|
|
end = int(match.group(3))
|
|
elapsed_time = int(match.group(4))
|
|
predict_time = int(match.group(5))
|
|
cpu = int(match.group(6))
|
|
thread = int(match.group(7))
|
|
if start not in Threads[thread]:
|
|
Threads[thread][start] = {}
|
|
Threads[thread][start]['mtask'] = mtask
|
|
Threads[thread][start]['end'] = end
|
|
Threads[thread][start]['cpu'] = cpu
|
|
|
|
if 'elapsed' not in Mtasks[mtask]:
|
|
Mtasks[mtask] = {'end': 0, 'elapsed': 0}
|
|
Mtasks[mtask]['elapsed'] += elapsed_time
|
|
Mtasks[mtask]['predict'] = predict_time
|
|
Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], end)
|
|
elif re.match(r'^VLPROFTHREAD', line):
|
|
None # pylint: disable=pointless-statement
|
|
elif re_arg1.match(line):
|
|
match = re_arg1.match(line)
|
|
Global['args'][match.group(1)] = match.group(2)
|
|
elif re_arg2.match(line):
|
|
match = re_arg2.match(line)
|
|
Global['args'][match.group(1)] = match.group(2)
|
|
elif re_stat.match(line):
|
|
match = re_stat.match(line)
|
|
Global['stats'][match.group(1)] = match.group(2)
|
|
elif re_proc_cpu.match(line):
|
|
match = re_proc_cpu.match(line)
|
|
cpu = int(match.group(1))
|
|
elif cpu and re_proc_dat.match(line):
|
|
match = re_proc_dat.match(line)
|
|
term = match.group(1)
|
|
value = match.group(2)
|
|
term = re.sub(r'\s+$', '', term)
|
|
term = re.sub(r'\s+', '_', term)
|
|
value = re.sub(r'\s+$', '', value)
|
|
Global['cpuinfo'][cpu][term] = value
|
|
elif re.match(r'^#', line):
|
|
None # pylint: disable=pointless-statement
|
|
elif Args.debug:
|
|
print("-Unk: %s" % line)
|
|
# TODO -- this is parsing text printed by a client.
|
|
# Really, verilator proper should generate this
|
|
# if it's useful...
|
|
if re_time.match(line):
|
|
Global['rdtsc_cycle_time'] = re_time.group(1)
|
|
|
|
|
|
def re_match_result(regexp, line, result_to):
|
|
result_to = re.match(regexp, line)
|
|
return result_to
|
|
|
|
|
|
######################################################################
|
|
|
|
|
|
def report():
|
|
print("Verilator Gantt report")
|
|
|
|
print("\nArgument settings:")
|
|
for arg in sorted(Global['args'].keys()):
|
|
plus = "+" if re.match(r'^\+', arg) else " "
|
|
print(" %s%s%s" % (arg, plus, Global['args'][arg]))
|
|
|
|
nthreads = len(Threads)
|
|
Global['cpus'] = {}
|
|
for thread in Threads:
|
|
# Make potentially multiple characters per column
|
|
for start in Threads[thread]:
|
|
cpu = Threads[thread][start]['cpu']
|
|
elapsed = Threads[thread][start]['end'] - start
|
|
if cpu not in Global['cpus']:
|
|
Global['cpus'][cpu] = {'cpu_time': 0}
|
|
Global['cpus'][cpu]['cpu_time'] += elapsed
|
|
|
|
mt_mtask_time = 0
|
|
long_mtask_time = 0
|
|
last_end = 0
|
|
for mtask in Mtasks:
|
|
mt_mtask_time += Mtasks[mtask]['elapsed']
|
|
last_end = max(last_end, Mtasks[mtask]['end'])
|
|
long_mtask_time = max(long_mtask_time, Mtasks[mtask]['elapsed'])
|
|
Global['last_end'] = last_end
|
|
|
|
report_graph()
|
|
|
|
# If we know cycle time in the same (rdtsc) units,
|
|
# this will give us an actual utilization number,
|
|
# (how effectively we keep the cores busy.)
|
|
#
|
|
# It also gives us a number we can compare against
|
|
# serial mode, to estimate the overhead of data sharing,
|
|
# which will show up in the total elapsed time. (Overhead
|
|
# of synchronization and scheduling should not.)
|
|
print("\nAnalysis:")
|
|
print(" Total threads = %d" % nthreads)
|
|
print(" Total mtasks = %d" % len(Mtasks))
|
|
ncpus = len(Global['cpus'])
|
|
print(" Total cpus used = %d" % ncpus)
|
|
print(" Total yields = %d" % int(Global['stats']['yields']))
|
|
print(" Total eval time = %d rdtsc ticks" % Global['last_end'])
|
|
print(" Longest mtask time = %d rdtsc ticks" % long_mtask_time)
|
|
print(" All-thread mtask time = %d rdtsc ticks" % mt_mtask_time)
|
|
long_efficiency = long_mtask_time / (Global.get('last_end', 1))
|
|
print(" Longest-thread efficiency = %0.1f%%" % (long_efficiency * 100.0))
|
|
mt_efficiency = mt_mtask_time / (Global.get('last_end', 1) * nthreads or 1)
|
|
print(" All-thread efficiency = %0.1f%%" % (mt_efficiency * 100.0))
|
|
print(" All-thread speedup = %0.1f" % (mt_efficiency * nthreads))
|
|
if Global['rdtsc_cycle_time'] > 0:
|
|
ut = mt_mtask_time / Global['rdtsc_cycle_time']
|
|
print("tot_mtask_cpu=" + mt_mtask_time + " cyc=" +
|
|
Global['rdtsc_cycle_time'] + " ut=" + ut)
|
|
|
|
p2e_ratios = []
|
|
min_p2e = 1000000
|
|
min_mtask = None
|
|
max_p2e = -1000000
|
|
max_mtask = None
|
|
|
|
for mtask in sorted(Mtasks.keys()):
|
|
if Mtasks[mtask]['elapsed'] > 0:
|
|
if Mtasks[mtask]['predict'] == 0:
|
|
Mtasks[mtask]['predict'] = 1 # don't log(0) below
|
|
p2e_ratio = math.log(Mtasks[mtask]['predict'] /
|
|
Mtasks[mtask]['elapsed'])
|
|
p2e_ratios.append(p2e_ratio)
|
|
|
|
if p2e_ratio > max_p2e:
|
|
max_p2e = p2e_ratio
|
|
max_mtask = mtask
|
|
if p2e_ratio < min_p2e:
|
|
min_p2e = p2e_ratio
|
|
min_mtask = mtask
|
|
|
|
print("\nStatistics:")
|
|
print(" min log(p2e) = %0.3f" % min_p2e, end="")
|
|
print(" from mtask %d (predict %d," %
|
|
(min_mtask, Mtasks[min_mtask]['predict']),
|
|
end="")
|
|
print(" elapsed %d)" % Mtasks[min_mtask]['elapsed'])
|
|
print(" max log(p2e) = %0.3f" % max_p2e, end="")
|
|
print(" from mtask %d (predict %d," %
|
|
(max_mtask, Mtasks[max_mtask]['predict']),
|
|
end="")
|
|
print(" elapsed %d)" % Mtasks[max_mtask]['elapsed'])
|
|
|
|
stddev = statistics.pstdev(p2e_ratios)
|
|
mean = statistics.mean(p2e_ratios)
|
|
print(" mean = %0.3f" % mean)
|
|
print(" stddev = %0.3f" % stddev)
|
|
print(" e ^ stddev = %0.3f" % math.exp(stddev))
|
|
|
|
report_cpus()
|
|
|
|
if nthreads > ncpus:
|
|
print()
|
|
print("%%Warning: There were fewer CPUs (%d) then threads (%d)." %
|
|
(ncpus, nthreads))
|
|
print(" : See docs on use of numactl.")
|
|
else:
|
|
if 'cpu_socket_cores_warning' in Global:
|
|
print()
|
|
print(
|
|
"%Warning: Multiple threads scheduled on same hyperthreaded core."
|
|
)
|
|
print(" : See docs on use of numactl.")
|
|
if 'cpu_sockets_warning' in Global:
|
|
print()
|
|
print("%Warning: Threads scheduled on multiple sockets.")
|
|
print(" : See docs on use of numactl.")
|
|
print()
|
|
|
|
|
|
def report_cpus():
|
|
print("\nCPUs:")
|
|
|
|
Global['cpu_sockets'] = collections.defaultdict(lambda: 0)
|
|
Global['cpu_socket_cores'] = collections.defaultdict(lambda: 0)
|
|
|
|
for cpu in sorted(Global['cpus'].keys()):
|
|
print(" cpu %d: " % cpu, end='')
|
|
print("cpu_time=%d" % Global['cpus'][cpu]['cpu_time'], end='')
|
|
|
|
socket = None
|
|
if cpu in Global['cpuinfo']:
|
|
socket = int(Global['cpuinfo'][cpu]['physical_id'])
|
|
Global['cpu_sockets'][socket] += 1
|
|
print(" socket=%d" % socket, end='')
|
|
|
|
core = int(Global['cpuinfo'][cpu]['core_id'])
|
|
Global['cpu_socket_cores'][str(socket) + "__" + str(core)] += 1
|
|
print(" core=%d" % core, end='')
|
|
|
|
model = Global['cpuinfo'][cpu]['model_name']
|
|
if model:
|
|
print(" %s" % model, end='')
|
|
print()
|
|
|
|
if len(Global['cpu_sockets']) > 1:
|
|
Global['cpu_sockets_warning'] = True
|
|
for scn in Global['cpu_socket_cores'].values():
|
|
if scn > 1:
|
|
Global['cpu_socket_cores_warning'] = True
|
|
|
|
|
|
def report_graph():
|
|
time_per = Args.scale
|
|
if time_per == 0:
|
|
time_per = Global['last_end'] / 40 # Start with 40 columns
|
|
while time_per > 10:
|
|
(graph, conflicts) = _make_graph(time_per)
|
|
if not conflicts:
|
|
break
|
|
time_per = int(time_per / 2)
|
|
# One more step so we can fit more labels
|
|
time_per = int(time_per / 2)
|
|
if time_per <= 0:
|
|
time_per = 1
|
|
|
|
(graph, conflicts) = _make_graph(time_per)
|
|
|
|
print("\nThread gantt graph:")
|
|
print(" Legend: One character width = %s rdtsc ticks" % time_per)
|
|
print(" Legend: '&' = multiple mtasks in this period (character width)")
|
|
|
|
scale = " <-%d rdtsc total" % Global['last_end']
|
|
for col in range(len(scale), int(0.99 + (Global['last_end'] / time_per))): # pylint: disable=unused-variable
|
|
scale += "-"
|
|
print(" " + scale + "->")
|
|
|
|
for thread in sorted(graph.keys()):
|
|
print(" t: ", end="")
|
|
_print_graph_line(graph[thread], '')
|
|
|
|
|
|
def _make_graph(time_per):
|
|
|
|
# [thread][column] = char or #
|
|
graph = collections.defaultdict(
|
|
lambda: collections.defaultdict(lambda: ''))
|
|
|
|
conflicts = 0
|
|
for thread in Threads:
|
|
# Make potentially multiple characters per column
|
|
multi_at_col = collections.defaultdict(lambda: '')
|
|
for start in sorted(Threads[thread].keys()):
|
|
end = Threads[thread][start]['end']
|
|
# mtask = Threads[thread][start]['mtask']
|
|
cpu = Threads[thread][start]['cpu']
|
|
|
|
startcol = _time_col(time_per, start)
|
|
endcol = _time_col(time_per, end)
|
|
|
|
label = "["
|
|
label += str(cpu) # Maybe make optional in future
|
|
width = endcol - startcol + 1
|
|
while len(label) < (width - 1): # -1 for ']'
|
|
label += "-"
|
|
label += "]"
|
|
multi_at_col[startcol] += label
|
|
|
|
# Expand line to one char per column
|
|
for col in multi_at_col:
|
|
chars = multi_at_col[col]
|
|
ok = True
|
|
for coladd in range(0, len(chars)):
|
|
if col + coladd in graph[thread]:
|
|
ok = False
|
|
break
|
|
if not ok:
|
|
if re.search(r'\[.*\[', chars): # Two begins or more
|
|
conflicts += 1
|
|
graph[thread][col] = "&"
|
|
else:
|
|
graph[thread][col] = "["
|
|
for coladd in range(1, len(chars)):
|
|
if col + coladd in graph[thread]:
|
|
break
|
|
graph[thread][col + coladd] = 'x'
|
|
else:
|
|
coladd = 0
|
|
for char in chars:
|
|
graph[thread][col + coladd] = char
|
|
coladd += 1
|
|
|
|
if Args.debug:
|
|
print("# Conflicts %d" % conflicts)
|
|
return (graph, conflicts)
|
|
|
|
|
|
def _print_graph_line(graph_thread, sep):
|
|
at = 0
|
|
for col in sorted(graph_thread.keys()):
|
|
while at < col:
|
|
print(' ', end="")
|
|
at += 1
|
|
c = graph_thread[col]
|
|
print(c + sep, end="")
|
|
at += len(c)
|
|
print()
|
|
|
|
|
|
def _time_col(time_per, time):
|
|
return int(time / time_per)
|
|
|
|
|
|
######################################################################
|
|
|
|
|
|
def write_vcd(filename):
|
|
print("Writing %s" % filename)
|
|
with open(filename, "w") as fh:
|
|
vcd = {
|
|
'values':
|
|
collections.defaultdict(lambda: {}), # {<time>}{<code>} = value
|
|
'sigs': {
|
|
'threads': {},
|
|
'cpus': {},
|
|
'mtasks': {},
|
|
'Stats': {}
|
|
} # {<module>}{<sig}} = code
|
|
}
|
|
code = 0
|
|
|
|
parallelism = collections.defaultdict(lambda: 0)
|
|
for thread in sorted(Threads.keys()):
|
|
sig = "thread%d_mtask" % thread
|
|
if sig not in vcd['sigs']['threads']:
|
|
vcd['sigs']['threads'][sig] = code
|
|
code += 1
|
|
mcode = vcd['sigs']['threads'][sig]
|
|
|
|
for start in sorted(Threads[thread]):
|
|
end = Threads[thread][start]['end']
|
|
mtask = Threads[thread][start]['mtask']
|
|
cpu = Threads[thread][start]['cpu']
|
|
vcd['values'][start][mcode] = mtask
|
|
vcd['values'][end][mcode] = None
|
|
parallelism[start] += 1
|
|
parallelism[end] -= 1
|
|
|
|
sig = "cpu%d_thread" % cpu
|
|
if sig not in vcd['sigs']['cpus']:
|
|
vcd['sigs']['cpus'][sig] = code
|
|
code += 1
|
|
ccode = vcd['sigs']['cpus'][sig]
|
|
vcd['values'][start][ccode] = thread
|
|
vcd['values'][end][ccode] = None
|
|
|
|
sig = "mtask%d_cpu" % mtask
|
|
if sig not in vcd['sigs']['mtasks']:
|
|
vcd['sigs']['mtasks'][sig] = code
|
|
code += 1
|
|
ccode = vcd['sigs']['mtasks'][sig]
|
|
vcd['values'][start][ccode] = cpu
|
|
vcd['values'][end][ccode] = None
|
|
|
|
# Parallelism graph
|
|
vcd['sigs']['Stats']["parallelism"] = code
|
|
pcode = code
|
|
code += 1
|
|
|
|
value = 0
|
|
for time in sorted(parallelism.keys()):
|
|
value += parallelism[time]
|
|
vcd['values'][time][pcode] = value
|
|
|
|
fh.write("$version Generated by verilator_gantt $end\n")
|
|
fh.write("$timescale 1ns $end\n")
|
|
fh.write("\n")
|
|
|
|
all_codes = {}
|
|
fh.write(" $scope module gantt $end\n")
|
|
for module in sorted(vcd['sigs'].keys()):
|
|
fh.write(" $scope module %s $end\n" % module)
|
|
for sig in sorted(vcd['sigs'][module].keys()):
|
|
code = vcd['sigs'][module][sig]
|
|
fh.write(" $var wire 32 v%x %s [31:0] $end\n" % (code, sig))
|
|
all_codes[code] = 1
|
|
fh.write(" $upscope $end\n")
|
|
fh.write(" $upscope $end\n")
|
|
fh.write("$enddefinitions $end\n")
|
|
fh.write("\n")
|
|
|
|
first = True
|
|
for time in sorted(vcd['values']):
|
|
if first:
|
|
first = False
|
|
# Start with Z for any signals without time zero data
|
|
for code in sorted(all_codes.keys()):
|
|
if code not in vcd['values'][time]:
|
|
vcd['values'][time][code] = None
|
|
fh.write("#%d\n" % time)
|
|
for code in sorted(vcd['values'][time].keys()):
|
|
value = vcd['values'][time][code]
|
|
if value is None:
|
|
fh.write("bz v%x\n" % code)
|
|
else:
|
|
fh.write("b%s v%x\n" % (format(value, 'b'), code))
|
|
|
|
|
|
######################################################################
|
|
|
|
parser = argparse.ArgumentParser(
|
|
allow_abbrev=False,
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
description="""Create Gantt chart of multi-threaded execution""",
|
|
epilog=
|
|
"""Verilator_gantt creates a visual representation to help analyze Verilator
|
|
#xmultithreaded simulation performance, by showing when each macro-task
|
|
#xstarts and ends, and showing when each thread is busy or idle.
|
|
|
|
For documentation see
|
|
https://verilator.org/guide/latest/exe_verilator_gantt.html
|
|
|
|
Copyright 2018-2021 by Wilson Snyder. This program is free software; you
|
|
can redistribute it and/or modify it under the terms of either the GNU
|
|
Lesser General Public License Version 3 or the Perl Artistic License
|
|
Version 2.0.
|
|
|
|
SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0""")
|
|
|
|
parser.add_argument('--debug', action='store_true', help='enable debug')
|
|
parser.add_argument('--scale',
|
|
help='number of time units per character in graph',
|
|
type=int,
|
|
default=0)
|
|
parser.add_argument('--no-vcd',
|
|
help='disable creating vcd',
|
|
action='store_true')
|
|
parser.add_argument('--vcd',
|
|
help='filename for vcd outpue',
|
|
default='profile_threads.vcd')
|
|
parser.add_argument('filename',
|
|
help='input profile_threads.dat filename to process',
|
|
default='profile_threads.dat')
|
|
|
|
Args = parser.parse_args()
|
|
|
|
process(Args.filename)
|
|
if not Args.no_vcd:
|
|
write_vcd(Args.vcd)
|
|
|
|
######################################################################
|
|
# Local Variables:
|
|
# compile-command: "./verilator_gantt ../test_regress/t/t_gantt_io.dat"
|
|
# End:
|