verilator/bin/verilator_gantt

557 lines
22 KiB
Python
Executable File

#!/usr/bin/env python3
# pylint: disable=C0103,C0114,C0116,C0209,C0301,R0914,R0912,R0915,W0511,W0603,eval-used
######################################################################
import argparse
import bisect
import collections
import math
import re
import statistics
# from pprint import pprint
Sections = []
LongestVcdStrValueLength = 0
Threads = collections.defaultdict(lambda: []) # List of records per thread id
Mtasks = collections.defaultdict(lambda: {'elapsed': 0, 'end': 0})
Cpus = collections.defaultdict(lambda: {'mtask_time': 0})
Global = {'args': {}, 'cpuinfo': collections.defaultdict(lambda: {}), 'stats': {}}
ElapsedTime = None # total elapsed time
ExecGraphTime = 0 # total elapsed time excuting an exec graph
ExecGraphIntervals = [] # list of (start, end) pairs
######################################################################
def read_data(filename):
with open(filename, "r", encoding="utf8") as fh:
re_thread = re.compile(r'^VLPROFTHREAD (\d+)$')
re_record = re.compile(r'^VLPROFEXEC (\S+) (\d+)(.*)$')
re_payload_mtaskBegin = re.compile(r'id (\d+) predictStart (\d+) cpu (\d+)')
re_payload_mtaskEnd = re.compile(r'id (\d+) predictCost (\d+)')
re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.]*)\s*')
re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.]*)\s*$')
re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
re_proc_cpu = re.compile(r'VLPROFPROC processor\s*:\s*(\d+)\s*$')
re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$')
cpu = None
thread = 0
execGraphStart = None
global LongestVcdStrValueLength
global ExecGraphTime
SectionStack = []
mTaskThread = {}
for line in fh:
recordMatch = re_record.match(line)
if recordMatch:
kind, tick, payload = recordMatch.groups()
tick = int(tick)
payload = payload.strip()
if kind == "SECTION_PUSH":
LongestVcdStrValueLength = max(LongestVcdStrValueLength, len(payload))
SectionStack.append(payload)
Sections[thread].append((tick, tuple(SectionStack)))
elif kind == "SECTION_POP":
assert SectionStack, "SECTION_POP without SECTION_PUSH"
SectionStack.pop()
Sections[thread].append((tick, tuple(SectionStack)))
elif kind == "MTASK_BEGIN":
mtask, predict_start, ecpu = re_payload_mtaskBegin.match(payload).groups()
mtask = int(mtask)
predict_start = int(predict_start)
ecpu = int(ecpu)
mTaskThread[mtask] = thread
records = Threads[thread]
assert not records or records[-1]['start'] <= records[-1]['end'] <= tick
records.append({
'start': tick,
'mtask': mtask,
'predict_start': predict_start,
'cpu': ecpu
})
Mtasks[mtask]['begin'] = tick
Mtasks[mtask]['thread'] = thread
Mtasks[mtask]['predict_start'] = predict_start
elif kind == "MTASK_END":
mtask, predict_cost = re_payload_mtaskEnd.match(payload).groups()
mtask = int(mtask)
predict_cost = int(predict_cost)
begin = Mtasks[mtask]['begin']
record = Threads[mTaskThread[mtask]][-1]
record['end'] = tick
record['predict_cost'] = predict_cost
Mtasks[mtask]['elapsed'] += tick - begin
Mtasks[mtask]['predict_cost'] = predict_cost
Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], tick)
elif kind == "EXEC_GRAPH_BEGIN":
execGraphStart = tick
elif kind == "EXEC_GRAPH_END":
ExecGraphTime += tick - execGraphStart
ExecGraphIntervals.append((execGraphStart, tick))
execGraphStart = None
elif Args.debug:
print("-Unknown execution trace record: %s" % line)
elif re_thread.match(line):
thread = int(re_thread.match(line).group(1))
Sections.append([])
elif re.match(r'^VLPROF(THREAD|VERSION)', line):
pass
elif re_arg1.match(line):
match = re_arg1.match(line)
Global['args'][match.group(1)] = match.group(2)
elif re_arg2.match(line):
match = re_arg2.match(line)
Global['args'][match.group(1)] = match.group(2)
elif re_stat.match(line):
match = re_stat.match(line)
Global['stats'][match.group(1)] = match.group(2)
elif re_proc_cpu.match(line):
match = re_proc_cpu.match(line)
cpu = int(match.group(1))
elif cpu is not None and re_proc_dat.match(line):
match = re_proc_dat.match(line)
term = match.group(1)
value = match.group(2)
term = re.sub(r'\s+$', '', term)
term = re.sub(r'\s+', '_', term)
value = re.sub(r'\s+$', '', value)
Global['cpuinfo'][cpu][term] = value
elif re.match(r'^#', line):
pass
elif Args.debug:
print("-Unk: %s" % line)
def re_match_result(regexp, line, result_to):
result_to = re.match(regexp, line)
return result_to
######################################################################
def report():
print("Verilator Gantt report")
print("\nArgument settings:")
for arg in sorted(Global['args'].keys()):
plus = "+" if re.match(r'^\+', arg) else " "
print(" %s%s%s" % (arg, plus, Global['args'][arg]))
for records in Threads.values():
for record in records:
cpu = record['cpu']
elapsed = record['end'] - record['start']
Cpus[cpu]['mtask_time'] += elapsed
global ElapsedTime
ElapsedTime = int(Global['stats']['ticks'])
nthreads = int(Global['stats']['threads'])
ncpus = max(len(Cpus), 1)
print("\nSummary:")
print(" Total elapsed time = {} rdtsc ticks".format(ElapsedTime))
print(" Parallelized code = {:.2%} of elapsed time".format(ExecGraphTime / ElapsedTime))
print(" Total threads = %d" % nthreads)
print(" Total CPUs used = %d" % ncpus)
print(" Total mtasks = %d" % len(Mtasks))
print(" Total yields = %d" % int(Global['stats'].get('yields', 0)))
report_mtasks()
report_cpus()
report_sections()
if nthreads > ncpus:
print()
print("%%Warning: There were fewer CPUs (%d) than threads (%d)." % (ncpus, nthreads))
print(" : See docs on use of numactl.")
else:
if 'cpu_socket_cores_warning' in Global:
print()
print("%Warning: Multiple threads scheduled on same hyperthreaded core.")
print(" : See docs on use of numactl.")
if 'cpu_sockets_warning' in Global:
print()
print("%Warning: Threads scheduled on multiple sockets.")
print(" : See docs on use of numactl.")
print()
def report_mtasks():
if not Mtasks:
return
nthreads = int(Global['stats']['threads'])
# If we know cycle time in the same (rdtsc) units,
# this will give us an actual utilization number,
# (how effectively we keep the cores busy.)
#
# It also gives us a number we can compare against
# serial mode, to estimate the overhead of data sharing,
# which will show up in the total elapsed time. (Overhead
# of synchronization and scheduling should not.)
total_mtask_time = 0
thread_mtask_time = collections.defaultdict(lambda: 0)
long_mtask_time = 0
long_mtask = None
predict_mtask_time = 0
predict_elapsed = 0
for mtaskId in Mtasks:
record = Mtasks[mtaskId]
predict_mtask_time += record['predict_cost']
total_mtask_time += record['elapsed']
thread_mtask_time[record['thread']] += record['elapsed']
predict_end = record['predict_start'] + record['predict_cost']
predict_elapsed = max(predict_elapsed, predict_end)
if record['elapsed'] > long_mtask_time:
long_mtask_time = record['elapsed']
long_mtask = mtaskId
Global['predict_last_end'] = predict_elapsed
serialTime = ElapsedTime - ExecGraphTime
def subReport(elapsed, work):
print(" Thread utilization = {:7.2%}".format(work / (elapsed * nthreads)))
print(" Speedup = {:6.3}x".format(work / elapsed))
print("\nParallelized code, measured:")
subReport(ExecGraphTime, total_mtask_time)
print("\nParallelized code, predicted during static scheduling:")
subReport(predict_elapsed, predict_mtask_time)
print("\nAll code, measured:")
subReport(ElapsedTime, serialTime + total_mtask_time)
print("\nAll code, measured, scaled by predicted speedup:")
expectedParallelSpeedup = predict_mtask_time / predict_elapsed
scaledElapsed = serialTime + total_mtask_time / expectedParallelSpeedup
subReport(scaledElapsed, serialTime + total_mtask_time)
p2e_ratios = []
min_p2e = 1000000
min_mtask = None
max_p2e = -1000000
max_mtask = None
for mtask in sorted(Mtasks.keys()):
if Mtasks[mtask]['elapsed'] > 0:
if Mtasks[mtask]['predict_cost'] == 0:
Mtasks[mtask]['predict_cost'] = 1 # don't log(0) below
p2e_ratio = math.log(Mtasks[mtask]['predict_cost'] / Mtasks[mtask]['elapsed'])
p2e_ratios.append(p2e_ratio)
if p2e_ratio > max_p2e:
max_p2e = p2e_ratio
max_mtask = mtask
if p2e_ratio < min_p2e:
min_p2e = p2e_ratio
min_mtask = mtask
print("\nMTask statistics:")
print(" Longest mtask id = {}".format(long_mtask))
print(" Longest mtask time = {:.2%} of time elapsed in parallelized code".format(
long_mtask_time / ExecGraphTime))
print(" min log(p2e) = %0.3f" % min_p2e, end="")
print(" from mtask %d (predict %d," % (min_mtask, Mtasks[min_mtask]['predict_cost']), end="")
print(" elapsed %d)" % Mtasks[min_mtask]['elapsed'])
print(" max log(p2e) = %0.3f" % max_p2e, end="")
print(" from mtask %d (predict %d," % (max_mtask, Mtasks[max_mtask]['predict_cost']), end="")
print(" elapsed %d)" % Mtasks[max_mtask]['elapsed'])
stddev = statistics.pstdev(p2e_ratios)
mean = statistics.mean(p2e_ratios)
print(" mean = %0.3f" % mean)
print(" stddev = %0.3f" % stddev)
print(" e ^ stddev = %0.3f" % math.exp(stddev))
def report_cpus():
print("\nCPU info:")
Global['cpu_sockets'] = collections.defaultdict(lambda: 0)
Global['cpu_socket_cores'] = collections.defaultdict(lambda: 0)
print(" Id | Time spent executing MTask | Socket | Core | Model")
print(" | % of elapsed ticks / ticks | | |")
print(" ====|============================|========|======|======")
for cpu in sorted(Cpus):
socket = ""
core = ""
model = ""
if cpu in Global['cpuinfo']:
cpuinfo = Global['cpuinfo'][cpu]
if 'physical_id' in cpuinfo and 'core_id' in cpuinfo:
socket = cpuinfo['physical_id']
Global['cpu_sockets'][socket] += 1
core = cpuinfo['core_id']
Global['cpu_socket_cores'][socket + "__" + core] += 1
if 'model_name' in cpuinfo:
model = cpuinfo['model_name']
print(" {:3d} | {:7.2%} / {:16d} | {:>6s} | {:>4s} | {}".format(
cpu, Cpus[cpu]['mtask_time'] / ElapsedTime, Cpus[cpu]['mtask_time'], socket, core,
model))
if len(Global['cpu_sockets']) > 1:
Global['cpu_sockets_warning'] = True
for scn in Global['cpu_socket_cores'].values():
if scn > 1:
Global['cpu_socket_cores_warning'] = True
def report_sections():
for thread, section in enumerate(Sections):
if section:
print(f"\nSection profile for thread {thread}:")
report_section(section)
def report_section(section):
totalTime = collections.defaultdict(lambda: 0)
selfTime = collections.defaultdict(lambda: 0)
sectionTree = [0, {}, 1] # [selfTime, childTrees, numberOfTimesEntered]
prevTime = 0
prevStack = ()
for time, stack in section:
if len(stack) > len(prevStack):
scope = sectionTree
for item in stack:
scope = scope[1].setdefault(item, [0, {}, 0])
scope[2] += 1
dt = time - prevTime
assert dt >= 0
scope = sectionTree
for item in prevStack:
scope = scope[1].setdefault(item, [0, {}, 0])
scope[0] += dt
if prevStack:
for name in prevStack:
totalTime[name] += dt
selfTime[prevStack[-1]] += dt
prevTime = time
prevStack = stack
def treeSum(tree):
n = tree[0]
for subTree in tree[1].values():
n += treeSum(subTree)
return n
# Make sure the tree sums to the elapsed time
sectionTree[0] += ElapsedTime - treeSum(sectionTree)
def printTree(prefix, name, entries, tree):
print(" {:7.2%} | {:7.2%} | {:8} | {:10.2f} | {}".format(
treeSum(tree) / ElapsedTime, tree[0] / ElapsedTime, tree[2], tree[2] / entries,
prefix + name))
for k in sorted(tree[1], key=lambda _: -treeSum(tree[1][_])):
printTree(prefix + " ", k, tree[2], tree[1][k])
print(" Total | Self | Total | Relative | Section")
print(" time | time | entries | entries | name ")
print("==========|=========|==========|============|========")
printTree("", "*TOTAL*", 1, sectionTree)
######################################################################
def write_vcd(filename):
print("Writing %s" % filename)
with open(filename, "w", encoding="utf8") as fh:
# dict of dicts of hierarchy elements/signal name -> (code, width)
scopeSigs = {}
# map from time -> code -> value
values = collections.defaultdict(lambda: {})
parallelism = {
'measured': collections.defaultdict(lambda: 0),
'predicted': collections.defaultdict(lambda: 0)
}
parallelism['measured'][0] = 0
parallelism['predicted'][0] = 0
nextFreeCode = 0
def getCode(width, *names):
nonlocal nextFreeCode
scope = scopeSigs
*path, name = names
for item in path:
scope = scope.setdefault(item, {})
code, oldWidth = scope.setdefault(name, (nextFreeCode, width))
assert oldWidth == width
if code == nextFreeCode:
nextFreeCode += 1
return code
def addValue(code, time, val):
if isinstance(val, str):
buf = ""
for c in val:
buf += bin(ord(c))[2:].rjust(8, "0")
val = buf.ljust(LongestVcdStrValueLength * 8, "0")
values[time][code] = val
# Measured graph
for thread in sorted(Threads):
mcode = getCode(32, 'measured', 't%d_mtask' % thread)
for record in Threads[thread]:
start = record['start']
mtask = record['mtask']
end = record['end']
cpu = record['cpu']
addValue(mcode, start, mtask)
addValue(mcode, end, None)
parallelism['measured'][start] += 1
parallelism['measured'][end] -= 1
code = getCode(32, 'measured', 'cpu%d_thread' % cpu)
addValue(code, start, thread)
addValue(code, end, None)
code = getCode(32, 'measured', 'cpu%d_mtask' % cpu)
addValue(code, start, mtask)
addValue(code, end, None)
tStart = sorted(_['start'] for records in Threads.values() for _ in records)
tEnd = sorted(_['end'] for records in Threads.values() for _ in records)
# Predicted graph
for start, end in ExecGraphIntervals:
# Find the earliest MTask start after the start point, and the
# latest MTask end before the end point, so we can scale to the
# same range
start = tStart[bisect.bisect_left(tStart, start)]
end = tEnd[bisect.bisect_right(tEnd, end) - 1]
# Compute scale so predicted graph is of same width as interval
measured_scaling = (end - start) / Global['predict_last_end']
# Predict mtasks that fill the time the execution occupied
for mtask in Mtasks:
thread = Mtasks[mtask]['thread']
pred_scaled_start = start + int(Mtasks[mtask]['predict_start'] * measured_scaling)
pred_scaled_end = start + int(
(Mtasks[mtask]['predict_start'] + Mtasks[mtask]['predict_cost']) *
measured_scaling)
if pred_scaled_start == pred_scaled_end:
continue
mcode = getCode(32, 'predicted', 't%d_mtask' % thread)
addValue(mcode, pred_scaled_start, mtask)
addValue(mcode, pred_scaled_end, None)
parallelism['predicted'][pred_scaled_start] += 1
parallelism['predicted'][pred_scaled_end] -= 1
# Parallelism graph
for measpred in ('measured', 'predicted'):
pcode = getCode(32, 'stats', '%s_parallelism' % measpred)
value = 0
for time in sorted(parallelism[measpred].keys()):
value += parallelism[measpred][time]
addValue(pcode, time, value)
# Section graph
for thread, section in enumerate(Sections):
if section:
scode = getCode(LongestVcdStrValueLength * 8, "section", f"t{thread}_trace")
dcode = getCode(32, "section", f"t{thread}_depth")
for time, stack in section:
addValue(scode, time, stack[-1] if stack else None)
addValue(dcode, time, len(stack))
# Create output file
fh.write("$version Generated by verilator_gantt $end\n")
fh.write("$timescale 1ns $end\n")
fh.write("\n")
all_codes = set()
def writeScope(scope):
assert isinstance(scope, dict)
for key in sorted(scope):
val = scope[key]
if isinstance(val, dict):
fh.write(" $scope module %s $end\n" % key)
writeScope(val)
fh.write(" $upscope $end\n")
else:
(code, width) = val
all_codes.add(code)
fh.write(" $var wire %d v%x %s [%d:0] $end\n" %
(width, code, key, width - 1))
fh.write(" $scope module gantt $end\n")
writeScope(scopeSigs)
fh.write(" $upscope $end\n")
fh.write("$enddefinitions $end\n")
fh.write("\n")
first = True
for time in sorted(values):
if first:
first = False
# Start with Z for any signals without time zero data
for code in sorted(all_codes):
if code not in values[time]:
addValue(code, time, None)
fh.write("#%d\n" % time)
for code in sorted(values[time]):
value = values[time][code]
if value is None:
fh.write("bz v%x\n" % code)
elif isinstance(value, str):
fh.write("b%s v%x\n" % (value, code))
else:
fh.write("b%s v%x\n" % (format(value, 'b'), code))
######################################################################
parser = argparse.ArgumentParser(
allow_abbrev=False,
formatter_class=argparse.RawDescriptionHelpFormatter,
description="""Create Gantt chart of multi-threaded execution.
Verilator_gantt creates a visual representation to help analyze Verilator
#xmultithreaded simulation performance, by showing when each macro-task
#xstarts and ends, and showing when each thread is busy or idle.
For documentation see
https://verilator.org/guide/latest/exe_verilator_gantt.html""",
epilog="""Copyright 2018-2024 by Wilson Snyder. This program is free software; you
can redistribute it and/or modify it under the terms of either the GNU
Lesser General Public License Version 3 or the Perl Artistic License
Version 2.0.
SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0""")
parser.add_argument('--debug', action='store_true', help='enable debug')
parser.add_argument('--no-vcd', help='disable creating vcd', action='store_true')
parser.add_argument('--vcd', help='filename for vcd outpue', default='profile_exec.vcd')
parser.add_argument('filename',
help='input profile_exec.dat filename to process',
default='profile_exec.dat')
Args = parser.parse_args()
read_data(Args.filename)
report()
if not Args.no_vcd:
write_vcd(Args.vcd)
######################################################################
# Local Variables:
# compile-command: "./verilator_gantt ../test_regress/t/t_gantt_io.dat"
# End: