Improve --prof-exec infrastructure and report

Again, --prof-exec has bit-rotted a little with all the recent changes
to the structure of the generated code. This patch contains a few
improvements:
- Replace the eval/eval_loop begin/end events with generic
  section_push/section_pop events that can be arbitrarily sprinkled
  into the generated code (so long as they are correctly matched) to
  measure various sections. The report then contains a nested profile
  of the sections, and the VCD trace shows the section names (see the
  sketch after this list).
- Better handling of exec graphs
- Clearer overall statistics
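
For illustration, a minimal sketch (not part of the diff below) of how
matched pairs of these events nest in the emitted C++. The
VL_EXEC_TRACE_ADD_RECORD calls are the real runtime API used by this
patch; the "eval" and "loop act" section names follow the patterns this
patch emits, and the elided code in between is hypothetical:

    VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPush("eval");      // enter outer section
    // ... emitted model code ...
    VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPush("loop act");  // enter nested section
    // ... evaluation loop body ...
    VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPop();             // leave "loop act"
    // ... more model code ...
    VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPop();             // leave "eval"; every push needs a matching pop

Each push/pop pair becomes one node in the nested section profile, and
the name at the top of the section stack at any point in time is what
appears in the VCD trace.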
Geza Lore 2023-05-10 12:12:34 +01:00
parent 146cdc020d
commit 4c0edd2efb
17 changed files with 662 additions and 561 deletions


@ -1,33 +1,32 @@
#!/usr/bin/env python3
# pylint: disable=C0103,C0114,C0116,C0209,C0301,R0914,R0912,R0915,W0511,eval-used
# pylint: disable=C0103,C0114,C0116,C0209,C0301,R0914,R0912,R0915,W0511,W0603,eval-used
######################################################################
import argparse
import bisect
import collections
import math
import re
import statistics
# from pprint import pprint
Threads = collections.defaultdict(lambda: collections.defaultdict(lambda: {}))
Mtasks = collections.defaultdict(lambda: {})
Evals = collections.defaultdict(lambda: {})
EvalLoops = collections.defaultdict(lambda: {})
Sections = []
LongestVcdStrValueLength = 0
Threads = collections.defaultdict(lambda: []) # List of records per thread id
Mtasks = collections.defaultdict(lambda: {'elapsed': 0, 'end': 0})
Cpus = collections.defaultdict(lambda: {'mtask_time': 0})
Global = {
'args': {},
'cpuinfo': collections.defaultdict(lambda: {}),
'rdtsc_cycle_time': 0,
'stats': {}
}
ElapsedTime = None # total elapsed time
ExecGraphTime = 0  # total elapsed time executing an exec graph
ExecGraphIntervals = [] # list of (start, end) pairs
######################################################################
def process(filename):
read_data(filename)
report()
def read_data(filename):
with open(filename, "r", encoding="utf8") as fh:
re_thread = re.compile(r'^VLPROFTHREAD (\d+)$')
@ -39,14 +38,17 @@ def read_data(filename):
re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.]*)\s*')
re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.]*)\s*$')
re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
re_time = re.compile(r'rdtsc time = (\d+) ticks')
re_proc_cpu = re.compile(r'VLPROFPROC processor\s*:\s*(\d+)\s*$')
re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$')
cpu = None
thread = None
execGraphStart = None
lastEvalBeginTick = None
lastEvalLoopBeginTick = None
global LongestVcdStrValueLength
global ExecGraphTime
SectionStack = []
mTaskThread = {}
for line in fh:
recordMatch = re_record.match(line)
@ -54,29 +56,31 @@ def read_data(filename):
kind, tick, payload = recordMatch.groups()
tick = int(tick)
payload = payload.strip()
if kind == "EVAL_BEGIN":
Evals[tick]['start'] = tick
lastEvalBeginTick = tick
elif kind == "EVAL_END":
Evals[lastEvalBeginTick]['end'] = tick
lastEvalBeginTick = None
elif kind == "EVAL_LOOP_BEGIN":
EvalLoops[tick]['start'] = tick
lastEvalLoopBeginTick = tick
elif kind == "EVAL_LOOP_END":
EvalLoops[lastEvalLoopBeginTick]['end'] = tick
lastEvalLoopBeginTick = None
if kind == "SECTION_PUSH":
LongestVcdStrValueLength = max(LongestVcdStrValueLength,
len(payload))
SectionStack.append(payload)
Sections.append((tick, tuple(SectionStack)))
elif kind == "SECTION_POP":
assert SectionStack, "SECTION_POP without SECTION_PUSH"
SectionStack.pop()
Sections.append((tick, tuple(SectionStack)))
elif kind == "MTASK_BEGIN":
mtask, predict_start, ecpu = re_payload_mtaskBegin.match(
payload).groups()
mtask = int(mtask)
predict_start = int(predict_start)
ecpu = int(ecpu)
Threads[thread][tick]['mtask'] = mtask
Threads[thread][tick]['predict_start'] = predict_start
Threads[thread][tick]['cpu'] = ecpu
if 'elapsed' not in Mtasks[mtask]:
Mtasks[mtask] = {'end': 0, 'elapsed': 0}
mTaskThread[mtask] = thread
records = Threads[thread]
assert not records or records[-1]['start'] <= records[-1][
'end'] <= tick
records.append({
'start': tick,
'mtask': mtask,
'predict_start': predict_start,
'cpu': ecpu
})
Mtasks[mtask]['begin'] = tick
Mtasks[mtask]['thread'] = thread
Mtasks[mtask]['predict_start'] = predict_start
@ -86,11 +90,18 @@ def read_data(filename):
mtask = int(mtask)
predict_cost = int(predict_cost)
begin = Mtasks[mtask]['begin']
Threads[thread][begin]['end'] = tick
Threads[thread][begin]['predict_cost'] = predict_cost
record = Threads[mTaskThread[mtask]][-1]
record['end'] = tick
record['predict_cost'] = predict_cost
Mtasks[mtask]['elapsed'] += tick - begin
Mtasks[mtask]['predict_cost'] = predict_cost
Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], tick)
elif kind == "EXEC_GRAPH_BEGIN":
execGraphStart = tick
elif kind == "EXEC_GRAPH_END":
ExecGraphTime += tick - execGraphStart
ExecGraphIntervals.append((execGraphStart, tick))
execGraphStart = None
elif Args.debug:
print("-Unknown execution trace record: %s" % line)
elif re_thread.match(line):
@ -109,7 +120,7 @@ def read_data(filename):
elif re_proc_cpu.match(line):
match = re_proc_cpu.match(line)
cpu = int(match.group(1))
elif cpu and re_proc_dat.match(line):
elif cpu is not None and re_proc_dat.match(line):
match = re_proc_dat.match(line)
term = match.group(1)
value = match.group(2)
@ -121,11 +132,6 @@ def read_data(filename):
pass
elif Args.debug:
print("-Unk: %s" % line)
# TODO -- this is parsing text printed by a client.
# Really, verilator proper should generate this
# if it's useful...
if re_time.match(line):
Global['rdtsc_cycle_time'] = re_time.group(1)
def re_match_result(regexp, line, result_to):
@ -144,125 +150,33 @@ def report():
plus = "+" if re.match(r'^\+', arg) else " "
print(" %s%s%s" % (arg, plus, Global['args'][arg]))
for records in Threads.values():
for record in records:
cpu = record['cpu']
elapsed = record['end'] - record['start']
Cpus[cpu]['mtask_time'] += elapsed
global ElapsedTime
ElapsedTime = int(Global['stats']['ticks'])
nthreads = int(Global['stats']['threads'])
Global['cpus'] = {}
for thread in Threads:
# Make potentially multiple characters per column
for start in Threads[thread]:
if not Threads[thread][start]:
continue
cpu = Threads[thread][start]['cpu']
elapsed = Threads[thread][start]['end'] - start
if cpu not in Global['cpus']:
Global['cpus'][cpu] = {'cpu_time': 0}
Global['cpus'][cpu]['cpu_time'] += elapsed
ncpus = max(len(Cpus), 1)
measured_mt_mtask_time = 0
predict_mt_mtask_time = 0
long_mtask_time = 0
measured_last_end = 0
predict_last_end = 0
for mtask in Mtasks:
measured_mt_mtask_time += Mtasks[mtask]['elapsed']
predict_mt_mtask_time += Mtasks[mtask]['predict_cost']
measured_last_end = max(measured_last_end, Mtasks[mtask]['end'])
predict_last_end = max(
predict_last_end,
Mtasks[mtask]['predict_start'] + Mtasks[mtask]['predict_cost'])
long_mtask_time = max(long_mtask_time, Mtasks[mtask]['elapsed'])
Global['measured_last_end'] = measured_last_end
Global['predict_last_end'] = predict_last_end
# If we know cycle time in the same (rdtsc) units,
# this will give us an actual utilization number,
# (how effectively we keep the cores busy.)
#
# It also gives us a number we can compare against
# serial mode, to estimate the overhead of data sharing,
# which will show up in the total elapsed time. (Overhead
# of synchronization and scheduling should not.)
print("\nAnalysis:")
print(" Total threads = %d" % nthreads)
print(" Total mtasks = %d" % len(Mtasks))
ncpus = max(len(Global['cpus']), 1)
print(" Total cpus used = %d" % ncpus)
print(" Total yields = %d" %
int(Global['stats'].get('yields', 0)))
print(" Total evals = %d" % len(Evals))
print(" Total eval loops = %d" % len(EvalLoops))
if Mtasks:
print(" Total eval time = %d rdtsc ticks" %
Global['measured_last_end'])
print(" Longest mtask time = %d rdtsc ticks" % long_mtask_time)
print(" All-thread mtask time = %d rdtsc ticks" %
measured_mt_mtask_time)
long_efficiency = long_mtask_time / (Global.get(
'measured_last_end', 1) or 1)
print(" Longest-thread efficiency = %0.1f%%" %
(long_efficiency * 100.0))
mt_efficiency = measured_mt_mtask_time / (
Global.get('measured_last_end', 1) * nthreads or 1)
print(" All-thread efficiency = %0.1f%%" %
(mt_efficiency * 100.0))
print(" All-thread speedup = %0.1f" %
(mt_efficiency * nthreads))
if Global['rdtsc_cycle_time'] > 0:
ut = measured_mt_mtask_time / Global['rdtsc_cycle_time']
print("tot_mtask_cpu=" + measured_mt_mtask_time + " cyc=" +
Global['rdtsc_cycle_time'] + " ut=" + ut)
predict_mt_efficiency = predict_mt_mtask_time / (
Global.get('predict_last_end', 1) * nthreads or 1)
print("\nPrediction (what Verilator used for scheduling):")
print(" All-thread efficiency = %0.1f%%" %
(predict_mt_efficiency * 100.0))
print(" All-thread speedup = %0.1f" %
(predict_mt_efficiency * nthreads))
p2e_ratios = []
min_p2e = 1000000
min_mtask = None
max_p2e = -1000000
max_mtask = None
for mtask in sorted(Mtasks.keys()):
if Mtasks[mtask]['elapsed'] > 0:
if Mtasks[mtask]['predict_cost'] == 0:
Mtasks[mtask]['predict_cost'] = 1 # don't log(0) below
p2e_ratio = math.log(Mtasks[mtask]['predict_cost'] /
Mtasks[mtask]['elapsed'])
p2e_ratios.append(p2e_ratio)
if p2e_ratio > max_p2e:
max_p2e = p2e_ratio
max_mtask = mtask
if p2e_ratio < min_p2e:
min_p2e = p2e_ratio
min_mtask = mtask
print("\nMTask statistics:")
print(" min log(p2e) = %0.3f" % min_p2e, end="")
print(" from mtask %d (predict %d," %
(min_mtask, Mtasks[min_mtask]['predict_cost']),
end="")
print(" elapsed %d)" % Mtasks[min_mtask]['elapsed'])
print(" max log(p2e) = %0.3f" % max_p2e, end="")
print(" from mtask %d (predict %d," %
(max_mtask, Mtasks[max_mtask]['predict_cost']),
end="")
print(" elapsed %d)" % Mtasks[max_mtask]['elapsed'])
stddev = statistics.pstdev(p2e_ratios)
mean = statistics.mean(p2e_ratios)
print(" mean = %0.3f" % mean)
print(" stddev = %0.3f" % stddev)
print(" e ^ stddev = %0.3f" % math.exp(stddev))
print("\nSummary:")
print(" Total elapsed time = {} rdtsc ticks".format(ElapsedTime))
print(" Parallelized code = {:.2%} of elapsed time".format(
ExecGraphTime / ElapsedTime))
print(" Total threads = %d" % nthreads)
print(" Total CPUs used = %d" % ncpus)
print(" Total mtasks = %d" % len(Mtasks))
print(" Total yields = %d" % int(Global['stats'].get('yields', 0)))
report_mtasks()
report_cpus()
report_sections()
if nthreads > ncpus:
print()
print("%%Warning: There were fewer CPUs (%d) then threads (%d)." %
print("%%Warning: There were fewer CPUs (%d) than threads (%d)." %
(ncpus, nthreads))
print(" : See docs on use of numactl.")
else:
@ -279,32 +193,130 @@ def report():
print()
def report_mtasks():
if not Mtasks:
return
nthreads = int(Global['stats']['threads'])
# If we know cycle time in the same (rdtsc) units,
# this will give us an actual utilization number,
# (how effectively we keep the cores busy.)
#
# It also gives us a number we can compare against
# serial mode, to estimate the overhead of data sharing,
# which will show up in the total elapsed time. (Overhead
# of synchronization and scheduling should not.)
total_mtask_time = 0
thread_mtask_time = collections.defaultdict(lambda: 0)
long_mtask_time = 0
long_mtask = None
predict_mtask_time = 0
predict_elapsed = 0
for mtaskId in Mtasks:
record = Mtasks[mtaskId]
predict_mtask_time += record['predict_cost']
total_mtask_time += record['elapsed']
thread_mtask_time[record['thread']] += record['elapsed']
predict_end = record['predict_start'] + record['predict_cost']
predict_elapsed = max(predict_elapsed, predict_end)
if record['elapsed'] > long_mtask_time:
long_mtask_time = record['elapsed']
long_mtask = mtaskId
Global['predict_last_end'] = predict_elapsed
serialTime = ElapsedTime - ExecGraphTime
def subReport(elapsed, work):
print(" Thread utilization = {:7.2%}".format(work /
(elapsed * nthreads)))
print(" Speedup = {:6.3}x".format(work / elapsed))
print("\nParallelized code, measured:")
subReport(ExecGraphTime, total_mtask_time)
print("\nParallelized code, predicted during static scheduling:")
subReport(predict_elapsed, predict_mtask_time)
print("\nAll code, measured:")
subReport(ElapsedTime, serialTime + total_mtask_time)
print("\nAll code, measured, scaled by predicted speedup:")
expectedParallelSpeedup = predict_mtask_time / predict_elapsed
scaledElapsed = serialTime + total_mtask_time / expectedParallelSpeedup
subReport(scaledElapsed, serialTime + total_mtask_time)
p2e_ratios = []
min_p2e = 1000000
min_mtask = None
max_p2e = -1000000
max_mtask = None
for mtask in sorted(Mtasks.keys()):
if Mtasks[mtask]['elapsed'] > 0:
if Mtasks[mtask]['predict_cost'] == 0:
Mtasks[mtask]['predict_cost'] = 1 # don't log(0) below
p2e_ratio = math.log(Mtasks[mtask]['predict_cost'] /
Mtasks[mtask]['elapsed'])
p2e_ratios.append(p2e_ratio)
if p2e_ratio > max_p2e:
max_p2e = p2e_ratio
max_mtask = mtask
if p2e_ratio < min_p2e:
min_p2e = p2e_ratio
min_mtask = mtask
print("\nMTask statistics:")
print(" Longest mtask id = {}".format(long_mtask))
print(" Longest mtask time = {:.2%} of time elapsed in parallelized code".
format(long_mtask_time / ExecGraphTime))
print(" min log(p2e) = %0.3f" % min_p2e, end="")
print(" from mtask %d (predict %d," %
(min_mtask, Mtasks[min_mtask]['predict_cost']),
end="")
print(" elapsed %d)" % Mtasks[min_mtask]['elapsed'])
print(" max log(p2e) = %0.3f" % max_p2e, end="")
print(" from mtask %d (predict %d," %
(max_mtask, Mtasks[max_mtask]['predict_cost']),
end="")
print(" elapsed %d)" % Mtasks[max_mtask]['elapsed'])
stddev = statistics.pstdev(p2e_ratios)
mean = statistics.mean(p2e_ratios)
print(" mean = %0.3f" % mean)
print(" stddev = %0.3f" % stddev)
print(" e ^ stddev = %0.3f" % math.exp(stddev))
def report_cpus():
print("\nCPUs:")
print("\nCPU info:")
Global['cpu_sockets'] = collections.defaultdict(lambda: 0)
Global['cpu_socket_cores'] = collections.defaultdict(lambda: 0)
for cpu in sorted(Global['cpus'].keys()):
print(" cpu %d: " % cpu, end='')
print("cpu_time=%d" % Global['cpus'][cpu]['cpu_time'], end='')
socket = None
print(" Id | Time spent executing MTask | Socket | Core | Model")
print(" | % of elapsed ticks / ticks | | |")
print(" ====|============================|========|======|======")
for cpu in sorted(Cpus):
socket = ""
core = ""
model = ""
if cpu in Global['cpuinfo']:
cpuinfo = Global['cpuinfo'][cpu]
if 'physical_id' in cpuinfo and 'core_id' in cpuinfo:
socket = int(cpuinfo['physical_id'])
socket = cpuinfo['physical_id']
Global['cpu_sockets'][socket] += 1
print(" socket=%d" % socket, end='')
core = int(cpuinfo['core_id'])
Global['cpu_socket_cores'][str(socket) + "__" + str(core)] += 1
print(" core=%d" % core, end='')
core = cpuinfo['core_id']
Global['cpu_socket_cores'][socket + "__" + core] += 1
if 'model_name' in cpuinfo:
model = cpuinfo['model_name']
print(" %s" % model, end='')
print()
print(" {:3d} | {:7.2%} / {:16d} | {:>6s} | {:>4s} | {}".format(
cpu, Cpus[cpu]['mtask_time'] / ElapsedTime,
Cpus[cpu]['mtask_time'], socket, core, model))
if len(Global['cpu_sockets']) > 1:
Global['cpu_sockets_warning'] = True
@ -313,25 +325,68 @@ def report_cpus():
Global['cpu_socket_cores_warning'] = True
def report_sections():
if not Sections:
return
print("\nSection profile:")
totalTime = collections.defaultdict(lambda: 0)
selfTime = collections.defaultdict(lambda: 0)
sectionTree = [0, {}, 1] # [selfTime, childTrees, numberOfTimesEntered]
prevTime = 0
prevStack = ()
for time, stack in Sections:
if len(stack) > len(prevStack):
scope = sectionTree
for item in stack:
scope = scope[1].setdefault(item, [0, {}, 0])
scope[2] += 1
dt = time - prevTime
scope = sectionTree
for item in prevStack:
scope = scope[1].setdefault(item, [0, {}, 0])
scope[0] += dt
if prevStack:
for name in prevStack:
totalTime[name] += dt
selfTime[prevStack[-1]] += dt
prevTime = time
prevStack = stack
def treeSum(tree):
n = tree[0]
for subTree in tree[1].values():
n += treeSum(subTree)
return n
# Make sure the tree sums to the elapsed time
sectionTree[0] += ElapsedTime - treeSum(sectionTree)
def printTree(prefix, name, entries, tree):
print(" {:7.2%} | {:7.2%} | {:8} | {:10.2f} | {}".format(
treeSum(tree) / ElapsedTime, tree[0] / ElapsedTime, tree[2],
tree[2] / entries, prefix + name))
for k in sorted(tree[1], key=lambda _: -treeSum(tree[1][_])):
printTree(prefix + " ", k, tree[2], tree[1][k])
print(" Total | Self | Total | Relative | Section")
print(" time | time | entries | entries | name ")
print("==========|=========|==========|============|========")
printTree("", "*TOTAL*", 1, sectionTree)
######################################################################
def write_vcd(filename):
print("Writing %s" % filename)
with open(filename, "w", encoding="utf8") as fh:
vcd = {
'values':
collections.defaultdict(lambda: {}), # {<time>}{<code>} = value
'sigs': {
'predicted_threads': {},
'measured_threads': {},
'cpus': {},
'evals': {},
'mtasks': {},
'Stats': {}
} # {<module>}{<sig}} = code
}
code = 0
# dict of dicts of hierarchy elements/signal name -> (code, width)
scopeSigs = {}
# map from time -> code -> value
values = collections.defaultdict(lambda: {})
parallelism = {
'measured': collections.defaultdict(lambda: 0),
@ -340,132 +395,139 @@ def write_vcd(filename):
parallelism['measured'][0] = 0
parallelism['predicted'][0] = 0
# Measured graph
for thread in sorted(Threads.keys()):
sig = "thread%d_mtask" % thread
if sig not in vcd['sigs']['measured_threads']:
vcd['sigs']['measured_threads'][sig] = code
code += 1
mcode = vcd['sigs']['measured_threads'][sig]
nextFreeCode = 0
for start in sorted(Threads[thread]):
mtask = Threads[thread][start]['mtask']
end = Threads[thread][start]['end']
cpu = Threads[thread][start]['cpu']
vcd['values'][start][mcode] = mtask
vcd['values'][end][mcode] = None
def getCode(width, *names):
nonlocal nextFreeCode
scope = scopeSigs
*path, name = names
for item in path:
scope = scope.setdefault(item, {})
code, oldWidth = scope.setdefault(name, (nextFreeCode, width))
assert oldWidth == width
if code == nextFreeCode:
nextFreeCode += 1
return code
def addValue(code, time, val):
if isinstance(val, str):
buf = ""
for c in val:
buf += bin(ord(c))[2:].rjust(8, "0")
val = buf.ljust(LongestVcdStrValueLength * 8, "0")
values[time][code] = val
# Measured graph
for thread in sorted(Threads):
mcode = getCode(32, 'measured', 't%d_mtask' % thread)
for record in Threads[thread]:
start = record['start']
mtask = record['mtask']
end = record['end']
cpu = record['cpu']
addValue(mcode, start, mtask)
addValue(mcode, end, None)
parallelism['measured'][start] += 1
parallelism['measured'][end] -= 1
sig = "cpu%d_thread" % cpu
if sig not in vcd['sigs']['cpus']:
vcd['sigs']['cpus'][sig] = code
code += 1
ccode = vcd['sigs']['cpus'][sig]
vcd['values'][start][ccode] = thread
vcd['values'][end][ccode] = None
code = getCode(32, 'measured', 'cpu%d_thread' % cpu)
addValue(code, start, thread)
addValue(code, end, None)
sig = "mtask%d_cpu" % mtask
if sig not in vcd['sigs']['mtasks']:
vcd['sigs']['mtasks'][sig] = code
code += 1
ccode = vcd['sigs']['mtasks'][sig]
vcd['values'][start][ccode] = cpu
vcd['values'][end][ccode] = None
code = getCode(32, 'measured', 'cpu%d_mtask' % cpu)
addValue(code, start, mtask)
addValue(code, end, None)
# Eval graph
vcd['sigs']['evals']["eval"] = code
elcode = code
code += 1
n = 0
for eval_start in Evals:
eval_end = Evals[eval_start]['end']
n += 1
vcd['values'][eval_start][elcode] = n
vcd['values'][eval_end][elcode] = None
tStart = sorted(_['start'] for records in Threads.values()
for _ in records)
tEnd = sorted(_['end'] for records in Threads.values()
for _ in records)
# Eval_loop graph
vcd['sigs']['evals']["eval_loop"] = code
elcode = code
code += 1
n = 0
for eval_start in EvalLoops:
eval_end = EvalLoops[eval_start]['end']
n += 1
vcd['values'][eval_start][elcode] = n
vcd['values'][eval_end][elcode] = None
# Predicted graph
for start, end in ExecGraphIntervals:
# Find the earliest MTask start after the start point, and the
# latest MTask end before the end point, so we can scale to the
# same range
start = tStart[bisect.bisect_left(tStart, start)]
end = tEnd[bisect.bisect_right(tEnd, end) - 1]
# Compute scale so predicted graph is of same width as interval
measured_scaling = (end - start) / Global['predict_last_end']
# Predict mtasks that fill the time the execution occupied
for mtask in Mtasks:
thread = Mtasks[mtask]['thread']
pred_scaled_start = start + int(
Mtasks[mtask]['predict_start'] * measured_scaling)
pred_scaled_end = start + int(
(Mtasks[mtask]['predict_start'] +
Mtasks[mtask]['predict_cost']) * measured_scaling)
if pred_scaled_start == pred_scaled_end:
continue
if Mtasks:
# Predicted graph
for eval_start in EvalLoops:
eval_end = EvalLoops[eval_start]['end']
# Compute scale so predicted graph is of same width as eval
measured_scaling = (eval_end -
eval_start) / Global['predict_last_end']
# Predict mtasks that fill the time the eval occupied
for mtask in Mtasks:
thread = Mtasks[mtask]['thread']
pred_scaled_start = eval_start + int(
Mtasks[mtask]['predict_start'] * measured_scaling)
pred_scaled_end = eval_start + int(
(Mtasks[mtask]['predict_start'] +
Mtasks[mtask]['predict_cost']) * measured_scaling)
if pred_scaled_start == pred_scaled_end:
continue
mcode = getCode(32, 'predicted', 't%d_mtask' % thread)
addValue(mcode, pred_scaled_start, mtask)
addValue(mcode, pred_scaled_end, None)
sig = "predicted_thread%d_mtask" % thread
if sig not in vcd['sigs']['predicted_threads']:
vcd['sigs']['predicted_threads'][sig] = code
code += 1
mcode = vcd['sigs']['predicted_threads'][sig]
parallelism['predicted'][pred_scaled_start] += 1
parallelism['predicted'][pred_scaled_end] -= 1
vcd['values'][pred_scaled_start][mcode] = mtask
vcd['values'][pred_scaled_end][mcode] = None
# Parallelism graph
for measpred in ('measured', 'predicted'):
pcode = getCode(32, 'stats', '%s_parallelism' % measpred)
value = 0
for time in sorted(parallelism[measpred].keys()):
value += parallelism[measpred][time]
addValue(pcode, time, value)
parallelism['predicted'][pred_scaled_start] += 1
parallelism['predicted'][pred_scaled_end] -= 1
# Parallelism graph
for measpred in ('measured', 'predicted'):
vcd['sigs']['Stats']["%s_parallelism" % measpred] = code
pcode = code
code += 1
value = 0
for time in sorted(parallelism[measpred].keys()):
value += parallelism[measpred][time]
vcd['values'][time][pcode] = value
# Section graph
if Sections:
scode = getCode(LongestVcdStrValueLength * 8, "section", "trace")
dcode = getCode(32, "section", "depth")
for time, stack in Sections:
addValue(scode, time, stack[-1] if stack else None)
addValue(dcode, time, len(stack))
# Create output file
fh.write("$version Generated by verilator_gantt $end\n")
fh.write("$timescale 1ns $end\n")
fh.write("\n")
all_codes = {}
all_codes = set()
def writeScope(scope):
assert isinstance(scope, dict)
for key in sorted(scope):
val = scope[key]
if isinstance(val, dict):
fh.write(" $scope module %s $end\n" % key)
writeScope(val)
fh.write(" $upscope $end\n")
else:
(code, width) = val
all_codes.add(code)
fh.write(" $var wire %d v%x %s [%d:0] $end\n" %
(width, code, key, width - 1))
fh.write(" $scope module gantt $end\n")
for module in sorted(vcd['sigs'].keys()):
fh.write(" $scope module %s $end\n" % module)
for sig in sorted(vcd['sigs'][module].keys()):
code = vcd['sigs'][module][sig]
fh.write(" $var wire 32 v%x %s [31:0] $end\n" % (code, sig))
all_codes[code] = 1
fh.write(" $upscope $end\n")
writeScope(scopeSigs)
fh.write(" $upscope $end\n")
fh.write("$enddefinitions $end\n")
fh.write("\n")
first = True
for time in sorted(vcd['values']):
for time in sorted(values):
if first:
first = False
# Start with Z for any signals without time zero data
for code in sorted(all_codes.keys()):
if code not in vcd['values'][time]:
vcd['values'][time][code] = None
for code in sorted(all_codes):
if code not in values[time]:
addValue(code, time, None)
fh.write("#%d\n" % time)
for code in sorted(vcd['values'][time].keys()):
value = vcd['values'][time][code]
for code in sorted(values[time]):
value = values[time][code]
if value is None:
fh.write("bz v%x\n" % code)
elif isinstance(value, str):
fh.write("b%s v%x\n" % (value, code))
else:
fh.write("b%s v%x\n" % (format(value, 'b'), code))
@ -504,7 +566,8 @@ parser.add_argument('filename',
Args = parser.parse_args()
process(Args.filename)
read_data(Args.filename)
report()
if not Args.no_vcd:
write_vcd(Args.vcd)


@ -30,15 +30,13 @@ Gantt Chart VCD Signals
In waveforms, there are the following signals. In GTKWave, use "decimal"
data format to remove the leading zeros and make the traces easier to read.
evals
Increments each time when eval_step was measured to be active. This
allow visualization of how much time eval_step was active.
trace/section
Shows the name of the current top of the execution section stack.
Set GTKWave data format to "ASCII".
eval_loop
Increments each time when the evaluation loop within eval_step was
measured to be active. For best performance, there is only a single
evaluation loop within each eval_step call; that is, the eval_loop
waveform looks identical to the evals waveform.
trace/depth
Shows the depth of the execution section stack.
Set GTKWave data format to "Analog".
measured_parallelism
The number of mtasks active at this time, for best performance, this will


@ -161,12 +161,18 @@ void VlExecutionProfiler::dump(const char* filenamep, uint64_t tickEnd)
// TODO Perhaps merge with verilated_coverage output format, so can
// have a common merging and reporting tool, etc.
fprintf(fp, "VLPROFVERSION 2.0 # Verilator execution profile version 2.0\n");
fprintf(fp, "VLPROFVERSION 2.1 # Verilator execution profile version 2.1\n");
fprintf(fp, "VLPROF arg +verilator+prof+exec+start+%" PRIu64 "\n",
Verilated::threadContextp()->profExecStart());
fprintf(fp, "VLPROF arg +verilator+prof+exec+window+%u\n",
Verilated::threadContextp()->profExecWindow());
const unsigned threads = static_cast<unsigned>(m_traceps.size());
// Note that VerilatedContext will by default create as many threads as there are hardware
// processors, but not all of them might be utilized. Report the actual number that has trace
// entries to avoid over-counting.
unsigned threads = 0;
for (const auto& pair : m_traceps) {
if (!pair.second->empty()) ++threads;
}
fprintf(fp, "VLPROF stat threads %u\n", threads);
fprintf(fp, "VLPROF stat yields %" PRIu64 "\n", VlMTaskVertex::yields());
@ -183,6 +189,7 @@ void VlExecutionProfiler::dump(const char* filenamep, uint64_t tickEnd)
for (const auto& pair : m_traceps) {
const uint32_t threadId = pair.first;
ExecutionTrace* const tracep = pair.second;
if (tracep->empty()) continue;
fprintf(fp, "VLPROFTHREAD %" PRIu32 "\n", threadId);
for (const VlExecutionRecord& er : *tracep) {
@ -191,10 +198,9 @@ void VlExecutionProfiler::dump(const char* filenamep, uint64_t tickEnd)
fprintf(fp, "VLPROFEXEC %s %" PRIu64, name, time);
switch (er.m_type) {
case VlExecutionRecord::Type::EVAL_BEGIN:
case VlExecutionRecord::Type::EVAL_END:
case VlExecutionRecord::Type::EVAL_LOOP_BEGIN:
case VlExecutionRecord::Type::EVAL_LOOP_END:
case VlExecutionRecord::Type::SECTION_POP:
case VlExecutionRecord::Type::EXEC_GRAPH_BEGIN:
case VlExecutionRecord::Type::EXEC_GRAPH_END:
// No payload
fprintf(fp, "\n");
break;
@ -209,6 +215,11 @@ void VlExecutionProfiler::dump(const char* filenamep, uint64_t tickEnd)
fprintf(fp, " id %u predictCost %u\n", payload.m_id, payload.m_predictCost);
break;
}
case VlExecutionRecord::Type::SECTION_PUSH: {
const auto& payload = er.m_payload.sectionPush;
fprintf(fp, " %s\n", payload.m_name);
break;
}
default: abort(); // LCOV_EXCL_LINE
}
}


@ -58,12 +58,12 @@ VL_ATTR_ALWINLINE QData VL_CPU_TICK() {
// clang-format off
#define FOREACH_VlExecutionRecord_TYPE(macro) \
_VL_FOREACH_APPLY(macro, EVAL_BEGIN) \
_VL_FOREACH_APPLY(macro, EVAL_END) \
_VL_FOREACH_APPLY(macro, EVAL_LOOP_BEGIN) \
_VL_FOREACH_APPLY(macro, EVAL_LOOP_END) \
_VL_FOREACH_APPLY(macro, SECTION_PUSH) \
_VL_FOREACH_APPLY(macro, SECTION_POP) \
_VL_FOREACH_APPLY(macro, MTASK_BEGIN) \
_VL_FOREACH_APPLY(macro, MTASK_END)
_VL_FOREACH_APPLY(macro, MTASK_END) \
_VL_FOREACH_APPLY(macro, EXEC_GRAPH_BEGIN) \
_VL_FOREACH_APPLY(macro, EXEC_GRAPH_END)
// clang-format on
class VlExecutionRecord final {
@ -83,6 +83,9 @@ class VlExecutionRecord final {
};
union Payload {
struct {
const char* m_name; // Name of section being entered
} sectionPush;
struct {
uint32_t m_id; // MTask id
uint32_t m_predictStart; // Time scheduler predicted would start
@ -109,10 +112,11 @@ public:
VlExecutionRecord() = default;
// METHODS
void evalBegin() { m_type = Type::EVAL_BEGIN; }
void evalEnd() { m_type = Type::EVAL_END; }
void evalLoopBegin() { m_type = Type::EVAL_LOOP_BEGIN; }
void evalLoopEnd() { m_type = Type::EVAL_LOOP_END; }
void sectionPush(const char* name) {
m_payload.sectionPush.m_name = name;
m_type = Type::SECTION_PUSH;
}
void sectionPop() { m_type = Type::SECTION_POP; }
void mtaskBegin(uint32_t id, uint32_t predictStart) {
m_payload.mtaskBegin.m_id = id;
m_payload.mtaskBegin.m_predictStart = predictStart;
@ -124,6 +128,8 @@ public:
m_payload.mtaskEnd.m_predictCost = predictCost;
m_type = Type::MTASK_END;
}
void execGraphBegin() { m_type = Type::EXEC_GRAPH_BEGIN; }
void execGraphEnd() { m_type = Type::EXEC_GRAPH_END; }
};
static_assert(std::is_trivially_destructible<VlExecutionRecord>::value,


@ -403,7 +403,7 @@ class EmitCModel final : public EmitCFunc {
if (v3Global.opt.profExec()) {
puts("vlSymsp->__Vm_executionProfilerp->configure();\n");
puts("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalBegin();\n");
puts("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPush(\"eval\");\n");
}
puts("VL_DEBUG_IF(VL_DBG_MSGF(\"+ Eval\\n\"););\n");
@ -412,7 +412,7 @@ class EmitCModel final : public EmitCFunc {
putsDecoration("// Evaluate cleanup\n");
puts("Verilated::endOfEval(vlSymsp->__Vm_evalMsgQp);\n");
if (v3Global.opt.profExec()) puts("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalEnd();\n");
if (v3Global.opt.profExec()) puts("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPop();\n");
puts("}\n");
}


@ -1473,7 +1473,8 @@ AstCFunc* order(AstNetlist* netlistp, //
// Create the result function
AstScope* const scopeTopp = netlistp->topScopep()->scopep();
AstCFunc* const funcp = new AstCFunc{netlistp->fileline(), "_eval_" + tag, scopeTopp, ""};
FileLine* const flp = netlistp->fileline();
AstCFunc* const funcp = new AstCFunc{flp, "_eval_" + tag, scopeTopp, ""};
funcp->dontCombine(true);
funcp->isStatic(false);
funcp->isLoose(true);
@ -1482,9 +1483,18 @@ AstCFunc* order(AstNetlist* netlistp, //
funcp->declPrivate(true);
scopeTopp->addBlocksp(funcp);
if (v3Global.opt.profExec()) {
funcp->addStmtsp(new AstCStmt{flp, "VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPush(\"func "
+ tag + "\");\n"});
}
// Add ordered statements to the result function
for (AstNode* const nodep : nodeps) funcp->addStmtsp(nodep);
if (v3Global.opt.profExec()) {
funcp->addStmtsp(new AstCStmt{flp, "VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPop();\n"});
}
// Dispose of the remnants of the inputs
for (auto* const lbsp : logic) lbsp->deleteActives();


@ -3183,6 +3183,10 @@ static void addThreadStartToExecGraph(AstExecGraph* const execGraphp,
execGraphp->addStmtsp(new AstText{fl, text, /* tracking: */ true});
};
if (v3Global.opt.profExec()) {
addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).execGraphBegin();\n");
}
addStrStmt("vlSymsp->__Vm_even_cycle__" + tag + " = !vlSymsp->__Vm_even_cycle__" + tag
+ ";\n");
@ -3206,6 +3210,10 @@ static void addThreadStartToExecGraph(AstExecGraph* const execGraphp,
addStrStmt("vlSelf->__Vm_mtaskstate_final__" + tag
+ ".waitUntilUpstreamDone(vlSymsp->__Vm_even_cycle__" + tag + ");\n");
if (v3Global.opt.profExec()) {
addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).execGraphEnd();\n");
}
}
static void implementExecGraph(AstExecGraph* const execGraphp) {


@ -399,6 +399,17 @@ AstSenTree* createTriggerSenTree(AstNetlist* netlistp, AstVarScope* const vscp,
return resultp;
}
//============================================================================
// Utility for creating profiling statements
AstNodeStmt* profExecSectionPush(FileLine* flp, const string& name) {
return new AstCStmt{flp, "VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPush(\"" + name + "\");\n"};
};
AstNodeStmt* profExecSectionPop(FileLine* flp) {
return new AstCStmt{flp, "VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPop();\n"};
};
//============================================================================
// Utility for extra trigger allocation
@ -442,6 +453,7 @@ const TriggerKit createTriggers(AstNetlist* netlistp, AstCFunc* const initFuncp,
// Create the trigger computation function
AstCFunc* const funcp = makeSubFunction(netlistp, "_eval_triggers__" + name, slow);
if (v3Global.opt.profExec()) funcp->addStmtsp(profExecSectionPush(flp, "trig " + name));
// Create the trigger dump function (for debugging, always 'slow')
AstCFunc* const dumpp = makeSubFunction(netlistp, "_dump_triggers__" + name, true);
@ -581,6 +593,8 @@ const TriggerKit createTriggers(AstNetlist* netlistp, AstCFunc* const initFuncp,
add("#endif\n");
}
if (v3Global.opt.profExec()) funcp->addStmtsp(profExecSectionPop(flp));
// The debug code might leak signal names, so simply delete it when using --protect-ids
if (v3Global.opt.protectIds()) dumpp->stmtsp()->unlinkFrBackWithNext()->deleteTree();
@ -636,7 +650,9 @@ EvalLoop makeEvalLoop(AstNetlist* netlistp, const string& tag, const string& nam
AstVarScope* const continuep = scopeTopp->createTemp("__V" + tag + "Continue", 1);
continuep->varp()->noReset(true);
AstNodeStmt* nodep = setVar(counterp, 0);
AstNodeStmt* nodep = nullptr;
if (v3Global.opt.profExec()) nodep = profExecSectionPush(flp, "loop " + tag);
nodep = AstNode::addNext(nodep, setVar(counterp, 0));
nodep->addNext(buildLoop(netlistp, continuep, [&](AstWhile* loopp) {
// Compute triggers
loopp->addStmtsp(computeTriggers());
@ -689,6 +705,8 @@ EvalLoop makeEvalLoop(AstNetlist* netlistp, const string& tag, const string& nam
}
}));
if (v3Global.opt.profExec()) nodep->addNext(profExecSectionPop(flp));
return {counterp, continuep, nodep};
}


@ -49,7 +49,7 @@ if ($Self->{vltmt}) {
file_grep("$Self->{obj_dir}/gantt.log", qr/Total threads += 1/i);
file_grep("$Self->{obj_dir}/gantt.log", qr/Total mtasks += 0/i);
}
file_grep("$Self->{obj_dir}/gantt.log", qr/Total evals += 2/i);
file_grep("$Self->{obj_dir}/gantt.log", qr/\|\s+2\s+\|\s+2\.0+\s+\|\s+eval/i);
# Diff to itself, just to check parsing
vcd_identical("$Self->{obj_dir}/profile_exec.vcd", "$Self->{obj_dir}/profile_exec.vcd");


@ -900,22 +900,18 @@ VLPROFPROC address sizes : 43 bits physical, 48 bits virtual
VLPROFPROC power management: ts ttp tm hwpstate cpb eff_freq_ro [13] [14]
VLPROFPROC
VLPROFTHREAD 0
VLPROFEXEC EVAL_BEGIN 595
VLPROFEXEC EVAL_LOOP_BEGIN 945
VLPROFEXEC EXEC_GRAPH_BEGIN 945
VLPROFEXEC MTASK_BEGIN 2695 id 6 predictStart 0 cpu 19
VLPROFEXEC MTASK_END 2905 id 6 predictCost 30
VLPROFEXEC MTASK_BEGIN 9695 id 10 predictStart 196 cpu 19
VLPROFEXEC MTASK_END 9870 id 10 predictCost 30
VLPROFEXEC EVAL_LOOP_END 12180
VLPROFEXEC EVAL_END 12250
VLPROFEXEC EVAL_BEGIN 13720
VLPROFEXEC EVAL_LOOP_BEGIN 14000
VLPROFEXEC EXEC_GRAPH_END 12180
VLPROFEXEC EXEC_GRAPH_BEGIN 14000
VLPROFEXEC MTASK_BEGIN 15610 id 6 predictStart 0 cpu 19
VLPROFEXEC MTASK_END 15820 id 6 predictCost 30
VLPROFEXEC MTASK_BEGIN 21700 id 10 predictStart 196 cpu 19
VLPROFEXEC MTASK_END 21875 id 10 predictCost 30
VLPROFEXEC EVAL_LOOP_END 22085
VLPROFEXEC EVAL_END 22330
VLPROFEXEC EXEC_GRAPH_END 22085
VLPROFTHREAD 1
VLPROFEXEC MTASK_BEGIN 5495 id 5 predictStart 0 cpu 10
VLPROFEXEC MTASK_END 6090 id 5 predictCost 30


@ -4,33 +4,44 @@ Argument settings:
+verilator+prof+exec+start+2
+verilator+prof+exec+window+2
Analysis:
Total threads = 2
Total mtasks = 7
Total cpus used = 2
Total yields = 0
Total evals = 2
Total eval loops = 2
Total eval time = 21875 rdtsc ticks
Longest mtask time = 1190 rdtsc ticks
All-thread mtask time = 5495 rdtsc ticks
Longest-thread efficiency = 5.4%
All-thread efficiency = 12.6%
All-thread speedup = 0.3
Summary:
Total elapsed time = 23415 rdtsc ticks
Parallelized code = 82.51% of elapsed time
Total threads = 2
Total CPUs used = 2
Total mtasks = 7
Total yields = 0
Prediction (what Verilator used for scheduling):
All-thread efficiency = 63.2%
All-thread speedup = 1.3
Parallelized code, measured:
Thread utilization = 14.22%
Speedup = 0.284x
Parallelized code, predicted during static scheduling:
Thread utilization = 63.22%
Speedup = 1.26x
All code, measured:
Thread utilization = 20.48%
Speedup = 0.41x
All code, measured, scaled by predicted speedup:
Thread utilization = 56.80%
Speedup = 1.14x
MTask statistics:
Longest mtask id = 5
Longest mtask time = 6.16% of time elapsed in parallelized code
min log(p2e) = -3.681 from mtask 5 (predict 30, elapsed 1190)
max log(p2e) = -2.409 from mtask 8 (predict 107, elapsed 1190)
mean = -2.992
stddev = 0.459
e ^ stddev = 1.583
CPUs:
cpu 10: cpu_time=4725 socket=0 core=10 Test Ryzen 9 3950X 16-Core Processor
cpu 19: cpu_time=770 socket=0 core=3 Test Ryzen 9 3950X 16-Core Processor
CPU info:
Id | Time spent executing MTask | Socket | Core | Model
| % of elapsed ticks / ticks | | |
====|============================|========|======|======
10 | 20.18% / 4725 | 0 | 10 | Test Ryzen 9 3950X 16-Core Processor
19 | 3.29% / 770 | 0 | 3 | Test Ryzen 9 3950X 16-Core Processor
Writing profile_exec.vcd


@ -2,34 +2,21 @@ $version Generated by verilator_gantt $end
$timescale 1ns $end
$scope module gantt $end
$scope module Stats $end
$var wire 32 vf measured_parallelism [31:0] $end
$var wire 32 v10 predicted_parallelism [31:0] $end
$upscope $end
$scope module cpus $end
$var wire 32 v5 cpu10_thread [31:0] $end
$scope module measured $end
$var wire 32 v5 cpu10_mtask [31:0] $end
$var wire 32 v4 cpu10_thread [31:0] $end
$var wire 32 v2 cpu19_mtask [31:0] $end
$var wire 32 v1 cpu19_thread [31:0] $end
$var wire 32 v0 t0_mtask [31:0] $end
$var wire 32 v3 t1_mtask [31:0] $end
$upscope $end
$scope module evals $end
$var wire 32 vb eval [31:0] $end
$var wire 32 vc eval_loop [31:0] $end
$scope module predicted $end
$var wire 32 v6 t0_mtask [31:0] $end
$var wire 32 v7 t1_mtask [31:0] $end
$upscope $end
$scope module measured_threads $end
$var wire 32 v0 thread0_mtask [31:0] $end
$var wire 32 v4 thread1_mtask [31:0] $end
$upscope $end
$scope module mtasks $end
$var wire 32 v3 mtask10_cpu [31:0] $end
$var wire 32 va mtask11_cpu [31:0] $end
$var wire 32 v6 mtask5_cpu [31:0] $end
$var wire 32 v2 mtask6_cpu [31:0] $end
$var wire 32 v7 mtask7_cpu [31:0] $end
$var wire 32 v8 mtask8_cpu [31:0] $end
$var wire 32 v9 mtask9_cpu [31:0] $end
$upscope $end
$scope module predicted_threads $end
$var wire 32 vd predicted_thread0_mtask [31:0] $end
$var wire 32 ve predicted_thread1_mtask [31:0] $end
$scope module stats $end
$var wire 32 v8 measured_parallelism [31:0] $end
$var wire 32 v9 predicted_parallelism [31:0] $end
$upscope $end
$upscope $end
$enddefinitions $end
@ -43,216 +30,193 @@ bz v4
bz v5
bz v6
bz v7
bz v8
bz v9
bz va
bz vb
bz vc
bz vd
bz ve
b0 vf
b0 v10
#595
b1 vb
#945
b1 vc
b110 vd
b101 ve
b10 v10
#2429
bz vd
b111 ve
b1 v10
b0 v8
b0 v9
#2695
b110 v0
b0 v1
b10011 v2
b1 vf
b110 v2
b110 v6
b101 v7
b1 v8
b10 v9
#2905
bz v0
bz v1
bz v2
b0 vf
#3914
b1000 ve
b1 v10
#5495
b101 v4
b1 v5
b1010 v6
b1 vf
#6090
bz v4
bz v5
b0 v8
#3800
bz v6
b0 vf
b111 v7
b1 v9
#4906
b1000 v7
b1 v9
#5495
b101 v3
b1 v4
b101 v5
b1 v8
#6090
bz v3
bz v4
bz v5
b0 v8
#6300
b111 v4
b1 v5
b1010 v7
b1 vf
b111 v3
b1 v4
b111 v5
b1 v8
#6895
bz v3
bz v4
bz v5
bz v7
b0 vf
b0 v8
#7490
b1000 v4
b1 v5
b1010 v8
b1 vf
b1000 v3
b1 v4
b1000 v5
b1 v8
#8540
bz v3
bz v4
bz v5
bz v8
b0 vf
b0 v8
#8848
b1001 v7
b1 v9
#9135
b1001 v4
b1 v5
b1010 v9
b1 vf
#9210
b1001 ve
b1 v10
b1001 v3
b1 v4
b1001 v5
b1 v8
#9695
b1010 v0
b0 v1
b10011 v3
b10 vf
b1010 v2
b10 v8
#9730
bz v3
bz v4
bz v5
bz v9
b1 vf
b1 v8
#9870
bz v0
bz v1
bz v3
b0 vf
bz v2
b0 v8
#9917
b1010 v6
b10 v9
#9954
b1011 v7
b10 v9
#10255
b1011 v4
b1 v5
b1010 va
b1 vf
#10645
b1010 vd
b10 v10
#10695
b1011 ve
b10 v10
b1011 v3
b1 v4
b1011 v5
b1 v8
#11023
bz v6
b1 v9
#11060
bz v3
bz v4
bz v5
bz va
b0 vf
#12130
bz vd
b1 v10
#12180
bz vc
bz ve
b0 v10
#12250
bz vb
#13720
b10 vb
#14000
b10 vc
b110 vd
b101 ve
b10 v10
#15068
bz vd
b111 ve
b1 v10
bz v7
b0 v8
b0 v9
#15610
b110 v0
b0 v1
b10011 v2
b1 vf
b110 v2
b110 v6
b101 v7
b1 v8
b10 v9
#15820
bz v0
bz v1
bz v2
b0 vf
#16137
b1000 ve
b1 v10
#18375
b101 v4
b1 v5
b1010 v6
b1 vf
#18970
bz v4
bz v5
b0 v8
#16437
bz v6
b0 vf
b111 v7
b1 v9
#17265
b1000 v7
b1 v9
#18375
b101 v3
b1 v4
b101 v5
b1 v8
#18970
bz v3
bz v4
bz v5
b0 v8
#19145
b111 v4
b1 v5
b1010 v7
b1 vf
b111 v3
b1 v4
b111 v5
b1 v8
#19320
bz v3
bz v4
bz v5
bz v7
b0 vf
b0 v8
#19670
b1000 v4
b1 v5
b1010 v8
b1 vf
b1000 v3
b1 v4
b1000 v5
b1 v8
#19810
bz v3
bz v4
bz v5
bz v8
b0 vf
#19947
b1001 ve
b1 v10
b0 v8
#20219
b1001 v7
b1 v9
#20650
b1001 v4
b1 v5
b1010 v9
b1 vf
b1001 v3
b1 v4
b1001 v5
b1 v8
#20720
bz v3
bz v4
bz v5
bz v9
b0 vf
#20980
b1010 vd
b10 v10
#21016
b1011 ve
b10 v10
b0 v8
#21019
b1010 v6
b10 v9
#21047
b1011 v7
b10 v9
#21140
b1011 v4
b1 v5
b1010 va
b1 vf
b1011 v3
b1 v4
b1011 v5
b1 v8
#21245
bz v3
bz v4
bz v5
bz va
b0 vf
b0 v8
#21700
b1010 v0
b0 v1
b10011 v3
b1 vf
b1010 v2
b1 v8
#21847
bz v6
b1 v9
#21875
bz v0
bz v1
bz v3
b0 vf
#22049
bz vd
b1 v10
#22085
bz vc
bz ve
b0 v10
#22330
bz vb
bz v2
bz v7
b0 v8
b0 v9


@ -44,14 +44,12 @@ VLPROFPROC CPU part : 0x663
VLPROFPROC CPU revision : 3
VLPROFPROC
VLPROFTHREAD 0
VLPROFEXEC EVAL_BEGIN 57709
VLPROFEXEC EVAL_LOOP_BEGIN 58532
VLPROFEXEC EXEC_GRAPH_BEGIN 58532
VLPROFEXEC MTASK_BEGIN 90465 id 85 predictStart 14315 cpu 2
VLPROFEXEC MTASK_END 155034 id 85 predictCost 30533
VLPROFEXEC MTASK_BEGIN 156555 id 79 predictStart 44848 cpu 2
VLPROFEXEC MTASK_END 294309 id 79 predictCost 48001
VLPROFEXEC EVAL_LOOP_END 18028850
VLPROFEXEC EVAL_END 1803680
VLPROFEXEC EXEC_GRAPH_END 300000
VLPROFTHREAD 1
VLPROFEXEC MTASK_BEGIN 77352 id 90 predictStart 14315 cpu 3
VLPROFEXEC MTASK_END 78511 id 90 predictCost 21592
@ -59,4 +57,4 @@ VLPROFEXEC MTASK_BEGIN 79799 id 81 predictStart 35907 cpu 3
VLPROFEXEC MTASK_END 80667 id 81 predictCost 29215
VLPROFEXEC MTASK_BEGIN 81746 id 87 predictStart 65147 cpu 3
VLPROFEXEC MTASK_END 82633 id 87 predictCost 33809
VLPROF stat ticks 180832
VLPROF stat ticks 300000


@ -4,33 +4,44 @@ Argument settings:
+verilator+prof+exec+start+1
+verilator+prof+exec+window+2
Analysis:
Total threads = 2
Total mtasks = 5
Total cpus used = 2
Total yields = 51
Total evals = 1
Total eval loops = 1
Total eval time = 294309 rdtsc ticks
Longest mtask time = 137754 rdtsc ticks
All-thread mtask time = 205237 rdtsc ticks
Longest-thread efficiency = 46.8%
All-thread efficiency = 34.9%
All-thread speedup = 0.7
Summary:
Total elapsed time = 300000 rdtsc ticks
Parallelized code = 80.49% of elapsed time
Total threads = 2
Total CPUs used = 2
Total mtasks = 5
Total yields = 51
Prediction (what Verilator used for scheduling):
All-thread efficiency = 82.4%
All-thread speedup = 1.6
Parallelized code, measured:
Thread utilization = 42.50%
Speedup = 0.85x
Parallelized code, predicted during static scheduling:
Thread utilization = 82.44%
Speedup = 1.65x
All code, measured:
Thread utilization = 43.96%
Speedup = 0.879x
All code, measured, scaled by predicted speedup:
Thread utilization = 72.06%
Speedup = 1.44x
MTask statistics:
Longest mtask id = 79
Longest mtask time = 57.05% of time elapsed in parallelized code
min log(p2e) = -1.054 from mtask 79 (predict 48001, elapsed 137754)
max log(p2e) = 3.641 from mtask 87 (predict 33809, elapsed 887)
mean = 1.656
stddev = 2.104
e ^ stddev = 8.200
CPUs:
cpu 2: cpu_time=202323 Phytium,FT-2500/128
cpu 3: cpu_time=2914 Phytium,FT-2500/128
CPU info:
Id | Time spent executing MTask | Socket | Core | Model
| % of elapsed ticks / ticks | | |
====|============================|========|======|======
2 | 67.44% / 202323 | | | Phytium,FT-2500/128
3 | 0.97% / 2914 | | | Phytium,FT-2500/128
Writing profile_exec.vcd


@ -4,22 +4,18 @@ VLPROF arg +verilator+prof+exec+window+2
VLPROF stat threads 2
VLPROF stat yields 0
VLPROFTHREAD 0
VLPROFEXEC EVAL_BEGIN 595
VLPROFEXEC EVAL_LOOP_BEGIN 945
VLPROFEXEC EXEC_GRAPH_BEGIN 945
VLPROFEXEC MTASK_BEGIN 2695 id 6 predictStart 0 cpu 19
VLPROFEXEC MTASK_END 2905 id 6 predictCost 30
VLPROFEXEC MTASK_BEGIN 9695 id 10 predictStart 196 cpu 19
VLPROFEXEC MTASK_END 9870 id 10 predictCost 30
VLPROFEXEC EVAL_LOOP_END 12180
VLPROFEXEC EVAL_END 12250
VLPROFEXEC EVAL_BEGIN 13720
VLPROFEXEC EVAL_LOOP_BEGIN 14000
VLPROFEXEC EXEC_GRAPH_END 12180
VLPROFEXEC EXEC_GRAPH_BEGIN 14000
VLPROFEXEC MTASK_BEGIN 15610 id 6 predictStart 0 cpu 19
VLPROFEXEC MTASK_END 15820 id 6 predictCost 30
VLPROFEXEC MTASK_BEGIN 21700 id 10 predictStart 196 cpu 19
VLPROFEXEC MTASK_END 21875 id 10 predictCost 30
VLPROFEXEC EVAL_LOOP_END 22085
VLPROFEXEC EVAL_END 22330
VLPROFEXEC EXEC_GRAPH_END 22085
VLPROFTHREAD 1
VLPROFEXEC MTASK_BEGIN 5495 id 5 predictStart 0 cpu 10
VLPROFEXEC MTASK_END 6090 id 5 predictCost 30


@ -4,32 +4,43 @@ Argument settings:
+verilator+prof+exec+start+2
+verilator+prof+exec+window+2
Analysis:
Total threads = 2
Total mtasks = 7
Total cpus used = 2
Total yields = 0
Total evals = 2
Total eval loops = 2
Total eval time = 21875 rdtsc ticks
Longest mtask time = 1190 rdtsc ticks
All-thread mtask time = 5495 rdtsc ticks
Longest-thread efficiency = 5.4%
All-thread efficiency = 12.6%
All-thread speedup = 0.3
Summary:
Total elapsed time = 23415 rdtsc ticks
Parallelized code = 82.51% of elapsed time
Total threads = 2
Total CPUs used = 2
Total mtasks = 7
Total yields = 0
Prediction (what Verilator used for scheduling):
All-thread efficiency = 63.2%
All-thread speedup = 1.3
Parallelized code, measured:
Thread utilization = 14.22%
Speedup = 0.284x
Parallelized code, predicted during static scheduling:
Thread utilization = 63.22%
Speedup = 1.26x
All code, measured:
Thread utilization = 20.48%
Speedup = 0.41x
All code, measured, scaled by predicted speedup:
Thread utilization = 56.80%
Speedup = 1.14x
MTask statistics:
Longest mtask id = 5
Longest mtask time = 6.16% of time elapsed in parallelized code
min log(p2e) = -3.681 from mtask 5 (predict 30, elapsed 1190)
max log(p2e) = -2.409 from mtask 8 (predict 107, elapsed 1190)
mean = -2.992
stddev = 0.459
e ^ stddev = 1.583
CPUs:
cpu 10: cpu_time=4725
cpu 19: cpu_time=770
CPU info:
Id | Time spent executing MTask | Socket | Core | Model
| % of elapsed ticks / ticks | | |
====|============================|========|======|======
10 | 20.18% / 4725 | | |
19 | 3.29% / 770 | | |


@ -54,7 +54,7 @@ if ($Self->{vltmt}) {
file_grep("$Self->{obj_dir}/gantt.log", qr/Total threads += 1/i);
file_grep("$Self->{obj_dir}/gantt.log", qr/Total mtasks += 0/i);
}
file_grep("$Self->{obj_dir}/gantt.log", qr/Total evals += 4/i);
file_grep("$Self->{obj_dir}/gantt.log", qr/\|\s+4\s+\|\s+4\.0+\s+\|\s+eval/i);
# Diff to itself, just to check parsing
vcd_identical("$Self->{obj_dir}/profile_exec.vcd", "$Self->{obj_dir}/profile_exec.vcd");