mirror of
https://github.com/verilator/verilator.git
synced 2025-01-01 04:07:34 +00:00
Improve --prof-exec infrastructure and report
Again --prof-exec have bit-rotted a little with all the recent changes to the structure of the generated code. This patch contains a few improvements: - Repalce the eval/evl_loop begin/end events with generic section_push/section_pop events, that can be arbitrarily sprinkled into the generate code (so long as they are matched correctly) to measure various sections. The report then contains a nested profile of the sections, and the VCD trace shows the section names. - Better handling of exec graphs - Clearer overall statistics
This commit is contained in:
parent
146cdc020d
commit
4c0edd2efb
@ -1,33 +1,32 @@
|
||||
#!/usr/bin/env python3
|
||||
# pylint: disable=C0103,C0114,C0116,C0209,C0301,R0914,R0912,R0915,W0511,eval-used
|
||||
# pylint: disable=C0103,C0114,C0116,C0209,C0301,R0914,R0912,R0915,W0511,W0603,eval-used
|
||||
######################################################################
|
||||
|
||||
import argparse
|
||||
import bisect
|
||||
import collections
|
||||
import math
|
||||
import re
|
||||
import statistics
|
||||
# from pprint import pprint
|
||||
|
||||
Threads = collections.defaultdict(lambda: collections.defaultdict(lambda: {}))
|
||||
Mtasks = collections.defaultdict(lambda: {})
|
||||
Evals = collections.defaultdict(lambda: {})
|
||||
EvalLoops = collections.defaultdict(lambda: {})
|
||||
Sections = []
|
||||
LongestVcdStrValueLength = 0
|
||||
Threads = collections.defaultdict(lambda: []) # List of records per thread id
|
||||
Mtasks = collections.defaultdict(lambda: {'elapsed': 0, 'end': 0})
|
||||
Cpus = collections.defaultdict(lambda: {'mtask_time': 0})
|
||||
Global = {
|
||||
'args': {},
|
||||
'cpuinfo': collections.defaultdict(lambda: {}),
|
||||
'rdtsc_cycle_time': 0,
|
||||
'stats': {}
|
||||
}
|
||||
ElapsedTime = None # total elapsed time
|
||||
ExecGraphTime = 0 # total elapsed time excuting an exec graph
|
||||
ExecGraphIntervals = [] # list of (start, end) pairs
|
||||
|
||||
######################################################################
|
||||
|
||||
|
||||
def process(filename):
|
||||
read_data(filename)
|
||||
report()
|
||||
|
||||
|
||||
def read_data(filename):
|
||||
with open(filename, "r", encoding="utf8") as fh:
|
||||
re_thread = re.compile(r'^VLPROFTHREAD (\d+)$')
|
||||
@ -39,14 +38,17 @@ def read_data(filename):
|
||||
re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.]*)\s*')
|
||||
re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.]*)\s*$')
|
||||
re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
|
||||
re_time = re.compile(r'rdtsc time = (\d+) ticks')
|
||||
re_proc_cpu = re.compile(r'VLPROFPROC processor\s*:\s*(\d+)\s*$')
|
||||
re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$')
|
||||
cpu = None
|
||||
thread = None
|
||||
execGraphStart = None
|
||||
|
||||
lastEvalBeginTick = None
|
||||
lastEvalLoopBeginTick = None
|
||||
global LongestVcdStrValueLength
|
||||
global ExecGraphTime
|
||||
|
||||
SectionStack = []
|
||||
mTaskThread = {}
|
||||
|
||||
for line in fh:
|
||||
recordMatch = re_record.match(line)
|
||||
@ -54,29 +56,31 @@ def read_data(filename):
|
||||
kind, tick, payload = recordMatch.groups()
|
||||
tick = int(tick)
|
||||
payload = payload.strip()
|
||||
if kind == "EVAL_BEGIN":
|
||||
Evals[tick]['start'] = tick
|
||||
lastEvalBeginTick = tick
|
||||
elif kind == "EVAL_END":
|
||||
Evals[lastEvalBeginTick]['end'] = tick
|
||||
lastEvalBeginTick = None
|
||||
elif kind == "EVAL_LOOP_BEGIN":
|
||||
EvalLoops[tick]['start'] = tick
|
||||
lastEvalLoopBeginTick = tick
|
||||
elif kind == "EVAL_LOOP_END":
|
||||
EvalLoops[lastEvalLoopBeginTick]['end'] = tick
|
||||
lastEvalLoopBeginTick = None
|
||||
if kind == "SECTION_PUSH":
|
||||
LongestVcdStrValueLength = max(LongestVcdStrValueLength,
|
||||
len(payload))
|
||||
SectionStack.append(payload)
|
||||
Sections.append((tick, tuple(SectionStack)))
|
||||
elif kind == "SECTION_POP":
|
||||
assert SectionStack, "SECTION_POP without SECTION_PUSH"
|
||||
SectionStack.pop()
|
||||
Sections.append((tick, tuple(SectionStack)))
|
||||
elif kind == "MTASK_BEGIN":
|
||||
mtask, predict_start, ecpu = re_payload_mtaskBegin.match(
|
||||
payload).groups()
|
||||
mtask = int(mtask)
|
||||
predict_start = int(predict_start)
|
||||
ecpu = int(ecpu)
|
||||
Threads[thread][tick]['mtask'] = mtask
|
||||
Threads[thread][tick]['predict_start'] = predict_start
|
||||
Threads[thread][tick]['cpu'] = ecpu
|
||||
if 'elapsed' not in Mtasks[mtask]:
|
||||
Mtasks[mtask] = {'end': 0, 'elapsed': 0}
|
||||
mTaskThread[mtask] = thread
|
||||
records = Threads[thread]
|
||||
assert not records or records[-1]['start'] <= records[-1][
|
||||
'end'] <= tick
|
||||
records.append({
|
||||
'start': tick,
|
||||
'mtask': mtask,
|
||||
'predict_start': predict_start,
|
||||
'cpu': ecpu
|
||||
})
|
||||
Mtasks[mtask]['begin'] = tick
|
||||
Mtasks[mtask]['thread'] = thread
|
||||
Mtasks[mtask]['predict_start'] = predict_start
|
||||
@ -86,11 +90,18 @@ def read_data(filename):
|
||||
mtask = int(mtask)
|
||||
predict_cost = int(predict_cost)
|
||||
begin = Mtasks[mtask]['begin']
|
||||
Threads[thread][begin]['end'] = tick
|
||||
Threads[thread][begin]['predict_cost'] = predict_cost
|
||||
record = Threads[mTaskThread[mtask]][-1]
|
||||
record['end'] = tick
|
||||
record['predict_cost'] = predict_cost
|
||||
Mtasks[mtask]['elapsed'] += tick - begin
|
||||
Mtasks[mtask]['predict_cost'] = predict_cost
|
||||
Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], tick)
|
||||
elif kind == "EXEC_GRAPH_BEGIN":
|
||||
execGraphStart = tick
|
||||
elif kind == "EXEC_GRAPH_END":
|
||||
ExecGraphTime += tick - execGraphStart
|
||||
ExecGraphIntervals.append((execGraphStart, tick))
|
||||
execGraphStart = None
|
||||
elif Args.debug:
|
||||
print("-Unknown execution trace record: %s" % line)
|
||||
elif re_thread.match(line):
|
||||
@ -109,7 +120,7 @@ def read_data(filename):
|
||||
elif re_proc_cpu.match(line):
|
||||
match = re_proc_cpu.match(line)
|
||||
cpu = int(match.group(1))
|
||||
elif cpu and re_proc_dat.match(line):
|
||||
elif cpu is not None and re_proc_dat.match(line):
|
||||
match = re_proc_dat.match(line)
|
||||
term = match.group(1)
|
||||
value = match.group(2)
|
||||
@ -121,11 +132,6 @@ def read_data(filename):
|
||||
pass
|
||||
elif Args.debug:
|
||||
print("-Unk: %s" % line)
|
||||
# TODO -- this is parsing text printed by a client.
|
||||
# Really, verilator proper should generate this
|
||||
# if it's useful...
|
||||
if re_time.match(line):
|
||||
Global['rdtsc_cycle_time'] = re_time.group(1)
|
||||
|
||||
|
||||
def re_match_result(regexp, line, result_to):
|
||||
@ -144,125 +150,33 @@ def report():
|
||||
plus = "+" if re.match(r'^\+', arg) else " "
|
||||
print(" %s%s%s" % (arg, plus, Global['args'][arg]))
|
||||
|
||||
for records in Threads.values():
|
||||
for record in records:
|
||||
cpu = record['cpu']
|
||||
elapsed = record['end'] - record['start']
|
||||
Cpus[cpu]['mtask_time'] += elapsed
|
||||
|
||||
global ElapsedTime
|
||||
ElapsedTime = int(Global['stats']['ticks'])
|
||||
nthreads = int(Global['stats']['threads'])
|
||||
Global['cpus'] = {}
|
||||
for thread in Threads:
|
||||
# Make potentially multiple characters per column
|
||||
for start in Threads[thread]:
|
||||
if not Threads[thread][start]:
|
||||
continue
|
||||
cpu = Threads[thread][start]['cpu']
|
||||
elapsed = Threads[thread][start]['end'] - start
|
||||
if cpu not in Global['cpus']:
|
||||
Global['cpus'][cpu] = {'cpu_time': 0}
|
||||
Global['cpus'][cpu]['cpu_time'] += elapsed
|
||||
ncpus = max(len(Cpus), 1)
|
||||
|
||||
measured_mt_mtask_time = 0
|
||||
predict_mt_mtask_time = 0
|
||||
long_mtask_time = 0
|
||||
measured_last_end = 0
|
||||
predict_last_end = 0
|
||||
for mtask in Mtasks:
|
||||
measured_mt_mtask_time += Mtasks[mtask]['elapsed']
|
||||
predict_mt_mtask_time += Mtasks[mtask]['predict_cost']
|
||||
measured_last_end = max(measured_last_end, Mtasks[mtask]['end'])
|
||||
predict_last_end = max(
|
||||
predict_last_end,
|
||||
Mtasks[mtask]['predict_start'] + Mtasks[mtask]['predict_cost'])
|
||||
long_mtask_time = max(long_mtask_time, Mtasks[mtask]['elapsed'])
|
||||
Global['measured_last_end'] = measured_last_end
|
||||
Global['predict_last_end'] = predict_last_end
|
||||
|
||||
# If we know cycle time in the same (rdtsc) units,
|
||||
# this will give us an actual utilization number,
|
||||
# (how effectively we keep the cores busy.)
|
||||
#
|
||||
# It also gives us a number we can compare against
|
||||
# serial mode, to estimate the overhead of data sharing,
|
||||
# which will show up in the total elapsed time. (Overhead
|
||||
# of synchronization and scheduling should not.)
|
||||
print("\nAnalysis:")
|
||||
print(" Total threads = %d" % nthreads)
|
||||
print(" Total mtasks = %d" % len(Mtasks))
|
||||
ncpus = max(len(Global['cpus']), 1)
|
||||
print(" Total cpus used = %d" % ncpus)
|
||||
print(" Total yields = %d" %
|
||||
int(Global['stats'].get('yields', 0)))
|
||||
print(" Total evals = %d" % len(Evals))
|
||||
print(" Total eval loops = %d" % len(EvalLoops))
|
||||
if Mtasks:
|
||||
print(" Total eval time = %d rdtsc ticks" %
|
||||
Global['measured_last_end'])
|
||||
print(" Longest mtask time = %d rdtsc ticks" % long_mtask_time)
|
||||
print(" All-thread mtask time = %d rdtsc ticks" %
|
||||
measured_mt_mtask_time)
|
||||
long_efficiency = long_mtask_time / (Global.get(
|
||||
'measured_last_end', 1) or 1)
|
||||
print(" Longest-thread efficiency = %0.1f%%" %
|
||||
(long_efficiency * 100.0))
|
||||
mt_efficiency = measured_mt_mtask_time / (
|
||||
Global.get('measured_last_end', 1) * nthreads or 1)
|
||||
print(" All-thread efficiency = %0.1f%%" %
|
||||
(mt_efficiency * 100.0))
|
||||
print(" All-thread speedup = %0.1f" %
|
||||
(mt_efficiency * nthreads))
|
||||
if Global['rdtsc_cycle_time'] > 0:
|
||||
ut = measured_mt_mtask_time / Global['rdtsc_cycle_time']
|
||||
print("tot_mtask_cpu=" + measured_mt_mtask_time + " cyc=" +
|
||||
Global['rdtsc_cycle_time'] + " ut=" + ut)
|
||||
|
||||
predict_mt_efficiency = predict_mt_mtask_time / (
|
||||
Global.get('predict_last_end', 1) * nthreads or 1)
|
||||
print("\nPrediction (what Verilator used for scheduling):")
|
||||
print(" All-thread efficiency = %0.1f%%" %
|
||||
(predict_mt_efficiency * 100.0))
|
||||
print(" All-thread speedup = %0.1f" %
|
||||
(predict_mt_efficiency * nthreads))
|
||||
|
||||
p2e_ratios = []
|
||||
min_p2e = 1000000
|
||||
min_mtask = None
|
||||
max_p2e = -1000000
|
||||
max_mtask = None
|
||||
|
||||
for mtask in sorted(Mtasks.keys()):
|
||||
if Mtasks[mtask]['elapsed'] > 0:
|
||||
if Mtasks[mtask]['predict_cost'] == 0:
|
||||
Mtasks[mtask]['predict_cost'] = 1 # don't log(0) below
|
||||
p2e_ratio = math.log(Mtasks[mtask]['predict_cost'] /
|
||||
Mtasks[mtask]['elapsed'])
|
||||
p2e_ratios.append(p2e_ratio)
|
||||
|
||||
if p2e_ratio > max_p2e:
|
||||
max_p2e = p2e_ratio
|
||||
max_mtask = mtask
|
||||
if p2e_ratio < min_p2e:
|
||||
min_p2e = p2e_ratio
|
||||
min_mtask = mtask
|
||||
|
||||
print("\nMTask statistics:")
|
||||
print(" min log(p2e) = %0.3f" % min_p2e, end="")
|
||||
print(" from mtask %d (predict %d," %
|
||||
(min_mtask, Mtasks[min_mtask]['predict_cost']),
|
||||
end="")
|
||||
print(" elapsed %d)" % Mtasks[min_mtask]['elapsed'])
|
||||
print(" max log(p2e) = %0.3f" % max_p2e, end="")
|
||||
print(" from mtask %d (predict %d," %
|
||||
(max_mtask, Mtasks[max_mtask]['predict_cost']),
|
||||
end="")
|
||||
print(" elapsed %d)" % Mtasks[max_mtask]['elapsed'])
|
||||
|
||||
stddev = statistics.pstdev(p2e_ratios)
|
||||
mean = statistics.mean(p2e_ratios)
|
||||
print(" mean = %0.3f" % mean)
|
||||
print(" stddev = %0.3f" % stddev)
|
||||
print(" e ^ stddev = %0.3f" % math.exp(stddev))
|
||||
print("\nSummary:")
|
||||
print(" Total elapsed time = {} rdtsc ticks".format(ElapsedTime))
|
||||
print(" Parallelized code = {:.2%} of elapsed time".format(
|
||||
ExecGraphTime / ElapsedTime))
|
||||
print(" Total threads = %d" % nthreads)
|
||||
print(" Total CPUs used = %d" % ncpus)
|
||||
print(" Total mtasks = %d" % len(Mtasks))
|
||||
print(" Total yields = %d" % int(Global['stats'].get('yields', 0)))
|
||||
|
||||
report_mtasks()
|
||||
report_cpus()
|
||||
report_sections()
|
||||
|
||||
if nthreads > ncpus:
|
||||
print()
|
||||
print("%%Warning: There were fewer CPUs (%d) then threads (%d)." %
|
||||
print("%%Warning: There were fewer CPUs (%d) than threads (%d)." %
|
||||
(ncpus, nthreads))
|
||||
print(" : See docs on use of numactl.")
|
||||
else:
|
||||
@ -279,32 +193,130 @@ def report():
|
||||
print()
|
||||
|
||||
|
||||
def report_mtasks():
|
||||
if not Mtasks:
|
||||
return
|
||||
|
||||
nthreads = int(Global['stats']['threads'])
|
||||
|
||||
# If we know cycle time in the same (rdtsc) units,
|
||||
# this will give us an actual utilization number,
|
||||
# (how effectively we keep the cores busy.)
|
||||
#
|
||||
# It also gives us a number we can compare against
|
||||
# serial mode, to estimate the overhead of data sharing,
|
||||
# which will show up in the total elapsed time. (Overhead
|
||||
# of synchronization and scheduling should not.)
|
||||
total_mtask_time = 0
|
||||
thread_mtask_time = collections.defaultdict(lambda: 0)
|
||||
long_mtask_time = 0
|
||||
long_mtask = None
|
||||
predict_mtask_time = 0
|
||||
predict_elapsed = 0
|
||||
for mtaskId in Mtasks:
|
||||
record = Mtasks[mtaskId]
|
||||
predict_mtask_time += record['predict_cost']
|
||||
total_mtask_time += record['elapsed']
|
||||
thread_mtask_time[record['thread']] += record['elapsed']
|
||||
predict_end = record['predict_start'] + record['predict_cost']
|
||||
predict_elapsed = max(predict_elapsed, predict_end)
|
||||
if record['elapsed'] > long_mtask_time:
|
||||
long_mtask_time = record['elapsed']
|
||||
long_mtask = mtaskId
|
||||
Global['predict_last_end'] = predict_elapsed
|
||||
|
||||
serialTime = ElapsedTime - ExecGraphTime
|
||||
|
||||
def subReport(elapsed, work):
|
||||
print(" Thread utilization = {:7.2%}".format(work /
|
||||
(elapsed * nthreads)))
|
||||
print(" Speedup = {:6.3}x".format(work / elapsed))
|
||||
|
||||
print("\nParallelized code, measured:")
|
||||
subReport(ExecGraphTime, total_mtask_time)
|
||||
|
||||
print("\nParallelized code, predicted during static scheduling:")
|
||||
subReport(predict_elapsed, predict_mtask_time)
|
||||
|
||||
print("\nAll code, measured:")
|
||||
subReport(ElapsedTime, serialTime + total_mtask_time)
|
||||
|
||||
print("\nAll code, measured, scaled by predicted speedup:")
|
||||
expectedParallelSpeedup = predict_mtask_time / predict_elapsed
|
||||
scaledElapsed = serialTime + total_mtask_time / expectedParallelSpeedup
|
||||
subReport(scaledElapsed, serialTime + total_mtask_time)
|
||||
|
||||
p2e_ratios = []
|
||||
min_p2e = 1000000
|
||||
min_mtask = None
|
||||
max_p2e = -1000000
|
||||
max_mtask = None
|
||||
|
||||
for mtask in sorted(Mtasks.keys()):
|
||||
if Mtasks[mtask]['elapsed'] > 0:
|
||||
if Mtasks[mtask]['predict_cost'] == 0:
|
||||
Mtasks[mtask]['predict_cost'] = 1 # don't log(0) below
|
||||
p2e_ratio = math.log(Mtasks[mtask]['predict_cost'] /
|
||||
Mtasks[mtask]['elapsed'])
|
||||
p2e_ratios.append(p2e_ratio)
|
||||
|
||||
if p2e_ratio > max_p2e:
|
||||
max_p2e = p2e_ratio
|
||||
max_mtask = mtask
|
||||
if p2e_ratio < min_p2e:
|
||||
min_p2e = p2e_ratio
|
||||
min_mtask = mtask
|
||||
|
||||
print("\nMTask statistics:")
|
||||
print(" Longest mtask id = {}".format(long_mtask))
|
||||
print(" Longest mtask time = {:.2%} of time elapsed in parallelized code".
|
||||
format(long_mtask_time / ExecGraphTime))
|
||||
print(" min log(p2e) = %0.3f" % min_p2e, end="")
|
||||
|
||||
print(" from mtask %d (predict %d," %
|
||||
(min_mtask, Mtasks[min_mtask]['predict_cost']),
|
||||
end="")
|
||||
print(" elapsed %d)" % Mtasks[min_mtask]['elapsed'])
|
||||
print(" max log(p2e) = %0.3f" % max_p2e, end="")
|
||||
print(" from mtask %d (predict %d," %
|
||||
(max_mtask, Mtasks[max_mtask]['predict_cost']),
|
||||
end="")
|
||||
print(" elapsed %d)" % Mtasks[max_mtask]['elapsed'])
|
||||
|
||||
stddev = statistics.pstdev(p2e_ratios)
|
||||
mean = statistics.mean(p2e_ratios)
|
||||
print(" mean = %0.3f" % mean)
|
||||
print(" stddev = %0.3f" % stddev)
|
||||
print(" e ^ stddev = %0.3f" % math.exp(stddev))
|
||||
|
||||
|
||||
def report_cpus():
|
||||
print("\nCPUs:")
|
||||
print("\nCPU info:")
|
||||
|
||||
Global['cpu_sockets'] = collections.defaultdict(lambda: 0)
|
||||
Global['cpu_socket_cores'] = collections.defaultdict(lambda: 0)
|
||||
|
||||
for cpu in sorted(Global['cpus'].keys()):
|
||||
print(" cpu %d: " % cpu, end='')
|
||||
print("cpu_time=%d" % Global['cpus'][cpu]['cpu_time'], end='')
|
||||
|
||||
socket = None
|
||||
print(" Id | Time spent executing MTask | Socket | Core | Model")
|
||||
print(" | % of elapsed ticks / ticks | | |")
|
||||
print(" ====|============================|========|======|======")
|
||||
for cpu in sorted(Cpus):
|
||||
socket = ""
|
||||
core = ""
|
||||
model = ""
|
||||
if cpu in Global['cpuinfo']:
|
||||
cpuinfo = Global['cpuinfo'][cpu]
|
||||
if 'physical_id' in cpuinfo and 'core_id' in cpuinfo:
|
||||
socket = int(cpuinfo['physical_id'])
|
||||
socket = cpuinfo['physical_id']
|
||||
Global['cpu_sockets'][socket] += 1
|
||||
print(" socket=%d" % socket, end='')
|
||||
|
||||
core = int(cpuinfo['core_id'])
|
||||
Global['cpu_socket_cores'][str(socket) + "__" + str(core)] += 1
|
||||
print(" core=%d" % core, end='')
|
||||
core = cpuinfo['core_id']
|
||||
Global['cpu_socket_cores'][socket + "__" + core] += 1
|
||||
|
||||
if 'model_name' in cpuinfo:
|
||||
model = cpuinfo['model_name']
|
||||
print(" %s" % model, end='')
|
||||
print()
|
||||
|
||||
print(" {:3d} | {:7.2%} / {:16d} | {:>6s} | {:>4s} | {}".format(
|
||||
cpu, Cpus[cpu]['mtask_time'] / ElapsedTime,
|
||||
Cpus[cpu]['mtask_time'], socket, core, model))
|
||||
|
||||
if len(Global['cpu_sockets']) > 1:
|
||||
Global['cpu_sockets_warning'] = True
|
||||
@ -313,25 +325,68 @@ def report_cpus():
|
||||
Global['cpu_socket_cores_warning'] = True
|
||||
|
||||
|
||||
def report_sections():
|
||||
if not Sections:
|
||||
return
|
||||
print("\nSection profile:")
|
||||
|
||||
totalTime = collections.defaultdict(lambda: 0)
|
||||
selfTime = collections.defaultdict(lambda: 0)
|
||||
|
||||
sectionTree = [0, {}, 1] # [selfTime, childTrees, numberOfTimesEntered]
|
||||
prevTime = 0
|
||||
prevStack = ()
|
||||
for time, stack in Sections:
|
||||
if len(stack) > len(prevStack):
|
||||
scope = sectionTree
|
||||
for item in stack:
|
||||
scope = scope[1].setdefault(item, [0, {}, 0])
|
||||
scope[2] += 1
|
||||
dt = time - prevTime
|
||||
scope = sectionTree
|
||||
for item in prevStack:
|
||||
scope = scope[1].setdefault(item, [0, {}, 0])
|
||||
scope[0] += dt
|
||||
|
||||
if prevStack:
|
||||
for name in prevStack:
|
||||
totalTime[name] += dt
|
||||
selfTime[prevStack[-1]] += dt
|
||||
prevTime = time
|
||||
prevStack = stack
|
||||
|
||||
def treeSum(tree):
|
||||
n = tree[0]
|
||||
for subTree in tree[1].values():
|
||||
n += treeSum(subTree)
|
||||
return n
|
||||
|
||||
# Make sure the tree sums to the elapsed time
|
||||
sectionTree[0] += ElapsedTime - treeSum(sectionTree)
|
||||
|
||||
def printTree(prefix, name, entries, tree):
|
||||
print(" {:7.2%} | {:7.2%} | {:8} | {:10.2f} | {}".format(
|
||||
treeSum(tree) / ElapsedTime, tree[0] / ElapsedTime, tree[2],
|
||||
tree[2] / entries, prefix + name))
|
||||
for k in sorted(tree[1], key=lambda _: -treeSum(tree[1][_])):
|
||||
printTree(prefix + " ", k, tree[2], tree[1][k])
|
||||
|
||||
print(" Total | Self | Total | Relative | Section")
|
||||
print(" time | time | entries | entries | name ")
|
||||
print("==========|=========|==========|============|========")
|
||||
printTree("", "*TOTAL*", 1, sectionTree)
|
||||
|
||||
|
||||
######################################################################
|
||||
|
||||
|
||||
def write_vcd(filename):
|
||||
print("Writing %s" % filename)
|
||||
with open(filename, "w", encoding="utf8") as fh:
|
||||
vcd = {
|
||||
'values':
|
||||
collections.defaultdict(lambda: {}), # {<time>}{<code>} = value
|
||||
'sigs': {
|
||||
'predicted_threads': {},
|
||||
'measured_threads': {},
|
||||
'cpus': {},
|
||||
'evals': {},
|
||||
'mtasks': {},
|
||||
'Stats': {}
|
||||
} # {<module>}{<sig}} = code
|
||||
}
|
||||
code = 0
|
||||
# dict of dicts of hierarchy elements/signal name -> (code, width)
|
||||
scopeSigs = {}
|
||||
# map from time -> code -> value
|
||||
values = collections.defaultdict(lambda: {})
|
||||
|
||||
parallelism = {
|
||||
'measured': collections.defaultdict(lambda: 0),
|
||||
@ -340,132 +395,139 @@ def write_vcd(filename):
|
||||
parallelism['measured'][0] = 0
|
||||
parallelism['predicted'][0] = 0
|
||||
|
||||
# Measured graph
|
||||
for thread in sorted(Threads.keys()):
|
||||
sig = "thread%d_mtask" % thread
|
||||
if sig not in vcd['sigs']['measured_threads']:
|
||||
vcd['sigs']['measured_threads'][sig] = code
|
||||
code += 1
|
||||
mcode = vcd['sigs']['measured_threads'][sig]
|
||||
nextFreeCode = 0
|
||||
|
||||
for start in sorted(Threads[thread]):
|
||||
mtask = Threads[thread][start]['mtask']
|
||||
end = Threads[thread][start]['end']
|
||||
cpu = Threads[thread][start]['cpu']
|
||||
vcd['values'][start][mcode] = mtask
|
||||
vcd['values'][end][mcode] = None
|
||||
def getCode(width, *names):
|
||||
nonlocal nextFreeCode
|
||||
scope = scopeSigs
|
||||
*path, name = names
|
||||
for item in path:
|
||||
scope = scope.setdefault(item, {})
|
||||
code, oldWidth = scope.setdefault(name, (nextFreeCode, width))
|
||||
assert oldWidth == width
|
||||
if code == nextFreeCode:
|
||||
nextFreeCode += 1
|
||||
return code
|
||||
|
||||
def addValue(code, time, val):
|
||||
if isinstance(val, str):
|
||||
buf = ""
|
||||
for c in val:
|
||||
buf += bin(ord(c))[2:].rjust(8, "0")
|
||||
val = buf.ljust(LongestVcdStrValueLength * 8, "0")
|
||||
values[time][code] = val
|
||||
|
||||
# Measured graph
|
||||
for thread in sorted(Threads):
|
||||
mcode = getCode(32, 'measured', 't%d_mtask' % thread)
|
||||
for record in Threads[thread]:
|
||||
start = record['start']
|
||||
mtask = record['mtask']
|
||||
end = record['end']
|
||||
cpu = record['cpu']
|
||||
addValue(mcode, start, mtask)
|
||||
addValue(mcode, end, None)
|
||||
parallelism['measured'][start] += 1
|
||||
parallelism['measured'][end] -= 1
|
||||
|
||||
sig = "cpu%d_thread" % cpu
|
||||
if sig not in vcd['sigs']['cpus']:
|
||||
vcd['sigs']['cpus'][sig] = code
|
||||
code += 1
|
||||
ccode = vcd['sigs']['cpus'][sig]
|
||||
vcd['values'][start][ccode] = thread
|
||||
vcd['values'][end][ccode] = None
|
||||
code = getCode(32, 'measured', 'cpu%d_thread' % cpu)
|
||||
addValue(code, start, thread)
|
||||
addValue(code, end, None)
|
||||
|
||||
sig = "mtask%d_cpu" % mtask
|
||||
if sig not in vcd['sigs']['mtasks']:
|
||||
vcd['sigs']['mtasks'][sig] = code
|
||||
code += 1
|
||||
ccode = vcd['sigs']['mtasks'][sig]
|
||||
vcd['values'][start][ccode] = cpu
|
||||
vcd['values'][end][ccode] = None
|
||||
code = getCode(32, 'measured', 'cpu%d_mtask' % cpu)
|
||||
addValue(code, start, mtask)
|
||||
addValue(code, end, None)
|
||||
|
||||
# Eval graph
|
||||
vcd['sigs']['evals']["eval"] = code
|
||||
elcode = code
|
||||
code += 1
|
||||
n = 0
|
||||
for eval_start in Evals:
|
||||
eval_end = Evals[eval_start]['end']
|
||||
n += 1
|
||||
vcd['values'][eval_start][elcode] = n
|
||||
vcd['values'][eval_end][elcode] = None
|
||||
tStart = sorted(_['start'] for records in Threads.values()
|
||||
for _ in records)
|
||||
tEnd = sorted(_['end'] for records in Threads.values()
|
||||
for _ in records)
|
||||
|
||||
# Eval_loop graph
|
||||
vcd['sigs']['evals']["eval_loop"] = code
|
||||
elcode = code
|
||||
code += 1
|
||||
n = 0
|
||||
for eval_start in EvalLoops:
|
||||
eval_end = EvalLoops[eval_start]['end']
|
||||
n += 1
|
||||
vcd['values'][eval_start][elcode] = n
|
||||
vcd['values'][eval_end][elcode] = None
|
||||
# Predicted graph
|
||||
for start, end in ExecGraphIntervals:
|
||||
# Find the earliest MTask start after the start point, and the
|
||||
# latest MTask end before the end point, so we can scale to the
|
||||
# same range
|
||||
start = tStart[bisect.bisect_left(tStart, start)]
|
||||
end = tEnd[bisect.bisect_right(tEnd, end) - 1]
|
||||
# Compute scale so predicted graph is of same width as interval
|
||||
measured_scaling = (end - start) / Global['predict_last_end']
|
||||
# Predict mtasks that fill the time the execution occupied
|
||||
for mtask in Mtasks:
|
||||
thread = Mtasks[mtask]['thread']
|
||||
pred_scaled_start = start + int(
|
||||
Mtasks[mtask]['predict_start'] * measured_scaling)
|
||||
pred_scaled_end = start + int(
|
||||
(Mtasks[mtask]['predict_start'] +
|
||||
Mtasks[mtask]['predict_cost']) * measured_scaling)
|
||||
if pred_scaled_start == pred_scaled_end:
|
||||
continue
|
||||
|
||||
if Mtasks:
|
||||
# Predicted graph
|
||||
for eval_start in EvalLoops:
|
||||
eval_end = EvalLoops[eval_start]['end']
|
||||
# Compute scale so predicted graph is of same width as eval
|
||||
measured_scaling = (eval_end -
|
||||
eval_start) / Global['predict_last_end']
|
||||
# Predict mtasks that fill the time the eval occupied
|
||||
for mtask in Mtasks:
|
||||
thread = Mtasks[mtask]['thread']
|
||||
pred_scaled_start = eval_start + int(
|
||||
Mtasks[mtask]['predict_start'] * measured_scaling)
|
||||
pred_scaled_end = eval_start + int(
|
||||
(Mtasks[mtask]['predict_start'] +
|
||||
Mtasks[mtask]['predict_cost']) * measured_scaling)
|
||||
if pred_scaled_start == pred_scaled_end:
|
||||
continue
|
||||
mcode = getCode(32, 'predicted', 't%d_mtask' % thread)
|
||||
addValue(mcode, pred_scaled_start, mtask)
|
||||
addValue(mcode, pred_scaled_end, None)
|
||||
|
||||
sig = "predicted_thread%d_mtask" % thread
|
||||
if sig not in vcd['sigs']['predicted_threads']:
|
||||
vcd['sigs']['predicted_threads'][sig] = code
|
||||
code += 1
|
||||
mcode = vcd['sigs']['predicted_threads'][sig]
|
||||
parallelism['predicted'][pred_scaled_start] += 1
|
||||
parallelism['predicted'][pred_scaled_end] -= 1
|
||||
|
||||
vcd['values'][pred_scaled_start][mcode] = mtask
|
||||
vcd['values'][pred_scaled_end][mcode] = None
|
||||
# Parallelism graph
|
||||
for measpred in ('measured', 'predicted'):
|
||||
pcode = getCode(32, 'stats', '%s_parallelism' % measpred)
|
||||
value = 0
|
||||
for time in sorted(parallelism[measpred].keys()):
|
||||
value += parallelism[measpred][time]
|
||||
addValue(pcode, time, value)
|
||||
|
||||
parallelism['predicted'][pred_scaled_start] += 1
|
||||
parallelism['predicted'][pred_scaled_end] -= 1
|
||||
|
||||
# Parallelism graph
|
||||
for measpred in ('measured', 'predicted'):
|
||||
vcd['sigs']['Stats']["%s_parallelism" % measpred] = code
|
||||
pcode = code
|
||||
code += 1
|
||||
value = 0
|
||||
for time in sorted(parallelism[measpred].keys()):
|
||||
value += parallelism[measpred][time]
|
||||
vcd['values'][time][pcode] = value
|
||||
# Section graph
|
||||
if Sections:
|
||||
scode = getCode(LongestVcdStrValueLength * 8, "section", "trace")
|
||||
dcode = getCode(32, "section", "depth")
|
||||
for time, stack in Sections:
|
||||
addValue(scode, time, stack[-1] if stack else None)
|
||||
addValue(dcode, time, len(stack))
|
||||
|
||||
# Create output file
|
||||
fh.write("$version Generated by verilator_gantt $end\n")
|
||||
fh.write("$timescale 1ns $end\n")
|
||||
fh.write("\n")
|
||||
|
||||
all_codes = {}
|
||||
all_codes = set()
|
||||
|
||||
def writeScope(scope):
|
||||
assert isinstance(scope, dict)
|
||||
for key in sorted(scope):
|
||||
val = scope[key]
|
||||
if isinstance(val, dict):
|
||||
fh.write(" $scope module %s $end\n" % key)
|
||||
writeScope(val)
|
||||
fh.write(" $upscope $end\n")
|
||||
else:
|
||||
(code, width) = val
|
||||
all_codes.add(code)
|
||||
fh.write(" $var wire %d v%x %s [%d:0] $end\n" %
|
||||
(width, code, key, width - 1))
|
||||
|
||||
fh.write(" $scope module gantt $end\n")
|
||||
for module in sorted(vcd['sigs'].keys()):
|
||||
fh.write(" $scope module %s $end\n" % module)
|
||||
for sig in sorted(vcd['sigs'][module].keys()):
|
||||
code = vcd['sigs'][module][sig]
|
||||
fh.write(" $var wire 32 v%x %s [31:0] $end\n" % (code, sig))
|
||||
all_codes[code] = 1
|
||||
fh.write(" $upscope $end\n")
|
||||
writeScope(scopeSigs)
|
||||
fh.write(" $upscope $end\n")
|
||||
fh.write("$enddefinitions $end\n")
|
||||
fh.write("\n")
|
||||
|
||||
first = True
|
||||
for time in sorted(vcd['values']):
|
||||
for time in sorted(values):
|
||||
if first:
|
||||
first = False
|
||||
# Start with Z for any signals without time zero data
|
||||
for code in sorted(all_codes.keys()):
|
||||
if code not in vcd['values'][time]:
|
||||
vcd['values'][time][code] = None
|
||||
for code in sorted(all_codes):
|
||||
if code not in values[time]:
|
||||
addValue(code, time, None)
|
||||
fh.write("#%d\n" % time)
|
||||
for code in sorted(vcd['values'][time].keys()):
|
||||
value = vcd['values'][time][code]
|
||||
for code in sorted(values[time]):
|
||||
value = values[time][code]
|
||||
if value is None:
|
||||
fh.write("bz v%x\n" % code)
|
||||
elif isinstance(value, str):
|
||||
fh.write("b%s v%x\n" % (value, code))
|
||||
else:
|
||||
fh.write("b%s v%x\n" % (format(value, 'b'), code))
|
||||
|
||||
@ -504,7 +566,8 @@ parser.add_argument('filename',
|
||||
|
||||
Args = parser.parse_args()
|
||||
|
||||
process(Args.filename)
|
||||
read_data(Args.filename)
|
||||
report()
|
||||
if not Args.no_vcd:
|
||||
write_vcd(Args.vcd)
|
||||
|
||||
|
@ -30,15 +30,13 @@ Gantt Chart VCD Signals
|
||||
In waveforms, there are the following signals. In GTKWave, use "decimal"
|
||||
data format to remove the leading zeros and make the traces easier to read.
|
||||
|
||||
evals
|
||||
Increments each time when eval_step was measured to be active. This
|
||||
allow visualization of how much time eval_step was active.
|
||||
trace/section
|
||||
Shows the name of the current top of the execution section stack.
|
||||
Set GTKWave data format to "ASCII".
|
||||
|
||||
eval_loop
|
||||
Increments each time when the evaluation loop within eval_step was
|
||||
measured to be active. For best performance, there is only a single
|
||||
evaluation loop within each eval_step call; that is, the eval_loop
|
||||
waveform looks identical to the evals waveform.
|
||||
trace/depth
|
||||
Shows the depth of the execution section stack.
|
||||
Set GTKWave data format to "Analog".
|
||||
|
||||
measured_parallelism
|
||||
The number of mtasks active at this time, for best performance, this will
|
||||
|
@ -161,12 +161,18 @@ void VlExecutionProfiler::dump(const char* filenamep, uint64_t tickEnd)
|
||||
|
||||
// TODO Perhaps merge with verilated_coverage output format, so can
|
||||
// have a common merging and reporting tool, etc.
|
||||
fprintf(fp, "VLPROFVERSION 2.0 # Verilator execution profile version 2.0\n");
|
||||
fprintf(fp, "VLPROFVERSION 2.1 # Verilator execution profile version 2.1\n");
|
||||
fprintf(fp, "VLPROF arg +verilator+prof+exec+start+%" PRIu64 "\n",
|
||||
Verilated::threadContextp()->profExecStart());
|
||||
fprintf(fp, "VLPROF arg +verilator+prof+exec+window+%u\n",
|
||||
Verilated::threadContextp()->profExecWindow());
|
||||
const unsigned threads = static_cast<unsigned>(m_traceps.size());
|
||||
// Note that VerilatedContext will by default create as many threads as there are hardware
|
||||
// processors, but not all of them might be utilized. Report the actual number that has trace
|
||||
// entries to avoid over-counting.
|
||||
unsigned threads = 0;
|
||||
for (const auto& pair : m_traceps) {
|
||||
if (!pair.second->empty()) ++threads;
|
||||
}
|
||||
fprintf(fp, "VLPROF stat threads %u\n", threads);
|
||||
fprintf(fp, "VLPROF stat yields %" PRIu64 "\n", VlMTaskVertex::yields());
|
||||
|
||||
@ -183,6 +189,7 @@ void VlExecutionProfiler::dump(const char* filenamep, uint64_t tickEnd)
|
||||
for (const auto& pair : m_traceps) {
|
||||
const uint32_t threadId = pair.first;
|
||||
ExecutionTrace* const tracep = pair.second;
|
||||
if (tracep->empty()) continue;
|
||||
fprintf(fp, "VLPROFTHREAD %" PRIu32 "\n", threadId);
|
||||
|
||||
for (const VlExecutionRecord& er : *tracep) {
|
||||
@ -191,10 +198,9 @@ void VlExecutionProfiler::dump(const char* filenamep, uint64_t tickEnd)
|
||||
fprintf(fp, "VLPROFEXEC %s %" PRIu64, name, time);
|
||||
|
||||
switch (er.m_type) {
|
||||
case VlExecutionRecord::Type::EVAL_BEGIN:
|
||||
case VlExecutionRecord::Type::EVAL_END:
|
||||
case VlExecutionRecord::Type::EVAL_LOOP_BEGIN:
|
||||
case VlExecutionRecord::Type::EVAL_LOOP_END:
|
||||
case VlExecutionRecord::Type::SECTION_POP:
|
||||
case VlExecutionRecord::Type::EXEC_GRAPH_BEGIN:
|
||||
case VlExecutionRecord::Type::EXEC_GRAPH_END:
|
||||
// No payload
|
||||
fprintf(fp, "\n");
|
||||
break;
|
||||
@ -209,6 +215,11 @@ void VlExecutionProfiler::dump(const char* filenamep, uint64_t tickEnd)
|
||||
fprintf(fp, " id %u predictCost %u\n", payload.m_id, payload.m_predictCost);
|
||||
break;
|
||||
}
|
||||
case VlExecutionRecord::Type::SECTION_PUSH: {
|
||||
const auto& payload = er.m_payload.sectionPush;
|
||||
fprintf(fp, " %s\n", payload.m_name);
|
||||
break;
|
||||
}
|
||||
default: abort(); // LCOV_EXCL_LINE
|
||||
}
|
||||
}
|
||||
|
@ -58,12 +58,12 @@ VL_ATTR_ALWINLINE QData VL_CPU_TICK() {
|
||||
|
||||
// clang-format off
|
||||
#define FOREACH_VlExecutionRecord_TYPE(macro) \
|
||||
_VL_FOREACH_APPLY(macro, EVAL_BEGIN) \
|
||||
_VL_FOREACH_APPLY(macro, EVAL_END) \
|
||||
_VL_FOREACH_APPLY(macro, EVAL_LOOP_BEGIN) \
|
||||
_VL_FOREACH_APPLY(macro, EVAL_LOOP_END) \
|
||||
_VL_FOREACH_APPLY(macro, SECTION_PUSH) \
|
||||
_VL_FOREACH_APPLY(macro, SECTION_POP) \
|
||||
_VL_FOREACH_APPLY(macro, MTASK_BEGIN) \
|
||||
_VL_FOREACH_APPLY(macro, MTASK_END)
|
||||
_VL_FOREACH_APPLY(macro, MTASK_END) \
|
||||
_VL_FOREACH_APPLY(macro, EXEC_GRAPH_BEGIN) \
|
||||
_VL_FOREACH_APPLY(macro, EXEC_GRAPH_END)
|
||||
// clang-format on
|
||||
|
||||
class VlExecutionRecord final {
|
||||
@ -83,6 +83,9 @@ class VlExecutionRecord final {
|
||||
};
|
||||
|
||||
union Payload {
|
||||
struct {
|
||||
const char* m_name; // Name of section being entered
|
||||
} sectionPush;
|
||||
struct {
|
||||
uint32_t m_id; // MTask id
|
||||
uint32_t m_predictStart; // Time scheduler predicted would start
|
||||
@ -109,10 +112,11 @@ public:
|
||||
VlExecutionRecord() = default;
|
||||
|
||||
// METHODS
|
||||
void evalBegin() { m_type = Type::EVAL_BEGIN; }
|
||||
void evalEnd() { m_type = Type::EVAL_END; }
|
||||
void evalLoopBegin() { m_type = Type::EVAL_LOOP_BEGIN; }
|
||||
void evalLoopEnd() { m_type = Type::EVAL_LOOP_END; }
|
||||
void sectionPush(const char* name) {
|
||||
m_payload.sectionPush.m_name = name;
|
||||
m_type = Type::SECTION_PUSH;
|
||||
}
|
||||
void sectionPop() { m_type = Type::SECTION_POP; }
|
||||
void mtaskBegin(uint32_t id, uint32_t predictStart) {
|
||||
m_payload.mtaskBegin.m_id = id;
|
||||
m_payload.mtaskBegin.m_predictStart = predictStart;
|
||||
@ -124,6 +128,8 @@ public:
|
||||
m_payload.mtaskEnd.m_predictCost = predictCost;
|
||||
m_type = Type::MTASK_END;
|
||||
}
|
||||
void execGraphBegin() { m_type = Type::EXEC_GRAPH_BEGIN; }
|
||||
void execGraphEnd() { m_type = Type::EXEC_GRAPH_END; }
|
||||
};
|
||||
|
||||
static_assert(std::is_trivially_destructible<VlExecutionRecord>::value,
|
||||
|
@ -403,7 +403,7 @@ class EmitCModel final : public EmitCFunc {
|
||||
|
||||
if (v3Global.opt.profExec()) {
|
||||
puts("vlSymsp->__Vm_executionProfilerp->configure();\n");
|
||||
puts("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalBegin();\n");
|
||||
puts("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPush(\"eval\");\n");
|
||||
}
|
||||
|
||||
puts("VL_DEBUG_IF(VL_DBG_MSGF(\"+ Eval\\n\"););\n");
|
||||
@ -412,7 +412,7 @@ class EmitCModel final : public EmitCFunc {
|
||||
putsDecoration("// Evaluate cleanup\n");
|
||||
puts("Verilated::endOfEval(vlSymsp->__Vm_evalMsgQp);\n");
|
||||
|
||||
if (v3Global.opt.profExec()) puts("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).evalEnd();\n");
|
||||
if (v3Global.opt.profExec()) puts("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPop();\n");
|
||||
puts("}\n");
|
||||
}
|
||||
|
||||
|
@ -1473,7 +1473,8 @@ AstCFunc* order(AstNetlist* netlistp, //
|
||||
|
||||
// Create the result function
|
||||
AstScope* const scopeTopp = netlistp->topScopep()->scopep();
|
||||
AstCFunc* const funcp = new AstCFunc{netlistp->fileline(), "_eval_" + tag, scopeTopp, ""};
|
||||
FileLine* const flp = netlistp->fileline();
|
||||
AstCFunc* const funcp = new AstCFunc{flp, "_eval_" + tag, scopeTopp, ""};
|
||||
funcp->dontCombine(true);
|
||||
funcp->isStatic(false);
|
||||
funcp->isLoose(true);
|
||||
@ -1482,9 +1483,18 @@ AstCFunc* order(AstNetlist* netlistp, //
|
||||
funcp->declPrivate(true);
|
||||
scopeTopp->addBlocksp(funcp);
|
||||
|
||||
if (v3Global.opt.profExec()) {
|
||||
funcp->addStmtsp(new AstCStmt{flp, "VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPush(\"func "
|
||||
+ tag + "\");\n"});
|
||||
}
|
||||
|
||||
// Add ordered statements to the result function
|
||||
for (AstNode* const nodep : nodeps) funcp->addStmtsp(nodep);
|
||||
|
||||
if (v3Global.opt.profExec()) {
|
||||
funcp->addStmtsp(new AstCStmt{flp, "VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPop();\n"});
|
||||
}
|
||||
|
||||
// Dispose of the remnants of the inputs
|
||||
for (auto* const lbsp : logic) lbsp->deleteActives();
|
||||
|
||||
|
@ -3183,6 +3183,10 @@ static void addThreadStartToExecGraph(AstExecGraph* const execGraphp,
|
||||
execGraphp->addStmtsp(new AstText{fl, text, /* tracking: */ true});
|
||||
};
|
||||
|
||||
if (v3Global.opt.profExec()) {
|
||||
addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).execGraphBegin();\n");
|
||||
}
|
||||
|
||||
addStrStmt("vlSymsp->__Vm_even_cycle__" + tag + " = !vlSymsp->__Vm_even_cycle__" + tag
|
||||
+ ";\n");
|
||||
|
||||
@ -3206,6 +3210,10 @@ static void addThreadStartToExecGraph(AstExecGraph* const execGraphp,
|
||||
|
||||
addStrStmt("vlSelf->__Vm_mtaskstate_final__" + tag
|
||||
+ ".waitUntilUpstreamDone(vlSymsp->__Vm_even_cycle__" + tag + ");\n");
|
||||
|
||||
if (v3Global.opt.profExec()) {
|
||||
addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).execGraphEnd();\n");
|
||||
}
|
||||
}
|
||||
|
||||
static void implementExecGraph(AstExecGraph* const execGraphp) {
|
||||
|
@ -399,6 +399,17 @@ AstSenTree* createTriggerSenTree(AstNetlist* netlistp, AstVarScope* const vscp,
|
||||
return resultp;
|
||||
}
|
||||
|
||||
//============================================================================
|
||||
// Utility for creating profiling statements
|
||||
|
||||
AstNodeStmt* profExecSectionPush(FileLine* flp, const string& name) {
|
||||
return new AstCStmt{flp, "VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPush(\"" + name + "\");\n"};
|
||||
};
|
||||
|
||||
AstNodeStmt* profExecSectionPop(FileLine* flp) {
|
||||
return new AstCStmt{flp, "VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPop();\n"};
|
||||
};
|
||||
|
||||
//============================================================================
|
||||
// Utility for extra trigger allocation
|
||||
|
||||
@ -442,6 +453,7 @@ const TriggerKit createTriggers(AstNetlist* netlistp, AstCFunc* const initFuncp,
|
||||
|
||||
// Create the trigger computation function
|
||||
AstCFunc* const funcp = makeSubFunction(netlistp, "_eval_triggers__" + name, slow);
|
||||
if (v3Global.opt.profExec()) funcp->addStmtsp(profExecSectionPush(flp, "trig " + name));
|
||||
|
||||
// Create the trigger dump function (for debugging, always 'slow')
|
||||
AstCFunc* const dumpp = makeSubFunction(netlistp, "_dump_triggers__" + name, true);
|
||||
@ -581,6 +593,8 @@ const TriggerKit createTriggers(AstNetlist* netlistp, AstCFunc* const initFuncp,
|
||||
add("#endif\n");
|
||||
}
|
||||
|
||||
if (v3Global.opt.profExec()) funcp->addStmtsp(profExecSectionPop(flp));
|
||||
|
||||
// The debug code might leak signal names, so simply delete it when using --protect-ids
|
||||
if (v3Global.opt.protectIds()) dumpp->stmtsp()->unlinkFrBackWithNext()->deleteTree();
|
||||
|
||||
@ -636,7 +650,9 @@ EvalLoop makeEvalLoop(AstNetlist* netlistp, const string& tag, const string& nam
|
||||
AstVarScope* const continuep = scopeTopp->createTemp("__V" + tag + "Continue", 1);
|
||||
continuep->varp()->noReset(true);
|
||||
|
||||
AstNodeStmt* nodep = setVar(counterp, 0);
|
||||
AstNodeStmt* nodep = nullptr;
|
||||
if (v3Global.opt.profExec()) nodep = profExecSectionPush(flp, "loop " + tag);
|
||||
nodep = AstNode::addNext(nodep, setVar(counterp, 0));
|
||||
nodep->addNext(buildLoop(netlistp, continuep, [&](AstWhile* loopp) {
|
||||
// Compute triggers
|
||||
loopp->addStmtsp(computeTriggers());
|
||||
@ -689,6 +705,8 @@ EvalLoop makeEvalLoop(AstNetlist* netlistp, const string& tag, const string& nam
|
||||
}
|
||||
}));
|
||||
|
||||
if (v3Global.opt.profExec()) nodep->addNext(profExecSectionPop(flp));
|
||||
|
||||
return {counterp, continuep, nodep};
|
||||
}
|
||||
|
||||
|
@ -49,7 +49,7 @@ if ($Self->{vltmt}) {
|
||||
file_grep("$Self->{obj_dir}/gantt.log", qr/Total threads += 1/i);
|
||||
file_grep("$Self->{obj_dir}/gantt.log", qr/Total mtasks += 0/i);
|
||||
}
|
||||
file_grep("$Self->{obj_dir}/gantt.log", qr/Total evals += 2/i);
|
||||
file_grep("$Self->{obj_dir}/gantt.log", qr/\|\s+2\s+\|\s+2\.0+\s+\|\s+eval/i);
|
||||
|
||||
# Diff to itself, just to check parsing
|
||||
vcd_identical("$Self->{obj_dir}/profile_exec.vcd", "$Self->{obj_dir}/profile_exec.vcd");
|
||||
|
@ -900,22 +900,18 @@ VLPROFPROC address sizes : 43 bits physical, 48 bits virtual
|
||||
VLPROFPROC power management: ts ttp tm hwpstate cpb eff_freq_ro [13] [14]
|
||||
VLPROFPROC
|
||||
VLPROFTHREAD 0
|
||||
VLPROFEXEC EVAL_BEGIN 595
|
||||
VLPROFEXEC EVAL_LOOP_BEGIN 945
|
||||
VLPROFEXEC EXEC_GRAPH_BEGIN 945
|
||||
VLPROFEXEC MTASK_BEGIN 2695 id 6 predictStart 0 cpu 19
|
||||
VLPROFEXEC MTASK_END 2905 id 6 predictCost 30
|
||||
VLPROFEXEC MTASK_BEGIN 9695 id 10 predictStart 196 cpu 19
|
||||
VLPROFEXEC MTASK_END 9870 id 10 predictCost 30
|
||||
VLPROFEXEC EVAL_LOOP_END 12180
|
||||
VLPROFEXEC EVAL_END 12250
|
||||
VLPROFEXEC EVAL_BEGIN 13720
|
||||
VLPROFEXEC EVAL_LOOP_BEGIN 14000
|
||||
VLPROFEXEC EXEC_GRAPH_END 12180
|
||||
VLPROFEXEC EXEC_GRAPH_BEGIN 14000
|
||||
VLPROFEXEC MTASK_BEGIN 15610 id 6 predictStart 0 cpu 19
|
||||
VLPROFEXEC MTASK_END 15820 id 6 predictCost 30
|
||||
VLPROFEXEC MTASK_BEGIN 21700 id 10 predictStart 196 cpu 19
|
||||
VLPROFEXEC MTASK_END 21875 id 10 predictCost 30
|
||||
VLPROFEXEC EVAL_LOOP_END 22085
|
||||
VLPROFEXEC EVAL_END 22330
|
||||
VLPROFEXEC EXEC_GRAPH_END 22085
|
||||
VLPROFTHREAD 1
|
||||
VLPROFEXEC MTASK_BEGIN 5495 id 5 predictStart 0 cpu 10
|
||||
VLPROFEXEC MTASK_END 6090 id 5 predictCost 30
|
||||
|
@ -4,33 +4,44 @@ Argument settings:
|
||||
+verilator+prof+exec+start+2
|
||||
+verilator+prof+exec+window+2
|
||||
|
||||
Analysis:
|
||||
Total threads = 2
|
||||
Total mtasks = 7
|
||||
Total cpus used = 2
|
||||
Total yields = 0
|
||||
Total evals = 2
|
||||
Total eval loops = 2
|
||||
Total eval time = 21875 rdtsc ticks
|
||||
Longest mtask time = 1190 rdtsc ticks
|
||||
All-thread mtask time = 5495 rdtsc ticks
|
||||
Longest-thread efficiency = 5.4%
|
||||
All-thread efficiency = 12.6%
|
||||
All-thread speedup = 0.3
|
||||
Summary:
|
||||
Total elapsed time = 23415 rdtsc ticks
|
||||
Parallelized code = 82.51% of elapsed time
|
||||
Total threads = 2
|
||||
Total CPUs used = 2
|
||||
Total mtasks = 7
|
||||
Total yields = 0
|
||||
|
||||
Prediction (what Verilator used for scheduling):
|
||||
All-thread efficiency = 63.2%
|
||||
All-thread speedup = 1.3
|
||||
Parallelized code, measured:
|
||||
Thread utilization = 14.22%
|
||||
Speedup = 0.284x
|
||||
|
||||
Parallelized code, predicted during static scheduling:
|
||||
Thread utilization = 63.22%
|
||||
Speedup = 1.26x
|
||||
|
||||
All code, measured:
|
||||
Thread utilization = 20.48%
|
||||
Speedup = 0.41x
|
||||
|
||||
All code, measured, scaled by predicted speedup:
|
||||
Thread utilization = 56.80%
|
||||
Speedup = 1.14x
|
||||
|
||||
MTask statistics:
|
||||
Longest mtask id = 5
|
||||
Longest mtask time = 6.16% of time elapsed in parallelized code
|
||||
min log(p2e) = -3.681 from mtask 5 (predict 30, elapsed 1190)
|
||||
max log(p2e) = -2.409 from mtask 8 (predict 107, elapsed 1190)
|
||||
mean = -2.992
|
||||
stddev = 0.459
|
||||
e ^ stddev = 1.583
|
||||
|
||||
CPUs:
|
||||
cpu 10: cpu_time=4725 socket=0 core=10 Test Ryzen 9 3950X 16-Core Processor
|
||||
cpu 19: cpu_time=770 socket=0 core=3 Test Ryzen 9 3950X 16-Core Processor
|
||||
CPU info:
|
||||
Id | Time spent executing MTask | Socket | Core | Model
|
||||
| % of elapsed ticks / ticks | | |
|
||||
====|============================|========|======|======
|
||||
10 | 20.18% / 4725 | 0 | 10 | Test Ryzen 9 3950X 16-Core Processor
|
||||
19 | 3.29% / 770 | 0 | 3 | Test Ryzen 9 3950X 16-Core Processor
|
||||
|
||||
Writing profile_exec.vcd
|
||||
|
@ -2,34 +2,21 @@ $version Generated by verilator_gantt $end
|
||||
$timescale 1ns $end
|
||||
|
||||
$scope module gantt $end
|
||||
$scope module Stats $end
|
||||
$var wire 32 vf measured_parallelism [31:0] $end
|
||||
$var wire 32 v10 predicted_parallelism [31:0] $end
|
||||
$upscope $end
|
||||
$scope module cpus $end
|
||||
$var wire 32 v5 cpu10_thread [31:0] $end
|
||||
$scope module measured $end
|
||||
$var wire 32 v5 cpu10_mtask [31:0] $end
|
||||
$var wire 32 v4 cpu10_thread [31:0] $end
|
||||
$var wire 32 v2 cpu19_mtask [31:0] $end
|
||||
$var wire 32 v1 cpu19_thread [31:0] $end
|
||||
$var wire 32 v0 t0_mtask [31:0] $end
|
||||
$var wire 32 v3 t1_mtask [31:0] $end
|
||||
$upscope $end
|
||||
$scope module evals $end
|
||||
$var wire 32 vb eval [31:0] $end
|
||||
$var wire 32 vc eval_loop [31:0] $end
|
||||
$scope module predicted $end
|
||||
$var wire 32 v6 t0_mtask [31:0] $end
|
||||
$var wire 32 v7 t1_mtask [31:0] $end
|
||||
$upscope $end
|
||||
$scope module measured_threads $end
|
||||
$var wire 32 v0 thread0_mtask [31:0] $end
|
||||
$var wire 32 v4 thread1_mtask [31:0] $end
|
||||
$upscope $end
|
||||
$scope module mtasks $end
|
||||
$var wire 32 v3 mtask10_cpu [31:0] $end
|
||||
$var wire 32 va mtask11_cpu [31:0] $end
|
||||
$var wire 32 v6 mtask5_cpu [31:0] $end
|
||||
$var wire 32 v2 mtask6_cpu [31:0] $end
|
||||
$var wire 32 v7 mtask7_cpu [31:0] $end
|
||||
$var wire 32 v8 mtask8_cpu [31:0] $end
|
||||
$var wire 32 v9 mtask9_cpu [31:0] $end
|
||||
$upscope $end
|
||||
$scope module predicted_threads $end
|
||||
$var wire 32 vd predicted_thread0_mtask [31:0] $end
|
||||
$var wire 32 ve predicted_thread1_mtask [31:0] $end
|
||||
$scope module stats $end
|
||||
$var wire 32 v8 measured_parallelism [31:0] $end
|
||||
$var wire 32 v9 predicted_parallelism [31:0] $end
|
||||
$upscope $end
|
||||
$upscope $end
|
||||
$enddefinitions $end
|
||||
@ -43,216 +30,193 @@ bz v4
|
||||
bz v5
|
||||
bz v6
|
||||
bz v7
|
||||
bz v8
|
||||
bz v9
|
||||
bz va
|
||||
bz vb
|
||||
bz vc
|
||||
bz vd
|
||||
bz ve
|
||||
b0 vf
|
||||
b0 v10
|
||||
#595
|
||||
b1 vb
|
||||
#945
|
||||
b1 vc
|
||||
b110 vd
|
||||
b101 ve
|
||||
b10 v10
|
||||
#2429
|
||||
bz vd
|
||||
b111 ve
|
||||
b1 v10
|
||||
b0 v8
|
||||
b0 v9
|
||||
#2695
|
||||
b110 v0
|
||||
b0 v1
|
||||
b10011 v2
|
||||
b1 vf
|
||||
b110 v2
|
||||
b110 v6
|
||||
b101 v7
|
||||
b1 v8
|
||||
b10 v9
|
||||
#2905
|
||||
bz v0
|
||||
bz v1
|
||||
bz v2
|
||||
b0 vf
|
||||
#3914
|
||||
b1000 ve
|
||||
b1 v10
|
||||
#5495
|
||||
b101 v4
|
||||
b1 v5
|
||||
b1010 v6
|
||||
b1 vf
|
||||
#6090
|
||||
bz v4
|
||||
bz v5
|
||||
b0 v8
|
||||
#3800
|
||||
bz v6
|
||||
b0 vf
|
||||
b111 v7
|
||||
b1 v9
|
||||
#4906
|
||||
b1000 v7
|
||||
b1 v9
|
||||
#5495
|
||||
b101 v3
|
||||
b1 v4
|
||||
b101 v5
|
||||
b1 v8
|
||||
#6090
|
||||
bz v3
|
||||
bz v4
|
||||
bz v5
|
||||
b0 v8
|
||||
#6300
|
||||
b111 v4
|
||||
b1 v5
|
||||
b1010 v7
|
||||
b1 vf
|
||||
b111 v3
|
||||
b1 v4
|
||||
b111 v5
|
||||
b1 v8
|
||||
#6895
|
||||
bz v3
|
||||
bz v4
|
||||
bz v5
|
||||
bz v7
|
||||
b0 vf
|
||||
b0 v8
|
||||
#7490
|
||||
b1000 v4
|
||||
b1 v5
|
||||
b1010 v8
|
||||
b1 vf
|
||||
b1000 v3
|
||||
b1 v4
|
||||
b1000 v5
|
||||
b1 v8
|
||||
#8540
|
||||
bz v3
|
||||
bz v4
|
||||
bz v5
|
||||
bz v8
|
||||
b0 vf
|
||||
b0 v8
|
||||
#8848
|
||||
b1001 v7
|
||||
b1 v9
|
||||
#9135
|
||||
b1001 v4
|
||||
b1 v5
|
||||
b1010 v9
|
||||
b1 vf
|
||||
#9210
|
||||
b1001 ve
|
||||
b1 v10
|
||||
b1001 v3
|
||||
b1 v4
|
||||
b1001 v5
|
||||
b1 v8
|
||||
#9695
|
||||
b1010 v0
|
||||
b0 v1
|
||||
b10011 v3
|
||||
b10 vf
|
||||
b1010 v2
|
||||
b10 v8
|
||||
#9730
|
||||
bz v3
|
||||
bz v4
|
||||
bz v5
|
||||
bz v9
|
||||
b1 vf
|
||||
b1 v8
|
||||
#9870
|
||||
bz v0
|
||||
bz v1
|
||||
bz v3
|
||||
b0 vf
|
||||
bz v2
|
||||
b0 v8
|
||||
#9917
|
||||
b1010 v6
|
||||
b10 v9
|
||||
#9954
|
||||
b1011 v7
|
||||
b10 v9
|
||||
#10255
|
||||
b1011 v4
|
||||
b1 v5
|
||||
b1010 va
|
||||
b1 vf
|
||||
#10645
|
||||
b1010 vd
|
||||
b10 v10
|
||||
#10695
|
||||
b1011 ve
|
||||
b10 v10
|
||||
b1011 v3
|
||||
b1 v4
|
||||
b1011 v5
|
||||
b1 v8
|
||||
#11023
|
||||
bz v6
|
||||
b1 v9
|
||||
#11060
|
||||
bz v3
|
||||
bz v4
|
||||
bz v5
|
||||
bz va
|
||||
b0 vf
|
||||
#12130
|
||||
bz vd
|
||||
b1 v10
|
||||
#12180
|
||||
bz vc
|
||||
bz ve
|
||||
b0 v10
|
||||
#12250
|
||||
bz vb
|
||||
#13720
|
||||
b10 vb
|
||||
#14000
|
||||
b10 vc
|
||||
b110 vd
|
||||
b101 ve
|
||||
b10 v10
|
||||
#15068
|
||||
bz vd
|
||||
b111 ve
|
||||
b1 v10
|
||||
bz v7
|
||||
b0 v8
|
||||
b0 v9
|
||||
#15610
|
||||
b110 v0
|
||||
b0 v1
|
||||
b10011 v2
|
||||
b1 vf
|
||||
b110 v2
|
||||
b110 v6
|
||||
b101 v7
|
||||
b1 v8
|
||||
b10 v9
|
||||
#15820
|
||||
bz v0
|
||||
bz v1
|
||||
bz v2
|
||||
b0 vf
|
||||
#16137
|
||||
b1000 ve
|
||||
b1 v10
|
||||
#18375
|
||||
b101 v4
|
||||
b1 v5
|
||||
b1010 v6
|
||||
b1 vf
|
||||
#18970
|
||||
bz v4
|
||||
bz v5
|
||||
b0 v8
|
||||
#16437
|
||||
bz v6
|
||||
b0 vf
|
||||
b111 v7
|
||||
b1 v9
|
||||
#17265
|
||||
b1000 v7
|
||||
b1 v9
|
||||
#18375
|
||||
b101 v3
|
||||
b1 v4
|
||||
b101 v5
|
||||
b1 v8
|
||||
#18970
|
||||
bz v3
|
||||
bz v4
|
||||
bz v5
|
||||
b0 v8
|
||||
#19145
|
||||
b111 v4
|
||||
b1 v5
|
||||
b1010 v7
|
||||
b1 vf
|
||||
b111 v3
|
||||
b1 v4
|
||||
b111 v5
|
||||
b1 v8
|
||||
#19320
|
||||
bz v3
|
||||
bz v4
|
||||
bz v5
|
||||
bz v7
|
||||
b0 vf
|
||||
b0 v8
|
||||
#19670
|
||||
b1000 v4
|
||||
b1 v5
|
||||
b1010 v8
|
||||
b1 vf
|
||||
b1000 v3
|
||||
b1 v4
|
||||
b1000 v5
|
||||
b1 v8
|
||||
#19810
|
||||
bz v3
|
||||
bz v4
|
||||
bz v5
|
||||
bz v8
|
||||
b0 vf
|
||||
#19947
|
||||
b1001 ve
|
||||
b1 v10
|
||||
b0 v8
|
||||
#20219
|
||||
b1001 v7
|
||||
b1 v9
|
||||
#20650
|
||||
b1001 v4
|
||||
b1 v5
|
||||
b1010 v9
|
||||
b1 vf
|
||||
b1001 v3
|
||||
b1 v4
|
||||
b1001 v5
|
||||
b1 v8
|
||||
#20720
|
||||
bz v3
|
||||
bz v4
|
||||
bz v5
|
||||
bz v9
|
||||
b0 vf
|
||||
#20980
|
||||
b1010 vd
|
||||
b10 v10
|
||||
#21016
|
||||
b1011 ve
|
||||
b10 v10
|
||||
b0 v8
|
||||
#21019
|
||||
b1010 v6
|
||||
b10 v9
|
||||
#21047
|
||||
b1011 v7
|
||||
b10 v9
|
||||
#21140
|
||||
b1011 v4
|
||||
b1 v5
|
||||
b1010 va
|
||||
b1 vf
|
||||
b1011 v3
|
||||
b1 v4
|
||||
b1011 v5
|
||||
b1 v8
|
||||
#21245
|
||||
bz v3
|
||||
bz v4
|
||||
bz v5
|
||||
bz va
|
||||
b0 vf
|
||||
b0 v8
|
||||
#21700
|
||||
b1010 v0
|
||||
b0 v1
|
||||
b10011 v3
|
||||
b1 vf
|
||||
b1010 v2
|
||||
b1 v8
|
||||
#21847
|
||||
bz v6
|
||||
b1 v9
|
||||
#21875
|
||||
bz v0
|
||||
bz v1
|
||||
bz v3
|
||||
b0 vf
|
||||
#22049
|
||||
bz vd
|
||||
b1 v10
|
||||
#22085
|
||||
bz vc
|
||||
bz ve
|
||||
b0 v10
|
||||
#22330
|
||||
bz vb
|
||||
bz v2
|
||||
bz v7
|
||||
b0 v8
|
||||
b0 v9
|
||||
|
@ -44,14 +44,12 @@ VLPROFPROC CPU part : 0x663
|
||||
VLPROFPROC CPU revision : 3
|
||||
VLPROFPROC
|
||||
VLPROFTHREAD 0
|
||||
VLPROFEXEC EVAL_BEGIN 57709
|
||||
VLPROFEXEC EVAL_LOOP_BEGIN 58532
|
||||
VLPROFEXEC EXEC_GRAPH_BEGIN 58532
|
||||
VLPROFEXEC MTASK_BEGIN 90465 id 85 predictStart 14315 cpu 2
|
||||
VLPROFEXEC MTASK_END 155034 id 85 predictCost 30533
|
||||
VLPROFEXEC MTASK_BEGIN 156555 id 79 predictStart 44848 cpu 2
|
||||
VLPROFEXEC MTASK_END 294309 id 79 predictCost 48001
|
||||
VLPROFEXEC EVAL_LOOP_END 18028850
|
||||
VLPROFEXEC EVAL_END 1803680
|
||||
VLPROFEXEC EXEC_GRAPH_END 300000
|
||||
VLPROFTHREAD 1
|
||||
VLPROFEXEC MTASK_BEGIN 77352 id 90 predictStart 14315 cpu 3
|
||||
VLPROFEXEC MTASK_END 78511 id 90 predictCost 21592
|
||||
@ -59,4 +57,4 @@ VLPROFEXEC MTASK_BEGIN 79799 id 81 predictStart 35907 cpu 3
|
||||
VLPROFEXEC MTASK_END 80667 id 81 predictCost 29215
|
||||
VLPROFEXEC MTASK_BEGIN 81746 id 87 predictStart 65147 cpu 3
|
||||
VLPROFEXEC MTASK_END 82633 id 87 predictCost 33809
|
||||
VLPROF stat ticks 180832
|
||||
VLPROF stat ticks 300000
|
||||
|
@ -4,33 +4,44 @@ Argument settings:
|
||||
+verilator+prof+exec+start+1
|
||||
+verilator+prof+exec+window+2
|
||||
|
||||
Analysis:
|
||||
Total threads = 2
|
||||
Total mtasks = 5
|
||||
Total cpus used = 2
|
||||
Total yields = 51
|
||||
Total evals = 1
|
||||
Total eval loops = 1
|
||||
Total eval time = 294309 rdtsc ticks
|
||||
Longest mtask time = 137754 rdtsc ticks
|
||||
All-thread mtask time = 205237 rdtsc ticks
|
||||
Longest-thread efficiency = 46.8%
|
||||
All-thread efficiency = 34.9%
|
||||
All-thread speedup = 0.7
|
||||
Summary:
|
||||
Total elapsed time = 300000 rdtsc ticks
|
||||
Parallelized code = 80.49% of elapsed time
|
||||
Total threads = 2
|
||||
Total CPUs used = 2
|
||||
Total mtasks = 5
|
||||
Total yields = 51
|
||||
|
||||
Prediction (what Verilator used for scheduling):
|
||||
All-thread efficiency = 82.4%
|
||||
All-thread speedup = 1.6
|
||||
Parallelized code, measured:
|
||||
Thread utilization = 42.50%
|
||||
Speedup = 0.85x
|
||||
|
||||
Parallelized code, predicted during static scheduling:
|
||||
Thread utilization = 82.44%
|
||||
Speedup = 1.65x
|
||||
|
||||
All code, measured:
|
||||
Thread utilization = 43.96%
|
||||
Speedup = 0.879x
|
||||
|
||||
All code, measured, scaled by predicted speedup:
|
||||
Thread utilization = 72.06%
|
||||
Speedup = 1.44x
|
||||
|
||||
MTask statistics:
|
||||
Longest mtask id = 79
|
||||
Longest mtask time = 57.05% of time elapsed in parallelized code
|
||||
min log(p2e) = -1.054 from mtask 79 (predict 48001, elapsed 137754)
|
||||
max log(p2e) = 3.641 from mtask 87 (predict 33809, elapsed 887)
|
||||
mean = 1.656
|
||||
stddev = 2.104
|
||||
e ^ stddev = 8.200
|
||||
|
||||
CPUs:
|
||||
cpu 2: cpu_time=202323 Phytium,FT-2500/128
|
||||
cpu 3: cpu_time=2914 Phytium,FT-2500/128
|
||||
CPU info:
|
||||
Id | Time spent executing MTask | Socket | Core | Model
|
||||
| % of elapsed ticks / ticks | | |
|
||||
====|============================|========|======|======
|
||||
2 | 67.44% / 202323 | | | Phytium,FT-2500/128
|
||||
3 | 0.97% / 2914 | | | Phytium,FT-2500/128
|
||||
|
||||
Writing profile_exec.vcd
|
||||
|
@ -4,22 +4,18 @@ VLPROF arg +verilator+prof+exec+window+2
|
||||
VLPROF stat threads 2
|
||||
VLPROF stat yields 0
|
||||
VLPROFTHREAD 0
|
||||
VLPROFEXEC EVAL_BEGIN 595
|
||||
VLPROFEXEC EVAL_LOOP_BEGIN 945
|
||||
VLPROFEXEC EXEC_GRAPH_BEGIN 945
|
||||
VLPROFEXEC MTASK_BEGIN 2695 id 6 predictStart 0 cpu 19
|
||||
VLPROFEXEC MTASK_END 2905 id 6 predictCost 30
|
||||
VLPROFEXEC MTASK_BEGIN 9695 id 10 predictStart 196 cpu 19
|
||||
VLPROFEXEC MTASK_END 9870 id 10 predictCost 30
|
||||
VLPROFEXEC EVAL_LOOP_END 12180
|
||||
VLPROFEXEC EVAL_END 12250
|
||||
VLPROFEXEC EVAL_BEGIN 13720
|
||||
VLPROFEXEC EVAL_LOOP_BEGIN 14000
|
||||
VLPROFEXEC EXEC_GRAPH_END 12180
|
||||
VLPROFEXEC EXEC_GRAPH_BEGIN 14000
|
||||
VLPROFEXEC MTASK_BEGIN 15610 id 6 predictStart 0 cpu 19
|
||||
VLPROFEXEC MTASK_END 15820 id 6 predictCost 30
|
||||
VLPROFEXEC MTASK_BEGIN 21700 id 10 predictStart 196 cpu 19
|
||||
VLPROFEXEC MTASK_END 21875 id 10 predictCost 30
|
||||
VLPROFEXEC EVAL_LOOP_END 22085
|
||||
VLPROFEXEC EVAL_END 22330
|
||||
VLPROFEXEC EXEC_GRAPH_END 22085
|
||||
VLPROFTHREAD 1
|
||||
VLPROFEXEC MTASK_BEGIN 5495 id 5 predictStart 0 cpu 10
|
||||
VLPROFEXEC MTASK_END 6090 id 5 predictCost 30
|
||||
|
@ -4,32 +4,43 @@ Argument settings:
|
||||
+verilator+prof+exec+start+2
|
||||
+verilator+prof+exec+window+2
|
||||
|
||||
Analysis:
|
||||
Total threads = 2
|
||||
Total mtasks = 7
|
||||
Total cpus used = 2
|
||||
Total yields = 0
|
||||
Total evals = 2
|
||||
Total eval loops = 2
|
||||
Total eval time = 21875 rdtsc ticks
|
||||
Longest mtask time = 1190 rdtsc ticks
|
||||
All-thread mtask time = 5495 rdtsc ticks
|
||||
Longest-thread efficiency = 5.4%
|
||||
All-thread efficiency = 12.6%
|
||||
All-thread speedup = 0.3
|
||||
Summary:
|
||||
Total elapsed time = 23415 rdtsc ticks
|
||||
Parallelized code = 82.51% of elapsed time
|
||||
Total threads = 2
|
||||
Total CPUs used = 2
|
||||
Total mtasks = 7
|
||||
Total yields = 0
|
||||
|
||||
Prediction (what Verilator used for scheduling):
|
||||
All-thread efficiency = 63.2%
|
||||
All-thread speedup = 1.3
|
||||
Parallelized code, measured:
|
||||
Thread utilization = 14.22%
|
||||
Speedup = 0.284x
|
||||
|
||||
Parallelized code, predicted during static scheduling:
|
||||
Thread utilization = 63.22%
|
||||
Speedup = 1.26x
|
||||
|
||||
All code, measured:
|
||||
Thread utilization = 20.48%
|
||||
Speedup = 0.41x
|
||||
|
||||
All code, measured, scaled by predicted speedup:
|
||||
Thread utilization = 56.80%
|
||||
Speedup = 1.14x
|
||||
|
||||
MTask statistics:
|
||||
Longest mtask id = 5
|
||||
Longest mtask time = 6.16% of time elapsed in parallelized code
|
||||
min log(p2e) = -3.681 from mtask 5 (predict 30, elapsed 1190)
|
||||
max log(p2e) = -2.409 from mtask 8 (predict 107, elapsed 1190)
|
||||
mean = -2.992
|
||||
stddev = 0.459
|
||||
e ^ stddev = 1.583
|
||||
|
||||
CPUs:
|
||||
cpu 10: cpu_time=4725
|
||||
cpu 19: cpu_time=770
|
||||
CPU info:
|
||||
Id | Time spent executing MTask | Socket | Core | Model
|
||||
| % of elapsed ticks / ticks | | |
|
||||
====|============================|========|======|======
|
||||
10 | 20.18% / 4725 | | |
|
||||
19 | 3.29% / 770 | | |
|
||||
|
||||
|
@ -54,7 +54,7 @@ if ($Self->{vltmt}) {
|
||||
file_grep("$Self->{obj_dir}/gantt.log", qr/Total threads += 1/i);
|
||||
file_grep("$Self->{obj_dir}/gantt.log", qr/Total mtasks += 0/i);
|
||||
}
|
||||
file_grep("$Self->{obj_dir}/gantt.log", qr/Total evals += 4/i);
|
||||
file_grep("$Self->{obj_dir}/gantt.log", qr/\|\s+4\s+\|\s+4\.0+\s+\|\s+eval/i);
|
||||
|
||||
# Diff to itself, just to check parsing
|
||||
vcd_identical("$Self->{obj_dir}/profile_exec.vcd", "$Self->{obj_dir}/profile_exec.vcd");
|
||||
|
Loading…
Reference in New Issue
Block a user