#!/usr/bin/env python3
# pylint: disable=C0103,C0114,C0116,C0301,R0914,R0912,R0915,W0511,eval-used
######################################################################

import argparse
import collections
import math
import re
import statistics
# from pprint import pprint

Threads = collections.defaultdict(lambda: {})
Mtasks = collections.defaultdict(lambda: {})
Evals = collections.defaultdict(lambda: {})
EvalLoops = collections.defaultdict(lambda: {})
Global = {
    'args': {},
    'cpuinfo': collections.defaultdict(lambda: {}),
    'rdtsc_cycle_time': 0,
    'stats': {}
}

######################################################################


def process(filename):
    read_data(filename)
    report()


def read_data(filename):
    with open(filename) as fh:
        re_prof = re.compile(
            r'^VLPROF mtask\s(\d+)\sstart\s(\d+)\selapsed\s(\d+)\spredict_start\s(\d+)\spredict_cost\s(\d+)\scpu\s(\d+)\son thread (\d+)'
        )
        re_eval = re.compile(r'^VLPROF eval\sstart\s(\d+)\selapsed\s(\d+)')
        re_loop = re.compile(r'^VLPROF eval_loop\sstart\s(\d+)\selapsed\s(\d+)')
        re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.]*)\s*')
        re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.]*)\s*$')
        re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
        re_time = re.compile(r'rdtsc time = (\d+) ticks')
        re_proc_cpu = re.compile(r'VLPROFPROC processor\s*:\s*(\d+)\s*$')
        re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$')
        cpu = None

        for line in fh:
            if re_prof.match(line):
                match = re_prof.match(line)
                mtask = int(match.group(1))
                start = int(match.group(2))
                elapsed_time = int(match.group(3))
                end = start + elapsed_time
                predict_start = int(match.group(4))
                predict_cost = int(match.group(5))
                cpu = int(match.group(6))
                thread = int(match.group(7))
                if start not in Threads[thread]:
                    Threads[thread][start] = {}
                Threads[thread][start]['mtask'] = mtask
                Threads[thread][start]['end'] = end
                Threads[thread][start]['cpu'] = cpu
                Threads[thread][start]['predict_start'] = predict_start
                Threads[thread][start]['predict_cost'] = predict_cost

                if 'elapsed' not in Mtasks[mtask]:
                    Mtasks[mtask] = {'end': 0, 'elapsed': 0}
                Mtasks[mtask]['thread'] = thread
                Mtasks[mtask]['elapsed'] += elapsed_time
                Mtasks[mtask]['predict_start'] = predict_start
                Mtasks[mtask]['predict_cost'] = predict_cost
                Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], end)
            elif re_eval.match(line):
                match = re_eval.match(line)
                start = int(match.group(1))
                elapsed_time = int(match.group(2))
                Evals[start]['start'] = start
                Evals[start]['end'] = start + elapsed_time
            elif re_loop.match(line):
                match = re_loop.match(line)
                start = int(match.group(1))
                elapsed_time = int(match.group(2))
                EvalLoops[start]['start'] = start
                EvalLoops[start]['end'] = start + elapsed_time
            elif re.match(r'^VLPROFTHREAD', line):
                pass
            elif re_arg1.match(line):
                match = re_arg1.match(line)
                Global['args'][match.group(1)] = match.group(2)
            elif re_arg2.match(line):
                match = re_arg2.match(line)
                Global['args'][match.group(1)] = match.group(2)
            elif re_stat.match(line):
                match = re_stat.match(line)
                Global['stats'][match.group(1)] = match.group(2)
            elif re_proc_cpu.match(line):
                match = re_proc_cpu.match(line)
                cpu = int(match.group(1))
            elif cpu and re_proc_dat.match(line):
                match = re_proc_dat.match(line)
                term = match.group(1)
                value = match.group(2)
                term = re.sub(r'\s+$', '', term)
                term = re.sub(r'\s+', '_', term)
                value = re.sub(r'\s+$', '', value)
                Global['cpuinfo'][cpu][term] = value
            elif re.match(r'^#', line):
                pass
            elif Args.debug:
                print("-Unk: %s" % line)

            # TODO -- this is parsing text printed by a client.
            # Really, verilator proper should generate this
            # if it's useful...
            match = re_time.match(line)
            if match:
                Global['rdtsc_cycle_time'] = int(match.group(1))
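
# For reference, an example of the profile lines read_data() consumes.
# The values below are illustrative only; the field layout follows the
# regexps above:
#   VLPROF mtask 5 start 12345 elapsed 880 predict_start 12000 predict_cost 800 cpu 3 on thread 1
#   VLPROF eval start 100 elapsed 99000
#   VLPROF stat yields 12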


def re_match_result(regexp, line, result_to):
    result_to = re.match(regexp, line)
    return result_to


######################################################################


def report():
    print("Verilator Gantt report")

    print("\nArgument settings:")
    for arg in sorted(Global['args'].keys()):
        plus = "+" if re.match(r'^\+', arg) else " "
        print("  %s%s%s" % (arg, plus, Global['args'][arg]))

    nthreads = len(Threads)
    Global['cpus'] = {}
    for thread in Threads:
        # Make potentially multiple characters per column
        for start in Threads[thread]:
            cpu = Threads[thread][start]['cpu']
            elapsed = Threads[thread][start]['end'] - start
            if cpu not in Global['cpus']:
                Global['cpus'][cpu] = {'cpu_time': 0}
            Global['cpus'][cpu]['cpu_time'] += elapsed

    measured_mt_mtask_time = 0
    predict_mt_mtask_time = 0
    long_mtask_time = 0
    measured_last_end = 0
    predict_last_end = 0
    for mtask in Mtasks:
        measured_mt_mtask_time += Mtasks[mtask]['elapsed']
        predict_mt_mtask_time += Mtasks[mtask]['predict_cost']
        measured_last_end = max(measured_last_end, Mtasks[mtask]['end'])
        predict_last_end = max(
            predict_last_end,
            Mtasks[mtask]['predict_start'] + Mtasks[mtask]['predict_cost'])
        long_mtask_time = max(long_mtask_time, Mtasks[mtask]['elapsed'])
    Global['measured_last_end'] = measured_last_end
    Global['predict_last_end'] = predict_last_end

    # If we know cycle time in the same (rdtsc) units,
    # this will give us an actual utilization number,
    # (how effectively we keep the cores busy.)
    #
    # It also gives us a number we can compare against
    # serial mode, to estimate the overhead of data sharing,
    # which will show up in the total elapsed time. (Overhead
    # of synchronization and scheduling should not.)
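
    # The efficiency figures printed below are derived from the sums above;
    # in shorthand (using this function's variables):
    #   Longest-thread efficiency = long_mtask_time / measured_last_end
    #   All-thread efficiency     = measured_mt_mtask_time / (measured_last_end * nthreads)
    #   All-thread speedup        = All-thread efficiency * nthreads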
print("\nAnalysis:") print(" Total threads = %d" % nthreads) print(" Total mtasks = %d" % len(Mtasks)) ncpus = len(Global['cpus']) print(" Total cpus used = %d" % ncpus) print(" Total yields = %d" % int(Global['stats']['yields'])) print(" Total evals = %d" % len(Evals)) print(" Total eval loops = %d" % len(EvalLoops)) print(" Total eval time = %d rdtsc ticks" % Global['measured_last_end']) print(" Longest mtask time = %d rdtsc ticks" % long_mtask_time) print(" All-thread mtask time = %d rdtsc ticks" % measured_mt_mtask_time) long_efficiency = long_mtask_time / (Global.get('measured_last_end', 1) or 1) print(" Longest-thread efficiency = %0.1f%%" % (long_efficiency * 100.0)) mt_efficiency = measured_mt_mtask_time / ( Global.get('measured_last_end', 1) * nthreads or 1) print(" All-thread efficiency = %0.1f%%" % (mt_efficiency * 100.0)) print(" All-thread speedup = %0.1f" % (mt_efficiency * nthreads)) if Global['rdtsc_cycle_time'] > 0: ut = measured_mt_mtask_time / Global['rdtsc_cycle_time'] print("tot_mtask_cpu=" + measured_mt_mtask_time + " cyc=" + Global['rdtsc_cycle_time'] + " ut=" + ut) predict_mt_efficiency = predict_mt_mtask_time / ( Global.get('predict_last_end', 1) * nthreads or 1) print("\nPrediction (what Verilator used for scheduling):") print(" All-thread efficiency = %0.1f%%" % (predict_mt_efficiency * 100.0)) print(" All-thread speedup = %0.1f" % (predict_mt_efficiency * nthreads)) p2e_ratios = [] min_p2e = 1000000 min_mtask = None max_p2e = -1000000 max_mtask = None for mtask in sorted(Mtasks.keys()): if Mtasks[mtask]['elapsed'] > 0: if Mtasks[mtask]['predict_cost'] == 0: Mtasks[mtask]['predict_cost'] = 1 # don't log(0) below p2e_ratio = math.log(Mtasks[mtask]['predict_cost'] / Mtasks[mtask]['elapsed']) p2e_ratios.append(p2e_ratio) if p2e_ratio > max_p2e: max_p2e = p2e_ratio max_mtask = mtask if p2e_ratio < min_p2e: min_p2e = p2e_ratio min_mtask = mtask print("\nStatistics:") print(" min log(p2e) = %0.3f" % min_p2e, end="") print(" from mtask %d (predict %d," % (min_mtask, Mtasks[min_mtask]['predict_cost']), end="") print(" elapsed %d)" % Mtasks[min_mtask]['elapsed']) print(" max log(p2e) = %0.3f" % max_p2e, end="") print(" from mtask %d (predict %d," % (max_mtask, Mtasks[max_mtask]['predict_cost']), end="") print(" elapsed %d)" % Mtasks[max_mtask]['elapsed']) stddev = statistics.pstdev(p2e_ratios) mean = statistics.mean(p2e_ratios) print(" mean = %0.3f" % mean) print(" stddev = %0.3f" % stddev) print(" e ^ stddev = %0.3f" % math.exp(stddev)) report_cpus() if nthreads > ncpus: print() print("%%Warning: There were fewer CPUs (%d) then threads (%d)." % (ncpus, nthreads)) print(" : See docs on use of numactl.") else: if 'cpu_socket_cores_warning' in Global: print() print( "%Warning: Multiple threads scheduled on same hyperthreaded core." 
) print(" : See docs on use of numactl.") if 'cpu_sockets_warning' in Global: print() print("%Warning: Threads scheduled on multiple sockets.") print(" : See docs on use of numactl.") print() def report_cpus(): print("\nCPUs:") Global['cpu_sockets'] = collections.defaultdict(lambda: 0) Global['cpu_socket_cores'] = collections.defaultdict(lambda: 0) for cpu in sorted(Global['cpus'].keys()): print(" cpu %d: " % cpu, end='') print("cpu_time=%d" % Global['cpus'][cpu]['cpu_time'], end='') socket = None if cpu in Global['cpuinfo']: cpuinfo = Global['cpuinfo'][cpu] if 'physical_id' in cpuinfo and 'core_id' in cpuinfo: socket = int(cpuinfo['physical_id']) Global['cpu_sockets'][socket] += 1 print(" socket=%d" % socket, end='') core = int(cpuinfo['core_id']) Global['cpu_socket_cores'][str(socket) + "__" + str(core)] += 1 print(" core=%d" % core, end='') if 'model_name' in cpuinfo: model = cpuinfo['model_name'] print(" %s" % model, end='') print() if len(Global['cpu_sockets']) > 1: Global['cpu_sockets_warning'] = True for scn in Global['cpu_socket_cores'].values(): if scn > 1: Global['cpu_socket_cores_warning'] = True ###################################################################### def write_vcd(filename): print("Writing %s" % filename) with open(filename, "w") as fh: vcd = { 'values': collections.defaultdict(lambda: {}), # {