#!/usr/bin/env python3 # pylint: disable=C0103,C0114,C0116,C0301,R0914,R0912,R0915,W0511,eval-used ###################################################################### import argparse import collections import math import re import statistics # from pprint import pprint Threads = collections.defaultdict(lambda: {}) Mtasks = collections.defaultdict(lambda: {}) Global = { 'args': {}, 'cpuinfo': collections.defaultdict(lambda: {}), 'rdtsc_cycle_time': 0, 'stats': {} } ###################################################################### def process(filename): read_data(filename) report() def read_data(filename): with open(filename) as fh: re_prof = re.compile( r'^VLPROF mtask\s(\d+)\sstart\s(\d+)\send\s(\d+)\selapsed\s(\d+)\spredict_time\s(\d+)\scpu\s(\d+)\son thread (\d+)' ) re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.])\s*') re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.])\s*$') re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)') re_time = re.compile(r'rdtsc time = (\d+) ticks') re_proc_cpu = re.compile(r'VLPROFPROC processor\s*:\s*(\d+)\s*$') re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$') cpu = None for line in fh: if re_prof.match(line): match = re_prof.match(line) mtask = int(match.group(1)) start = int(match.group(2)) end = int(match.group(3)) elapsed_time = int(match.group(4)) predict_time = int(match.group(5)) cpu = int(match.group(6)) thread = int(match.group(7)) if start not in Threads[thread]: Threads[thread][start] = {} Threads[thread][start]['mtask'] = mtask Threads[thread][start]['end'] = end Threads[thread][start]['cpu'] = cpu if 'elapsed' not in Mtasks[mtask]: Mtasks[mtask] = {'end': 0, 'elapsed': 0} Mtasks[mtask]['elapsed'] += elapsed_time Mtasks[mtask]['predict'] = predict_time Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], end) elif re.match(r'^VLPROFTHREAD', line): None # pylint: disable=pointless-statement elif re_arg1.match(line): match = re_arg1.match(line) Global['args'][match.group(1)] = match.group(2) elif re_arg2.match(line): match = re_arg2.match(line) Global['args'][match.group(1)] = match.group(2) elif re_stat.match(line): match = re_stat.match(line) Global['stats'][match.group(1)] = match.group(2) elif re_proc_cpu.match(line): match = re_proc_cpu.match(line) cpu = int(match.group(1)) elif cpu and re_proc_dat.match(line): match = re_proc_dat.match(line) term = match.group(1) value = match.group(2) term = re.sub(r'\s+$', '', term) term = re.sub(r'\s+', '_', term) value = re.sub(r'\s+$', '', value) Global['cpuinfo'][cpu][term] = value elif re.match(r'^#', line): None # pylint: disable=pointless-statement elif Args.debug: print("-Unk: %s" % line) # TODO -- this is parsing text printed by a client. # Really, verilator proper should generate this # if it's useful... if re_time.match(line): Global['rdtsc_cycle_time'] = re_time.group(1) def re_match_result(regexp, line, result_to): result_to = re.match(regexp, line) return result_to ###################################################################### def report(): print("Verilator Gantt report") print("\nArgument settings:") for arg in sorted(Global['args'].keys()): plus = "+" if re.match(r'^\+', arg) else " " print(" %s%s%s" % (arg, plus, Global['args'][arg])) nthreads = len(Threads) Global['cpus'] = {} for thread in Threads: # Make potentially multiple characters per column for start in Threads[thread]: cpu = Threads[thread][start]['cpu'] elapsed = Threads[thread][start]['end'] - start if cpu not in Global['cpus']: Global['cpus'][cpu] = {'cpu_time': 0} Global['cpus'][cpu]['cpu_time'] += elapsed mt_mtask_time = 0 long_mtask_time = 0 last_end = 0 for mtask in Mtasks: mt_mtask_time += Mtasks[mtask]['elapsed'] last_end = max(last_end, Mtasks[mtask]['end']) long_mtask_time = max(long_mtask_time, Mtasks[mtask]['elapsed']) Global['last_end'] = last_end report_graph() # If we know cycle time in the same (rdtsc) units, # this will give us an actual utilization number, # (how effectively we keep the cores busy.) # # It also gives us a number we can compare against # serial mode, to estimate the overhead of data sharing, # which will show up in the total elapsed time. (Overhead # of synchronization and scheduling should not.) print("\nAnalysis:") print(" Total threads = %d" % nthreads) print(" Total mtasks = %d" % len(Mtasks)) ncpus = len(Global['cpus']) print(" Total cpus used = %d" % ncpus) print(" Total yields = %d" % int(Global['stats']['yields'])) print(" Total eval time = %d rdtsc ticks" % Global['last_end']) print(" Longest mtask time = %d rdtsc ticks" % long_mtask_time) print(" All-thread mtask time = %d rdtsc ticks" % mt_mtask_time) long_efficiency = long_mtask_time / (Global.get('last_end', 1)) print(" Longest-thread efficiency = %0.1f%%" % (long_efficiency * 100.0)) mt_efficiency = mt_mtask_time / (Global.get('last_end', 1) * nthreads or 1) print(" All-thread efficiency = %0.1f%%" % (mt_efficiency * 100.0)) print(" All-thread speedup = %0.1f" % (mt_efficiency * nthreads)) if Global['rdtsc_cycle_time'] > 0: ut = mt_mtask_time / Global['rdtsc_cycle_time'] print("tot_mtask_cpu=" + mt_mtask_time + " cyc=" + Global['rdtsc_cycle_time'] + " ut=" + ut) p2e_ratios = [] min_p2e = 1000000 min_mtask = None max_p2e = -1000000 max_mtask = None for mtask in sorted(Mtasks.keys()): if Mtasks[mtask]['elapsed'] > 0: if Mtasks[mtask]['predict'] == 0: Mtasks[mtask]['predict'] = 1 # don't log(0) below p2e_ratio = math.log(Mtasks[mtask]['predict'] / Mtasks[mtask]['elapsed']) p2e_ratios.append(p2e_ratio) if p2e_ratio > max_p2e: max_p2e = p2e_ratio max_mtask = mtask if p2e_ratio < min_p2e: min_p2e = p2e_ratio min_mtask = mtask print("\nStatistics:") print(" min log(p2e) = %0.3f" % min_p2e, end="") print(" from mtask %d (predict %d," % (min_mtask, Mtasks[min_mtask]['predict']), end="") print(" elapsed %d)" % Mtasks[min_mtask]['elapsed']) print(" max log(p2e) = %0.3f" % max_p2e, end="") print(" from mtask %d (predict %d," % (max_mtask, Mtasks[max_mtask]['predict']), end="") print(" elapsed %d)" % Mtasks[max_mtask]['elapsed']) stddev = statistics.pstdev(p2e_ratios) mean = statistics.mean(p2e_ratios) print(" mean = %0.3f" % mean) print(" stddev = %0.3f" % stddev) print(" e ^ stddev = %0.3f" % math.exp(stddev)) report_cpus() if nthreads > ncpus: print() print("%%Warning: There were fewer CPUs (%d) then threads (%d)." % (ncpus, nthreads)) print(" : See docs on use of numactl.") else: if 'cpu_socket_cores_warning' in Global: print() print( "%Warning: Multiple threads scheduled on same hyperthreaded core." ) print(" : See docs on use of numactl.") if 'cpu_sockets_warning' in Global: print() print("%Warning: Threads scheduled on multiple sockets.") print(" : See docs on use of numactl.") print() def report_cpus(): print("\nCPUs:") Global['cpu_sockets'] = collections.defaultdict(lambda: 0) Global['cpu_socket_cores'] = collections.defaultdict(lambda: 0) for cpu in sorted(Global['cpus'].keys()): print(" cpu %d: " % cpu, end='') print("cpu_time=%d" % Global['cpus'][cpu]['cpu_time'], end='') socket = None if cpu in Global['cpuinfo']: socket = int(Global['cpuinfo'][cpu]['physical_id']) Global['cpu_sockets'][socket] += 1 print(" socket=%d" % socket, end='') core = int(Global['cpuinfo'][cpu]['core_id']) Global['cpu_socket_cores'][str(socket) + "__" + str(core)] += 1 print(" core=%d" % core, end='') model = Global['cpuinfo'][cpu]['model_name'] if model: print(" %s" % model, end='') print() if len(Global['cpu_sockets']) > 1: Global['cpu_sockets_warning'] = True for scn in Global['cpu_socket_cores'].values(): if scn > 1: Global['cpu_socket_cores_warning'] = True def report_graph(): time_per = Args.scale if time_per == 0: time_per = Global['last_end'] / 40 # Start with 40 columns while time_per > 10: (graph, conflicts) = _make_graph(time_per) if not conflicts: break time_per = int(time_per / 2) # One more step so we can fit more labels time_per = int(time_per / 2) if time_per <= 0: time_per = 1 (graph, conflicts) = _make_graph(time_per) print("\nThread gantt graph:") print(" Legend: One character width = %s rdtsc ticks" % time_per) print(" Legend: '&' = multiple mtasks in this period (character width)") scale = " <-%d rdtsc total" % Global['last_end'] for col in range(len(scale), int(0.99 + (Global['last_end'] / time_per))): # pylint: disable=unused-variable scale += "-" print(" " + scale + "->") for thread in sorted(graph.keys()): print(" t: ", end="") _print_graph_line(graph[thread], '') def _make_graph(time_per): # [thread][column] = char or # graph = collections.defaultdict( lambda: collections.defaultdict(lambda: '')) conflicts = 0 for thread in Threads: # Make potentially multiple characters per column multi_at_col = collections.defaultdict(lambda: '') for start in sorted(Threads[thread].keys()): end = Threads[thread][start]['end'] # mtask = Threads[thread][start]['mtask'] cpu = Threads[thread][start]['cpu'] startcol = _time_col(time_per, start) endcol = _time_col(time_per, end) label = "[" label += str(cpu) # Maybe make optional in future width = endcol - startcol + 1 while len(label) < (width - 1): # -1 for ']' label += "-" label += "]" multi_at_col[startcol] += label # Expand line to one char per column for col in multi_at_col: chars = multi_at_col[col] ok = True for coladd in range(0, len(chars)): if col + coladd in graph[thread]: ok = False break if not ok: if re.search(r'\[.*\[', chars): # Two begins or more conflicts += 1 graph[thread][col] = "&" else: graph[thread][col] = "[" for coladd in range(1, len(chars)): if col + coladd in graph[thread]: break graph[thread][col + coladd] = 'x' else: coladd = 0 for char in chars: graph[thread][col + coladd] = char coladd += 1 if Args.debug: print("# Conflicts %d" % conflicts) return (graph, conflicts) def _print_graph_line(graph_thread, sep): at = 0 for col in sorted(graph_thread.keys()): while at < col: print(' ', end="") at += 1 c = graph_thread[col] print(c + sep, end="") at += len(c) print() def _time_col(time_per, time): return int(time / time_per) ###################################################################### def write_vcd(filename): print("Writing %s" % filename) with open(filename, "w") as fh: vcd = { 'values': collections.defaultdict(lambda: {}), # {