#!/usr/bin/env python3
# pylint: disable=C0103,C0114,C0116,C0301,R0914,R0912,R0915,W0511,eval-used
######################################################################

import argparse
import collections
import math
import re
import statistics
# from pprint import pprint

Threads = collections.defaultdict(lambda: {})
Mtasks = collections.defaultdict(lambda: {})
Global = {
    'args': {},
    'cpuinfo': collections.defaultdict(lambda: {}),
    'rdtsc_cycle_time': 0,
    'stats': {}
}

######################################################################


def process(filename):
    read_data(filename)
    report()


def read_data(filename):
    with open(filename) as fh:
        re_prof = re.compile(
            r'^VLPROF mtask\s(\d+)\sstart\s(\d+)\send\s(\d+)\selapsed\s(\d+)\spredict_time\s(\d+)\scpu\s(\d+)\son thread (\d+)'
        )
        re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.])\s*')
        re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.])\s*$')
        re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
        re_time = re.compile(r'rdtsc time = (\d+) ticks')
        re_proc_cpu = re.compile(r'VLPROFPROC processor\s*:\s*(\d+)\s*$')
        re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$')
        cpu = None

        for line in fh:
            if re_prof.match(line):
                match = re_prof.match(line)
                mtask = int(match.group(1))
                start = int(match.group(2))
                end = int(match.group(3))
                elapsed_time = int(match.group(4))
                predict_time = int(match.group(5))
                cpu = int(match.group(6))
                thread = int(match.group(7))
                if start not in Threads[thread]:
                    Threads[thread][start] = {}
                Threads[thread][start]['mtask'] = mtask
                Threads[thread][start]['end'] = end
                Threads[thread][start]['cpu'] = cpu

                if 'elapsed' not in Mtasks[mtask]:
                    Mtasks[mtask] = {'end': 0, 'elapsed': 0}
                Mtasks[mtask]['elapsed'] += elapsed_time
                Mtasks[mtask]['predict_cost'] = predict_time
                Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], end)
            elif re.match(r'^VLPROFTHREAD', line):
                None  # pylint: disable=pointless-statement
            elif re_arg1.match(line):
                match = re_arg1.match(line)
                Global['args'][match.group(1)] = match.group(2)
            elif re_arg2.match(line):
                match = re_arg2.match(line)
                Global['args'][match.group(1)] = match.group(2)
            elif re_stat.match(line):
                match = re_stat.match(line)
                Global['stats'][match.group(1)] = match.group(2)
            elif re_proc_cpu.match(line):
                match = re_proc_cpu.match(line)
                cpu = int(match.group(1))
            elif cpu and re_proc_dat.match(line):
                match = re_proc_dat.match(line)
                term = match.group(1)
                value = match.group(2)
                term = re.sub(r'\s+$', '', term)
                term = re.sub(r'\s+', '_', term)
                value = re.sub(r'\s+$', '', value)
                Global['cpuinfo'][cpu][term] = value
            elif re.match(r'^#', line):
                None  # pylint: disable=pointless-statement
            elif Args.debug:
                print("-Unk: %s" % line)
            # TODO -- this is parsing text printed by a client.
            # Really, verilator proper should generate this
            # if it's useful...
            if re_time.match(line):
                Global['rdtsc_cycle_time'] = re_time.group(1)


def re_match_result(regexp, line, result_to):
    result_to = re.match(regexp, line)
    return result_to


######################################################################


def report():
    print("Verilator Gantt report")

    print("\nArgument settings:")
    for arg in sorted(Global['args'].keys()):
        plus = "+" if re.match(r'^\+', arg) else " "
        print("  %s%s%s" % (arg, plus, Global['args'][arg]))

    nthreads = len(Threads)
    Global['cpus'] = {}
    for thread in Threads:
        # Make potentially multiple characters per column
        for start in Threads[thread]:
            cpu = Threads[thread][start]['cpu']
            elapsed = Threads[thread][start]['end'] - start
            if cpu not in Global['cpus']:
                Global['cpus'][cpu] = {'cpu_time': 0}
            Global['cpus'][cpu]['cpu_time'] += elapsed

    mt_mtask_time = 0
    long_mtask_time = 0
    last_end = 0
    for mtask in Mtasks:
        mt_mtask_time += Mtasks[mtask]['elapsed']
        last_end = max(last_end, Mtasks[mtask]['end'])
        long_mtask_time = max(long_mtask_time, Mtasks[mtask]['elapsed'])
    Global['last_end'] = last_end

    report_graph()

    # If we know cycle time in the same (rdtsc) units,
    # this will give us an actual utilization number,
    # (how effectively we keep the cores busy.)
    #
    # It also gives us a number we can compare against
    # serial mode, to estimate the overhead of data sharing,
    # which will show up in the total elapsed time. (Overhead
    # of synchronization and scheduling should not.)
    print("\nAnalysis:")
    print("  Total threads             = %d" % nthreads)
    print("  Total mtasks              = %d" % len(Mtasks))
    ncpus = len(Global['cpus'])
    print("  Total cpus used           = %d" % ncpus)
    print("  Total yields              = %d" % int(Global['stats']['yields']))
    print("  Total eval time           = %d rdtsc ticks" % Global['last_end'])
    print("  Longest mtask time        = %d rdtsc ticks" % long_mtask_time)
    print("  All-thread mtask time     = %d rdtsc ticks" % mt_mtask_time)
    long_efficiency = long_mtask_time / (Global.get('last_end', 1))
    print("  Longest-thread efficiency = %0.1f%%" % (long_efficiency * 100.0))
    mt_efficiency = mt_mtask_time / (Global.get('last_end', 1) * nthreads or 1)
    print("  All-thread efficiency     = %0.1f%%" % (mt_efficiency * 100.0))
    print("  All-thread speedup        = %0.1f" % (mt_efficiency * nthreads))
    if Global['rdtsc_cycle_time'] > 0:
        ut = mt_mtask_time / Global['rdtsc_cycle_time']
        print("tot_mtask_cpu=" + mt_mtask_time + " cyc=" +
              Global['rdtsc_cycle_time'] + " ut=" + ut)

    p2e_ratios = []
    min_p2e = 1000000
    min_mtask = None
    max_p2e = -1000000
    max_mtask = None

    for mtask in sorted(Mtasks.keys()):
        if Mtasks[mtask]['elapsed'] > 0:
            if Mtasks[mtask]['predict_cost'] == 0:
                Mtasks[mtask]['predict_cost'] = 1  # don't log(0) below
            p2e_ratio = math.log(Mtasks[mtask]['predict_cost'] /
                                 Mtasks[mtask]['elapsed'])
            p2e_ratios.append(p2e_ratio)

            if p2e_ratio > max_p2e:
                max_p2e = p2e_ratio
                max_mtask = mtask
            if p2e_ratio < min_p2e:
                min_p2e = p2e_ratio
                min_mtask = mtask

    print("\nStatistics:")
    print("  min log(p2e) = %0.3f" % min_p2e, end="")
    print("  from mtask %d (predict %d," %
          (min_mtask, Mtasks[min_mtask]['predict_cost']),
          end="")
    print(" elapsed %d)" % Mtasks[min_mtask]['elapsed'])
    print("  max log(p2e) = %0.3f" % max_p2e, end="")
    print("  from mtask %d (predict %d," %
          (max_mtask, Mtasks[max_mtask]['predict_cost']),
          end="")
    print(" elapsed %d)" % Mtasks[max_mtask]['elapsed'])

    stddev = statistics.pstdev(p2e_ratios)
    mean = statistics.mean(p2e_ratios)
    print("  mean = %0.3f" % mean)
    print("  stddev = %0.3f" % stddev)
    print("  e ^ stddev = %0.3f" % math.exp(stddev))

    report_cpus()

    if nthreads > ncpus:
        print()
        print("%%Warning: There were fewer CPUs (%d) then threads (%d)." %
              (ncpus, nthreads))
        print("        : See docs on use of numactl.")
    else:
        if 'cpu_socket_cores_warning' in Global:
            print()
            print(
                "%Warning: Multiple threads scheduled on same hyperthreaded core."
            )
            print("        : See docs on use of numactl.")
        if 'cpu_sockets_warning' in Global:
            print()
            print("%Warning: Threads scheduled on multiple sockets.")
            print("        : See docs on use of numactl.")
    print()


def report_cpus():
    print("\nCPUs:")

    Global['cpu_sockets'] = collections.defaultdict(lambda: 0)
    Global['cpu_socket_cores'] = collections.defaultdict(lambda: 0)

    for cpu in sorted(Global['cpus'].keys()):
        print("  cpu %d: " % cpu, end='')
        print("cpu_time=%d" % Global['cpus'][cpu]['cpu_time'], end='')

        socket = None
        if cpu in Global['cpuinfo']:
            socket = int(Global['cpuinfo'][cpu]['physical_id'])
            Global['cpu_sockets'][socket] += 1
            print(" socket=%d" % socket, end='')

            core = int(Global['cpuinfo'][cpu]['core_id'])
            Global['cpu_socket_cores'][str(socket) + "__" + str(core)] += 1
            print(" core=%d" % core, end='')

            model = Global['cpuinfo'][cpu]['model_name']
            if model:
                print("  %s" % model, end='')
        print()

    if len(Global['cpu_sockets']) > 1:
        Global['cpu_sockets_warning'] = True
        for scn in Global['cpu_socket_cores'].values():
            if scn > 1:
                Global['cpu_socket_cores_warning'] = True


def report_graph():
    time_per = Args.scale
    if time_per == 0:
        time_per = Global['last_end'] / 40  # Start with 40 columns
        while time_per > 10:
            (graph, conflicts) = _make_graph(time_per)
            if not conflicts:
                break
            time_per = int(time_per / 2)
        # One more step so we can fit more labels
        time_per = int(time_per / 2)
        if time_per <= 0:
            time_per = 1

    (graph, conflicts) = _make_graph(time_per)

    print("\nThread gantt graph:")
    print("  Legend: One character width = %s rdtsc ticks" % time_per)
    print("  Legend: '&' = multiple mtasks in this period (character width)")

    scale = "   <-%d rdtsc total" % Global['last_end']
    for col in range(len(scale), int(0.99 + (Global['last_end'] / time_per))):  # pylint: disable=unused-variable
        scale += "-"
    print("  " + scale + "->")

    for thread in sorted(graph.keys()):
        print("  t: ", end="")
        _print_graph_line(graph[thread], '')


def _make_graph(time_per):

    # [thread][column] = char or #
    graph = collections.defaultdict(
        lambda: collections.defaultdict(lambda: ''))

    conflicts = 0
    for thread in Threads:
        # Make potentially multiple characters per column
        multi_at_col = collections.defaultdict(lambda: '')
        for start in sorted(Threads[thread].keys()):
            end = Threads[thread][start]['end']
            # mtask = Threads[thread][start]['mtask']
            cpu = Threads[thread][start]['cpu']

            startcol = _time_col(time_per, start)
            endcol = _time_col(time_per, end)

            label = "["
            label += str(cpu)  # Maybe make optional in future
            width = endcol - startcol + 1
            while len(label) < (width - 1):  # -1 for ']'
                label += "-"
            label += "]"
            multi_at_col[startcol] += label

        # Expand line to one char per column
        for col in multi_at_col:
            chars = multi_at_col[col]
            ok = True
            for coladd in range(0, len(chars)):
                if col + coladd in graph[thread]:
                    ok = False
                    break
            if not ok:
                if re.search(r'\[.*\[', chars):  # Two begins or more
                    conflicts += 1
                    graph[thread][col] = "&"
                else:
                    graph[thread][col] = "["
                for coladd in range(1, len(chars)):
                    if col + coladd in graph[thread]:
                        break
                    graph[thread][col + coladd] = 'x'
            else:
                coladd = 0
                for char in chars:
                    graph[thread][col + coladd] = char
                    coladd += 1

    if Args.debug:
        print("# Conflicts %d" % conflicts)
    return (graph, conflicts)


def _print_graph_line(graph_thread, sep):
    at = 0
    for col in sorted(graph_thread.keys()):
        while at < col:
            print(' ', end="")
            at += 1
        c = graph_thread[col]
        print(c + sep, end="")
        at += len(c)
    print()


def _time_col(time_per, time):
    return int(time / time_per)


######################################################################


def write_vcd(filename):
    print("Writing %s" % filename)
    with open(filename, "w") as fh:
        vcd = {
            'values':
            collections.defaultdict(lambda: {}),  # {<time>}{<code>} = value
            'sigs': {
                'threads': {},
                'cpus': {},
                'mtasks': {},
                'Stats': {}
            }  # {<module>}{<sig}} = code
        }
        code = 0

        parallelism = collections.defaultdict(lambda: 0)
        for thread in sorted(Threads.keys()):
            sig = "thread%d_mtask" % thread
            if sig not in vcd['sigs']['threads']:
                vcd['sigs']['threads'][sig] = code
                code += 1
            mcode = vcd['sigs']['threads'][sig]

            for start in sorted(Threads[thread]):
                end = Threads[thread][start]['end']
                mtask = Threads[thread][start]['mtask']
                cpu = Threads[thread][start]['cpu']
                vcd['values'][start][mcode] = mtask
                vcd['values'][end][mcode] = None
                parallelism[start] += 1
                parallelism[end] -= 1

                sig = "cpu%d_thread" % cpu
                if sig not in vcd['sigs']['cpus']:
                    vcd['sigs']['cpus'][sig] = code
                    code += 1
                ccode = vcd['sigs']['cpus'][sig]
                vcd['values'][start][ccode] = thread
                vcd['values'][end][ccode] = None

                sig = "mtask%d_cpu" % mtask
                if sig not in vcd['sigs']['mtasks']:
                    vcd['sigs']['mtasks'][sig] = code
                    code += 1
                ccode = vcd['sigs']['mtasks'][sig]
                vcd['values'][start][ccode] = cpu
                vcd['values'][end][ccode] = None

        # Parallelism graph
        vcd['sigs']['Stats']["parallelism"] = code
        pcode = code
        code += 1

        value = 0
        for time in sorted(parallelism.keys()):
            value += parallelism[time]
            vcd['values'][time][pcode] = value

        fh.write("$version Generated by verilator_gantt $end\n")
        fh.write("$timescale 1ns $end\n")
        fh.write("\n")

        all_codes = {}
        fh.write(" $scope module gantt $end\n")
        for module in sorted(vcd['sigs'].keys()):
            fh.write("  $scope module %s $end\n" % module)
            for sig in sorted(vcd['sigs'][module].keys()):
                code = vcd['sigs'][module][sig]
                fh.write("   $var wire 32 v%x %s [31:0] $end\n" % (code, sig))
                all_codes[code] = 1
            fh.write("  $upscope $end\n")
        fh.write(" $upscope $end\n")
        fh.write("$enddefinitions $end\n")
        fh.write("\n")

        first = True
        for time in sorted(vcd['values']):
            if first:
                first = False
                # Start with Z for any signals without time zero data
                for code in sorted(all_codes.keys()):
                    if code not in vcd['values'][time]:
                        vcd['values'][time][code] = None
            fh.write("#%d\n" % time)
            for code in sorted(vcd['values'][time].keys()):
                value = vcd['values'][time][code]
                if value is None:
                    fh.write("bz v%x\n" % code)
                else:
                    fh.write("b%s v%x\n" % (format(value, 'b'), code))


######################################################################

parser = argparse.ArgumentParser(
    allow_abbrev=False,
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description="""Create Gantt chart of multi-threaded execution""",
    epilog=
    """Verilator_gantt creates a visual representation to help analyze Verilator
#xmultithreaded simulation performance, by showing when each macro-task
#xstarts and ends, and showing when each thread is busy or idle.

For documentation see
https://verilator.org/guide/latest/exe_verilator_gantt.html

Copyright 2018-2021 by Wilson Snyder. This program is free software; you
can redistribute it and/or modify it under the terms of either the GNU
Lesser General Public License Version 3 or the Perl Artistic License
Version 2.0.

SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0""")

parser.add_argument('--debug', action='store_true', help='enable debug')
parser.add_argument('--scale',
                    help='number of time units per character in graph',
                    type=int,
                    default=0)
parser.add_argument('--no-vcd',
                    help='disable creating vcd',
                    action='store_true')
parser.add_argument('--vcd',
                    help='filename for vcd outpue',
                    default='profile_threads.vcd')
parser.add_argument('filename',
                    help='input profile_threads.dat filename to process',
                    default='profile_threads.dat')

Args = parser.parse_args()

process(Args.filename)
if not Args.no_vcd:
    write_vcd(Args.vcd)

######################################################################
# Local Variables:
# compile-command: "./verilator_gantt ../test_regress/t/t_gantt_io.dat"
# End: