#!/usr/bin/env python3
# pylint: disable=C0103,C0114,C0116,C0301,R0914,R0912,R0915,W0511,eval-used
######################################################################

import argparse
import collections
import math
import re
import statistics
# from pprint import pprint

Threads = collections.defaultdict(lambda: {})
Mtasks = collections.defaultdict(lambda: {})
Evals = collections.defaultdict(lambda: {})
EvalLoops = collections.defaultdict(lambda: {})
Global = {
    'args': {},
    'cpuinfo': collections.defaultdict(lambda: {}),
    'rdtsc_cycle_time': 0,
    'stats': {}
}

######################################################################


def process(filename):
    read_data(filename)
    report()


def read_data(filename):
    with open(filename) as fh:
        re_prof = re.compile(
            r'^VLPROF mtask\s(\d+)\sstart\s(\d+)\selapsed\s(\d+)\spredict_start\s(\d+)\spredict_cost\s(\d+)\scpu\s(\d+)\son thread (\d+)'
        )
        re_eval = re.compile(r'^VLPROF eval\sstart\s(\d+)\selapsed\s(\d+)')
        re_loop = re.compile(r'^VLPROF eval_loop\sstart\s(\d+)\selapsed\s(\d+)')
        re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.]*)\s*')
        re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.]*)\s*$')
        re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
        re_time = re.compile(r'rdtsc time = (\d+) ticks')
        re_proc_cpu = re.compile(r'VLPROFPROC processor\s*:\s*(\d+)\s*$')
        re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$')
        cpu = None

        for line in fh:
            if re_prof.match(line):
                match = re_prof.match(line)
                mtask = int(match.group(1))
                start = int(match.group(2))
                elapsed_time = int(match.group(3))
                end = start + elapsed_time
                predict_start = int(match.group(4))
                predict_cost = int(match.group(5))
                cpu = int(match.group(6))
                thread = int(match.group(7))
                if start not in Threads[thread]:
                    Threads[thread][start] = {}
                Threads[thread][start]['mtask'] = mtask
                Threads[thread][start]['end'] = end
                Threads[thread][start]['cpu'] = cpu
                Threads[thread][start]['predict_start'] = predict_start
                Threads[thread][start]['predict_cost'] = predict_cost

                if 'elapsed' not in Mtasks[mtask]:
                    Mtasks[mtask] = {'end': 0, 'elapsed': 0}
                Mtasks[mtask]['thread'] = thread
                Mtasks[mtask]['elapsed'] += elapsed_time
                Mtasks[mtask]['predict_start'] = predict_start
                Mtasks[mtask]['predict_cost'] = predict_cost
                Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], end)
            elif re_eval.match(line):
                match = re_eval.match(line)
                start = int(match.group(1))
                elapsed_time = int(match.group(2))
                Evals[start]['start'] = start
                Evals[start]['end'] = start + elapsed_time
            elif re_loop.match(line):
                match = re_loop.match(line)
                start = int(match.group(1))
                elapsed_time = int(match.group(2))
                EvalLoops[start]['start'] = start
                EvalLoops[start]['end'] = start + elapsed_time
            elif re.match(r'^VLPROFTHREAD', line):
                pass
            elif re_arg1.match(line):
                match = re_arg1.match(line)
                Global['args'][match.group(1)] = match.group(2)
            elif re_arg2.match(line):
                match = re_arg2.match(line)
                Global['args'][match.group(1)] = match.group(2)
            elif re_stat.match(line):
                match = re_stat.match(line)
                Global['stats'][match.group(1)] = match.group(2)
            elif re_proc_cpu.match(line):
                match = re_proc_cpu.match(line)
                cpu = int(match.group(1))
            elif cpu and re_proc_dat.match(line):
                match = re_proc_dat.match(line)
                term = match.group(1)
                value = match.group(2)
                term = re.sub(r'\s+$', '', term)
                term = re.sub(r'\s+', '_', term)
                value = re.sub(r'\s+$', '', value)
                Global['cpuinfo'][cpu][term] = value
            elif re.match(r'^#', line):
                pass
            elif Args.debug:
                print("-Unk: %s" % line)

            # TODO -- this is parsing text printed by a client.
            # Really, verilator proper should generate this
            # if it's useful...
            match = re_time.match(line)
            if match:
                Global['rdtsc_cycle_time'] = int(match.group(1))
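
# For reference, an example of the profile lines read_data() consumes.
# The values below are illustrative only; the field layout follows the
# regexps above:
#   VLPROF mtask 5 start 12345 elapsed 880 predict_start 12000 predict_cost 800 cpu 3 on thread 1
#   VLPROF eval start 100 elapsed 99000
#   VLPROF stat yields 12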


def re_match_result(regexp, line, result_to):
    result_to = re.match(regexp, line)
    return result_to


######################################################################


def report():
    print("Verilator Gantt report")

    print("\nArgument settings:")
    for arg in sorted(Global['args'].keys()):
        plus = "+" if re.match(r'^\+', arg) else " "
        print("  %s%s%s" % (arg, plus, Global['args'][arg]))

    nthreads = len(Threads)
    Global['cpus'] = {}
    for thread in Threads:
        # Make potentially multiple characters per column
        for start in Threads[thread]:
            cpu = Threads[thread][start]['cpu']
            elapsed = Threads[thread][start]['end'] - start
            if cpu not in Global['cpus']:
                Global['cpus'][cpu] = {'cpu_time': 0}
            Global['cpus'][cpu]['cpu_time'] += elapsed

    measured_mt_mtask_time = 0
    predict_mt_mtask_time = 0
    long_mtask_time = 0
    measured_last_end = 0
    predict_last_end = 0
    for mtask in Mtasks:
        measured_mt_mtask_time += Mtasks[mtask]['elapsed']
        predict_mt_mtask_time += Mtasks[mtask]['predict_cost']
        measured_last_end = max(measured_last_end, Mtasks[mtask]['end'])
        predict_last_end = max(
            predict_last_end,
            Mtasks[mtask]['predict_start'] + Mtasks[mtask]['predict_cost'])
        long_mtask_time = max(long_mtask_time, Mtasks[mtask]['elapsed'])
    Global['measured_last_end'] = measured_last_end
    Global['predict_last_end'] = predict_last_end

    # If we know cycle time in the same (rdtsc) units,
    # this will give us an actual utilization number,
    # (how effectively we keep the cores busy.)
    #
    # It also gives us a number we can compare against
    # serial mode, to estimate the overhead of data sharing,
    # which will show up in the total elapsed time. (Overhead
    # of synchronization and scheduling should not.)
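
    # The efficiency figures printed below are derived from the sums above;
    # in shorthand (using this function's variables):
    #   Longest-thread efficiency = long_mtask_time / measured_last_end
    #   All-thread efficiency     = measured_mt_mtask_time / (measured_last_end * nthreads)
    #   All-thread speedup        = All-thread efficiency * nthreads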
print("\nAnalysis:") print(" Total threads = %d" % nthreads) print(" Total mtasks = %d" % len(Mtasks)) ncpus = len(Global['cpus']) print(" Total cpus used = %d" % ncpus) print(" Total yields = %d" % int(Global['stats']['yields'])) print(" Total evals = %d" % len(Evals)) print(" Total eval loops = %d" % len(EvalLoops)) print(" Total eval time = %d rdtsc ticks" % Global['measured_last_end']) print(" Longest mtask time = %d rdtsc ticks" % long_mtask_time) print(" All-thread mtask time = %d rdtsc ticks" % measured_mt_mtask_time) long_efficiency = long_mtask_time / (Global.get('measured_last_end', 1) or 1) print(" Longest-thread efficiency = %0.1f%%" % (long_efficiency * 100.0)) mt_efficiency = measured_mt_mtask_time / ( Global.get('measured_last_end', 1) * nthreads or 1) print(" All-thread efficiency = %0.1f%%" % (mt_efficiency * 100.0)) print(" All-thread speedup = %0.1f" % (mt_efficiency * nthreads)) if Global['rdtsc_cycle_time'] > 0: ut = measured_mt_mtask_time / Global['rdtsc_cycle_time'] print("tot_mtask_cpu=" + measured_mt_mtask_time + " cyc=" + Global['rdtsc_cycle_time'] + " ut=" + ut) predict_mt_efficiency = predict_mt_mtask_time / ( Global.get('predict_last_end', 1) * nthreads or 1) print("\nPrediction (what Verilator used for scheduling):") print(" All-thread efficiency = %0.1f%%" % (predict_mt_efficiency * 100.0)) print(" All-thread speedup = %0.1f" % (predict_mt_efficiency * nthreads)) p2e_ratios = [] min_p2e = 1000000 min_mtask = None max_p2e = -1000000 max_mtask = None for mtask in sorted(Mtasks.keys()): if Mtasks[mtask]['elapsed'] > 0: if Mtasks[mtask]['predict_cost'] == 0: Mtasks[mtask]['predict_cost'] = 1 # don't log(0) below p2e_ratio = math.log(Mtasks[mtask]['predict_cost'] / Mtasks[mtask]['elapsed']) p2e_ratios.append(p2e_ratio) if p2e_ratio > max_p2e: max_p2e = p2e_ratio max_mtask = mtask if p2e_ratio < min_p2e: min_p2e = p2e_ratio min_mtask = mtask print("\nStatistics:") print(" min log(p2e) = %0.3f" % min_p2e, end="") print(" from mtask %d (predict %d," % (min_mtask, Mtasks[min_mtask]['predict_cost']), end="") print(" elapsed %d)" % Mtasks[min_mtask]['elapsed']) print(" max log(p2e) = %0.3f" % max_p2e, end="") print(" from mtask %d (predict %d," % (max_mtask, Mtasks[max_mtask]['predict_cost']), end="") print(" elapsed %d)" % Mtasks[max_mtask]['elapsed']) stddev = statistics.pstdev(p2e_ratios) mean = statistics.mean(p2e_ratios) print(" mean = %0.3f" % mean) print(" stddev = %0.3f" % stddev) print(" e ^ stddev = %0.3f" % math.exp(stddev)) report_cpus() if nthreads > ncpus: print() print("%%Warning: There were fewer CPUs (%d) then threads (%d)." % (ncpus, nthreads)) print(" : See docs on use of numactl.") else: if 'cpu_socket_cores_warning' in Global: print() print( "%Warning: Multiple threads scheduled on same hyperthreaded core." 
) print(" : See docs on use of numactl.") if 'cpu_sockets_warning' in Global: print() print("%Warning: Threads scheduled on multiple sockets.") print(" : See docs on use of numactl.") print() def report_cpus(): print("\nCPUs:") Global['cpu_sockets'] = collections.defaultdict(lambda: 0) Global['cpu_socket_cores'] = collections.defaultdict(lambda: 0) for cpu in sorted(Global['cpus'].keys()): print(" cpu %d: " % cpu, end='') print("cpu_time=%d" % Global['cpus'][cpu]['cpu_time'], end='') socket = None if cpu in Global['cpuinfo']: cpuinfo = Global['cpuinfo'][cpu] if 'physical_id' in cpuinfo and 'core_id' in cpuinfo: socket = int(cpuinfo['physical_id']) Global['cpu_sockets'][socket] += 1 print(" socket=%d" % socket, end='') core = int(cpuinfo['core_id']) Global['cpu_socket_cores'][str(socket) + "__" + str(core)] += 1 print(" core=%d" % core, end='') if 'model_name' in cpuinfo: model = cpuinfo['model_name'] print(" %s" % model, end='') print() if len(Global['cpu_sockets']) > 1: Global['cpu_sockets_warning'] = True for scn in Global['cpu_socket_cores'].values(): if scn > 1: Global['cpu_socket_cores_warning'] = True ###################################################################### def write_vcd(filename): print("Writing %s" % filename) with open(filename, "w") as fh: vcd = { 'values': collections.defaultdict(lambda: {}), # {