Tests: Improve driver performance with forkserver

2025-04-05 12:12:39 +00:00 · 2024-09-19 19:28:07 -04:00 · 2024-09-19 19:28:07 -04:00 · e990415ee0
commit e990415ee0
parent 1543b202b2
1 changed files with 197 additions and 183 deletions
--- a/test_regress/driver.py
+++ b/test_regress/driver.py
@ -48,6 +48,8 @@ All_Scenarios = {
 test = None
 Arg_Tests = []
 Arg_Driver_Verilator_Flags = []
+Quitting = False
+Vltmt_Threads = 3

 # So an 'import vltest_bootstrap' inside test files will do nothing
 sys.modules['vltest_bootstrap'] = {}
@ -223,14 +225,16 @@ class Capabilities:

 class Forker:

-    class Process:
+    class Job:

-        def __init__(self, _id, name, scenario, rerun_skipping, run_pre_start, run_on_start,
-                     run_on_finish):
+        def __init__(self, _id, name, scenario, args, quiet, rerun_skipping, run_pre_start,
+                     run_on_start, run_on_finish):
            self.fail_max_skip = False
            self.id = _id
            self.name = name
            self.scenario = scenario
+            self.args = args
+            self.quiet = quiet
            self.rerun_skipping = rerun_skipping
            self.run_pre_start = run_pre_start
            self.run_on_start = run_on_start
@ -248,8 +252,8 @@ class Forker:
    def __init__(self, max_processes):
        self._max_processes = max_processes
        self._id_next = 0
-        self._left = collections.deque()  # deque of Process
-        self._running = {}  # key of pid, value of Process
+        self._left = collections.deque()  # deque of Job
+        self._running = {}  # key of pid, value of Job

    def is_any_left(self) -> bool:
        return self.num_running() > 0 or (len(self._left) > 0 and not Quitting)
@ -257,21 +261,27 @@ class Forker:
    def max_proc(self, n: int) -> None:
        self._max_processes = n

-    def poll(self) -> None:
+    def poll(self) -> bool:
+        """Run threads, returning if more work to do (if False, sleep)"""
        # We don't use SIGCHLD as conflicted with other handler, instead just poll
        completed = []  # Need two passes to avoid changing the list we are iterating
        nrunning = 0
+        more_now = False
        for process in self._running.values():
            if process.exitcode is not None:
                completed.append(process)
+                more_now = True
            else:
                nrunning += 1
-        for process in completed:
-            self._finished(process)
+        # Start new work now, so running in background while we then collect completions
        while len(self._left) and nrunning < self._max_processes and not Quitting:
            process = self._left.popleft()
            self._run(process)
            nrunning += 1
+            more_now = True
+        for process in completed:
+            self._finished(process)
+        return more_now

    def running(self) -> list:
        return self._running.values()
@ -279,15 +289,18 @@ class Forker:
    def num_running(self) -> int:
        return len(self._running)

-    def schedule(self, name, scenario, rerun_skipping, run_pre_start, run_on_start, run_on_finish):
+    def schedule(self, name, scenario, args, quiet, rerun_skipping, run_pre_start, run_on_start,
+                 run_on_finish):
        # print("-Forker::schedule: [" + name + "]")
-        process = Forker.Process(self._id_next,
-                                 name=name,
-                                 scenario=scenario,
-                                 rerun_skipping=rerun_skipping,
-                                 run_pre_start=run_pre_start,
-                                 run_on_start=run_on_start,
-                                 run_on_finish=run_on_finish)
+        process = Forker.Job(self._id_next,
+                             name=name,
+                             scenario=scenario,
+                             args=args,
+                             quiet=quiet,
+                             rerun_skipping=rerun_skipping,
+                             run_pre_start=run_pre_start,
+                             run_on_start=run_on_start,
+                             run_on_finish=run_on_finish)
        self._id_next += 1
        self._left.append(process)

@ -300,8 +313,10 @@ class Forker:
        # print("-Forker: [" + process.name + "] run_pre_start")
        process.run_pre_start(process)

-        ctx = multiprocessing.get_context('fork')
-        process.mprocess = ctx.Process(target=lambda: self.child_start(process))
+        ctx = multiprocessing.get_context('forkserver')
+        process.mprocess = ctx.Process(  #
+            target=forker.child_start,  # pylint: disable=used-before-assignment
+            args=(process, ))
        process.mprocess.start()

        # print("-Forker: [" + process.name + "] RUNNING pid=" + str(process.pid))
@ -355,6 +370,8 @@ class Runner:
        self.left_cnt += 1
        forker.schedule(name=py_filename,
                        scenario=scenario,
+                        args=Args,
+                        quiet=self.quiet,
                        rerun_skipping=rerun_skipping,
                        run_pre_start=self._run_pre_start_static,
                        run_on_start=self._run_on_start_static,
@ -380,15 +397,15 @@ class Runner:

    @staticmethod
    def _run_on_start_static(process) -> None:
-        Runner.runner._run_on_start(process)  # pylint: disable=protected-access
-
-    def _run_on_start(self, process) -> None:
        # Running in context of child, so can't pass data to parent directly
-        if self.quiet:
+        if process.quiet:
            sys.stdout = open(os.devnull, 'w')  # pylint: disable=R1732,unspecified-encoding
            sys.stderr = open(os.devnull, 'w')  # pylint: disable=R1732,unspecified-encoding

        print("=" * 70)
+        global Args
+        Args = process.args
+
        global test
        test = VlTest(py_filename=process.name,
                      scenario=process.scenario,
@ -396,9 +413,9 @@ class Runner:
        test.oprint("=" * 50)
        test._prep()
        if process.rerun_skipping:
-            print("  ---------- Earlier logfiles below; test was rerunnable = 0\n")
-            os.system("cat $test->{obj_dir}/*.log")
-            print("  ---------- Earlier logfiles above; test was rerunnable = 0\n")
+            print("  ---------- Earlier logfiles below; test was rerunnable = False\n")
+            os.system("cat " + test.obj_dir + "/*.log")
+            print("  ---------- Earlier logfiles above; test was rerunnable = False\n")
        elif process.fail_max_skip:
            test.skip("Too many test failures; exceeded --fail-max")
        else:
@ -457,10 +474,11 @@ class Runner:
        self.print_summary(force=True)
        # Wait for all children to finish
        while forker.is_any_left():
-            forker.poll()
-            if not interactive_debugger:
+            more_now = forker.poll()
+            if not Args.interactive_debugger:
                self.print_summary(force=False)
-            time.sleep(0.1)
+            if not more_now:
+                time.sleep(0.1)
        self.report(None)
        self.report(self.driver_log_filename)

@ -505,7 +523,7 @@ class Runner:
    @staticmethod
    def _py_filename_adjust(py_filename: str,
                            tdir_def: str) -> list:  # Return (py_filename, t_dir)
-        for tdir in Test_Dirs:  # pylint: disable=redefined-outer-name
+        for tdir in Args.test_dirs:  # pylint: disable=redefined-outer-name
            # t_dir used both absolutely and under obj_dir
            try_py_filename = tdir + "/" + os.path.basename(py_filename)
            if os.path.exists(try_py_filename):
@ -517,7 +535,7 @@ class Runner:
        return (py_filename, os.path.abspath(tdir_def))

    def sprint_summary(self) -> str:
-        delta = time.time() - Start
+        delta = time.time() - Start  # pylint: disable=used-before-assignment
        # Fudge of 120% works out about right so ETA correctly predicts completion time
        eta = 1.2 * ((self.all_cnt * (delta / ((self.all_cnt - self.left_cnt) + 0.001))) - delta)
        if delta < 10:
@ -542,7 +560,7 @@ class Runner:
    def _manual_args(self) -> str:
        # Return command line with scenarios stripped
        out = []
-        for oarg in Orig_ARGV_Sw:
+        for oarg in Args.orig_argv_sw:
            showarg = True
            for val in All_Scenarios.values():
                for allscarg in val:
@ -623,7 +641,7 @@ class VlTest:
        self.vlt_all = self.vlt or self.vltmt  # Any Verilator scenario

        (self.py_filename, self.t_dir) = Runner._py_filename_adjust(self.py_filename, ".")
-        for tdir in Test_Dirs:  # pylint: disable=redefined-outer-name
+        for tdir in Args.test_dirs:  # pylint: disable=redefined-outer-name
            # t_dir used both absolutely and under obj_dir
            self.t_dir = None
            if os.path.exists(tdir + "/" + self.name + ".py"):
@ -1682,7 +1700,7 @@ class VlTest:
                        data = rawbuf[0:got]
                        if tee:
                            sys.stdout.write(data.decode('latin-1'))
-                            if interactive_debugger:
+                            if Args.interactive_debugger:
                                sys.stdout.flush()
                        if logfh:
                            logfh.write(data)
@ -1728,8 +1746,8 @@ class VlTest:

        # Read the log file a couple of times to allow for NFS delays
        if check_finished:
-            for tryn in range(self.tries() - 1, -1, -1):
-                if tryn != self.tries() - 1:
+            for tryn in range(Args.log_retries - 1, -1, -1):
+                if tryn != Args.log_retries - 1:
                    time.sleep(1)
                moretry = tryn != 0
                if not self._run_log_try(logfile, check_finished, moretry):
@ -2122,8 +2140,8 @@ class VlTest:

    def files_identical(self, fn1: str, fn2: str, is_logfile=False) -> None:
        """Test if two files have identical contents"""
-        for tryn in range(self.tries() - 1, -1, -1):
-            if tryn != self.tries() - 1:
+        for tryn in range(Args.log_retries, -1, -1):
+            if tryn != Args.log_retries - 1:
                time.sleep(1)
            moretry = tryn != 0
            if not self._files_identical_try(
@ -2139,7 +2157,7 @@ class VlTest:
            f1 = None
            if not moretry:
                self.error("Files_identical file does not exist: " + fn1)
-            return True
+            return True  # Retry
        try:
            f2 = open(  # pylint: disable=consider-using-with
                fn2, 'r', encoding='latin-1', newline='\n')
@ -2148,7 +2166,7 @@ class VlTest:
            if not moretry:
                self.copy_if_golden(fn1, fn2)
                self.error("Files_identical file does not exist: " + fn2)
-            return True
+            return True  # Retry
        again = self._files_identical_reader(f1,
                                             f2,
                                             fn1=fn1,
@ -2213,7 +2231,7 @@ class VlTest:
            if l1 != l2:
                # print(" clnGOT="+pformat(l1s)+"\n clnEXP="+pformat(l2s))
                if moretry:
-                    return True
+                    return True  # Retry
                self.error_keep_going("Line " + str(lineno_m1) + " miscompares; " + fn1 + " != " +
                                      fn2)
                for c in range(min(len(l1), len(l2))):
@ -2231,9 +2249,9 @@ class VlTest:
                else:
                    print("To update reference: HARNESS_UPDATE_GOLDEN=1 {command} or --golden",
                          file=sys.stderr)
-                return False
+                return False  # No retry - bad

-        return False  # Good
+        return False  # No retry - good

    def files_identical_sorted(self, fn1: str, fn2: str, is_logfile=False) -> None:
        """Test if two files, after sorting both, have identical contents"""
@ -2379,16 +2397,6 @@ class VlTest:
            VlTest._cached_cfg_with_ccache = bool(re.match(r'OBJCACHE \?= ccache', mkf))
        return VlTest._cached_cfg_with_ccache

-    @staticmethod
-    def tries() -> int:
-        # Number of retries when reading logfiles, generally only need many
-        # retries when system is busy running a lot of tests
-        if not forker.running:
-            return 2
-        if len(Arg_Tests) > 3:
-            return 7
-        return 2
-
    def glob_some(self, pattern: str) -> list:
        """Return list of filenames matching a glob, with at least one match required."""
        files = glob.glob(pattern)
@ -2653,156 +2661,162 @@ def run_them() -> None:

 ######################################################################
 ######################################################################
-# Globals
+# Main

-os.environ['PYTHONUNBUFFERED'] = "1"
+if __name__ == '__main__':
+    os.environ['PYTHONUNBUFFERED'] = "1"

-if ('VERILATOR_ROOT' not in os.environ) and os.path.isfile('../bin/verilator'):
-    os.environ['VERILATOR_ROOT'] = os.getcwd() + "/.."
-if 'MAKE' not in os.environ:
-    os.environ['MAKE'] = "make"
-if 'CXX' not in os.environ:
-    os.environ['CXX'] = "c++"
-if 'TEST_REGRESS' in os.environ:
-    sys.exit("%Error: TEST_REGRESS environment variable is already set")
-os.environ['TEST_REGRESS'] = os.getcwd()
+    if ('VERILATOR_ROOT' not in os.environ) and os.path.isfile('../bin/verilator'):
+        os.environ['VERILATOR_ROOT'] = os.getcwd() + "/.."
+    if 'MAKE' not in os.environ:
+        os.environ['MAKE'] = "make"
+    if 'CXX' not in os.environ:
+        os.environ['CXX'] = "c++"
+    if 'TEST_REGRESS' in os.environ:
+        sys.exit("%Error: TEST_REGRESS environment variable is already set")
+    os.environ['TEST_REGRESS'] = os.getcwd()

-forker = None
-Start = time.time()
-Vltmt_Threads = 3
-_Parameter_Next_Level = None
-Quitting = False
+    forker = None
+    Start = time.time()
+    _Parameter_Next_Level = None

+    def sig_int(signum, env) -> None:  # pylint: disable=unused-argument
+        global Quitting
+        if Quitting:
+            sys.exit("\nQuitting (immediately)...")
+        print("\nQuitting... (send another interrupt signal for immediate quit)")
+        Quitting = True
+        if forker:
+            forker.kill_tree_all()

-def sig_int(signum, env) -> None:  # pylint: disable=unused-argument
-    global Quitting
-    if Quitting:
-        sys.exit("\nQuitting (immediately)...")
-    print("\nQuitting... (send another interrupt signal for immediate quit)")
-    Quitting = True
-    if forker:
-        forker.kill_tree_all()
+    signal.signal(signal.SIGINT, sig_int)

+    #---------------------------------------------------------------------

-signal.signal(signal.SIGINT, sig_int)
+    parser = argparse.ArgumentParser(
+        allow_abbrev=False,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description="""Run Verilator regression tests""",
+        epilog="""driver.py invokes Verilator or another simulator on each test file.
+    See docs/internals.rst in the distribution for more information.

-Orig_ARGV_Sw = []
-for arg in sys.argv:
-    if re.match(r'^-', arg) and not re.match(r'^-j', arg):
-        Orig_ARGV_Sw.append(arg)
+    Copyright 2024-2024 by Wilson Snyder. This program is free software; you
+    can redistribute it and/or modify it under the terms of either the GNU
+    Lesser General Public License Version 3 or the Perl Artistic License
+    Version 2.0.

-#---------------------------------------------------------------------
+    SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0""")

-parser = argparse.ArgumentParser(
-    allow_abbrev=False,
-    formatter_class=argparse.RawDescriptionHelpFormatter,
-    description="""Run Verilator regression tests""",
-    epilog="""driver.py invokes Verilator or another simulator on each test file.
-See docs/internals.rst in the distribution for more information.
+    parser.add_argument('--benchmark', action='store', help='enable benchmarking')
+    parser.add_argument('--debug', action='store_const', const=9, help='enable debug')
+    # --debugi: see _parameter()
+    parser.add_argument('--fail-max',
+                        action='store',
+                        default=None,
+                        help='run Verilator executable with gdb')
+    parser.add_argument('--gdb', action='store_true', help='run Verilator executable with gdb')
+    parser.add_argument('--gdbbt',
+                        action='store_true',
+                        help='run Verilated executable with gdb and backtrace')
+    parser.add_argument('--gdbsim', action='store_true', help='run Verilated executable with gdb')
+    parser.add_argument('--golden', '--gold', action='store_true', help='update golden .out files')
+    parser.add_argument('--hashset', action='store', help='split tests based on <set>/<numsets>')
+    parser.add_argument('--jobs',
+                        '-j',
+                        action='store',
+                        default=0,
+                        type=int,
+                        help='parallel job count (0=cpu count)')
+    parser.add_argument('--quiet',
+                        action='store_true',
+                        help='suppress output except failures and progress')
+    parser.add_argument('--rerun', action='store_true', help='rerun all tests that fail')
+    parser.add_argument('--rr', action='store_true', help='run Verilator executable with rr')
+    parser.add_argument('--rrsim', action='store_true', help='run Verilated executable with rr')
+    parser.add_argument('--sanitize', action='store_true', help='run address sanitizer')
+    parser.add_argument('--site',
+                        action='store_true',
+                        help='include VERILATOR_TEST_SITE test list')
+    parser.add_argument('--stop', action='store_true', help='stop on the first error')
+    parser.add_argument('--trace', action='store_true', help='enable simulator waveform tracing')
+    parser.add_argument('--verbose',
+                        action='store_true',
+                        help='compile and run test in verbose mode')
+    parser.add_argument(
+        '--verilation',  # -no-verilation undocumented debugging
+        action='store_true',
+        default=True,
+        help="don't run verilator compile() phase")
+    parser.add_argument('--verilated-debug',
+                        action='store_true',
+                        help='enable Verilated executable debug')
+    ## Scenarios
+    for scen, v in All_Scenarios.items():
+        parser.add_argument('--' + scen,
+                            dest='scenarios',
+                            action='append_const',
+                            const=scen,
+                            help='scenario-enable ' + scen)

-Copyright 2024-2024 by Wilson Snyder. This program is free software; you
-can redistribute it and/or modify it under the terms of either the GNU
-Lesser General Public License Version 3 or the Perl Artistic License
-Version 2.0.
+    (Args, rest) = parser.parse_known_intermixed_args()

-SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0""")
+    for arg in rest:
+        _parameter(arg)

-parser.add_argument('--benchmark', action='store', help='enable benchmarking')
-parser.add_argument('--debug', action='store_const', const=9, help='enable debug')
-# --debugi: see _parameter()
-parser.add_argument('--fail-max',
-                    action='store',
-                    default=None,
-                    help='run Verilator executable with gdb')
-parser.add_argument('--gdb', action='store_true', help='run Verilator executable with gdb')
-parser.add_argument('--gdbbt',
-                    action='store_true',
-                    help='run Verilated executable with gdb and backtrace')
-parser.add_argument('--gdbsim', action='store_true', help='run Verilated executable with gdb')
-parser.add_argument('--golden', '--gold', action='store_true', help='update golden .out files')
-parser.add_argument('--hashset', action='store', help='split tests based on <set>/<numsets>')
-parser.add_argument('--jobs',
-                    '-j',
-                    action='store',
-                    default=0,
-                    type=int,
-                    help='parallel job count (0=cpu count)')
-parser.add_argument('--quiet',
-                    action='store_true',
-                    help='suppress output except failures and progress')
-parser.add_argument('--rerun', action='store_true', help='rerun all tests that fail')
-parser.add_argument('--rr', action='store_true', help='run Verilator executable with rr')
-parser.add_argument('--rrsim', action='store_true', help='run Verilated executable with rr')
-parser.add_argument('--sanitize', action='store_true', help='run address sanitizer')
-parser.add_argument('--site', action='store_true', help='include VERILATOR_TEST_SITE test list')
-parser.add_argument('--stop', action='store_true', help='stop on the first error')
-parser.add_argument('--trace', action='store_true', help='enable simulator waveform tracing')
-parser.add_argument('--verbose', action='store_true', help='compile and run test in verbose mode')
-parser.add_argument(
-    '--verilation',  # -no-verilation undocumented debugging
-    action='store_true',
-    default=True,
-    help="don't run verilator compile() phase")
-parser.add_argument('--verilated-debug',
-                    action='store_true',
-                    help='enable Verilated executable debug')
-## Scenarios
-for scen, v in All_Scenarios.items():
-    parser.add_argument('--' + scen,
-                        dest='scenarios',
-                        action='append_const',
-                        const=scen,
-                        help='scenario-enable ' + scen)
+    if Args.debug:
+        Arg_Driver_Verilator_Flags.append("--debug --no-skip-identical")
+        logging.basicConfig(level=logging.DEBUG)
+        logging.info("In driver.py, ARGV=" + ' '.join(sys.argv))

-(Args, rest) = parser.parse_known_intermixed_args()
+    # Use Args for some global information, so gets passed to forked process
+    Args.interactive_debugger = Args.gdb or Args.gdbsim or Args.rr or Args.rrsim
+    if Args.jobs > 1 and Args.interactive_debugger:
+        sys.exit("%Error: Unable to use -j > 1 with --gdb* and --rr* options")

-for arg in rest:
-    _parameter(arg)
+    if Args.golden:
+        os.environ['HARNESS_UPDATE_GOLDEN'] = '1'
+    if Args.jobs == 0:
+        Args.jobs = 1 if Args.interactive_debugger else calc_jobs()
+    if not Args.scenarios:
+        Args.scenarios = []
+        Args.scenarios.append('dist')
+        Args.scenarios.append('vlt')

-if Args.debug:
-    Arg_Driver_Verilator_Flags.append("--debug --no-skip-identical")
-    logging.basicConfig(level=logging.DEBUG)
-    logging.info("In driver.py, ARGV=" + ' '.join(sys.argv))
+    Args.orig_argv_sw = []
+    for arg in sys.argv:
+        if re.match(r'^-', arg) and not re.match(r'^-j', arg):
+            Args.orig_argv_sw.append(arg)

-interactive_debugger = Args.gdb or Args.gdbsim or Args.rr or Args.rrsim
-if Args.jobs > 1 and interactive_debugger:
-    sys.exit("%Error: Unable to use -j > 1 with --gdb* and --rr* options")
+    Args.test_dirs = ["t"]
+    if 'VERILATOR_TESTS_SITE' in os.environ:
+        if Args.site or len(Arg_Tests) >= 1:
+            for tdir in os.environ['VERILATOR_TESTS_SITE'].split(':'):
+                Args.test_dirs.append(tdir)

-if Args.golden:
-    os.environ['HARNESS_UPDATE_GOLDEN'] = '1'
-if Args.jobs == 0:
-    Args.jobs = 1 if interactive_debugger else calc_jobs()
-if not Args.scenarios:
-    Args.scenarios = []
-    Args.scenarios.append('dist')
-    Args.scenarios.append('vlt')
+    if not Arg_Tests:  # Run everything
+        uniq = {}
+        for tdir in Args.test_dirs:
+            # Uniquify by inode, so different paths to same place get combined
+            stats = os.stat(tdir)
+            if stats.st_ino not in uniq:
+                uniq[stats.st_ino] = 1
+                Arg_Tests += sorted(glob.glob(tdir + "/t_*.py"))
+    if Args.hashset:
+        _calc_hashset()

-forker = Forker(Args.jobs)
+    # Number of retries when reading logfiles, generally only need many
+    # retries when system is busy running a lot of tests
+    Args.log_retries = 10 if (len(Arg_Tests) > 3) else 2

-Test_Dirs = ["t"]
-if 'VERILATOR_TESTS_SITE' in os.environ:
-    if Args.site or len(Arg_Tests) >= 1:
-        for tdir in os.environ['VERILATOR_TESTS_SITE'].split(':'):
-            Test_Dirs.append(tdir)
+    forker = Forker(Args.jobs)

-if not Arg_Tests:  # Run everything
-    uniq = {}
-    for tdir in Test_Dirs:
-        # Uniquify by inode, so different paths to same place get combined
-        stats = os.stat(tdir)
-        if stats.st_ino not in uniq:
-            uniq[stats.st_ino] = 1
-            Arg_Tests += sorted(glob.glob(tdir + "/t_*.py"))
-if Args.hashset:
-    _calc_hashset()
+    if len(Arg_Tests) >= 2 and Args.jobs >= 2:
+        # Read supported into master process, so don't call every subprocess
+        Capabilities.warmup_cache()
+        # Without this tests such as t_debug_sigsegv_bt_bad.py will occasionally
+        # block on input and cause a SIGSTOP, then a "fg" was needed to resume testing.
+        print("== Many jobs; redirecting STDIN", file=sys.stderr)
+        #
+        sys.stdin = open("/dev/null", 'r', encoding="utf8")  # pylint: disable=consider-using-with

-if len(Arg_Tests) >= 2 and Args.jobs >= 2:
-    # Read supported into master process, so don't call every subprocess
-    Capabilities.warmup_cache()
-    # Without this tests such as t_debug_sigsegv_bt_bad.py will occasionally
-    # block on input and cause a SIGSTOP, then a "fg" was needed to resume testing.
-    print("== Many jobs; redirecting STDIN", file=sys.stderr)
-    #
-    sys.stdin = open("/dev/null", 'r', encoding="utf8")  # pylint: disable=consider-using-with
-
-run_them()
+    run_them()