MAJOR: Add multithreaded model generation.

This commit is contained in:
Wilson Snyder 2018-07-22 20:54:28 -04:00
parent 0070520edb
commit ec8dbbffed
48 changed files with 5949 additions and 71 deletions

View File

@ -5,15 +5,18 @@ The contributors that suggested a given feature are shown in []. Thanks!
* Verilator 4.000 devel
** This is a major release. Any patches may require major rework to apply.
[Thanks everyone]
** Add multithreaded model generation.
** Add runtime arguments.
** Fix internals to be C++ null-pointer-check clean.
*** Better optimize large always block splitting, bug1244. [John Coiner]
*** Add new reloop optimization for repetitive assignment compression.
**** Fix internals to be C++ null-pointer-check clean.
**** Fix internals to avoid 'using namespace std'.
**** Fix Verilation performance issues, bug1316. [John Coiner]

View File

@ -120,6 +120,7 @@ DISTFILES_INC = $(INFOS) .gitignore Artistic COPYING COPYING.LESSER \
bin/verilator \
bin/verilator_coverage \
bin/verilator_difftree \
bin/verilator_gantt \
bin/verilator_includer \
bin/verilator_profcfunc \
doxygen-mainpage doxygen.config veripool-logo.png \
@ -154,6 +155,7 @@ DISTFILES_INC = $(INFOS) .gitignore Artistic COPYING COPYING.LESSER \
INST_PROJ_FILES = \
bin/verilator \
bin/verilator_coverage \
bin/verilator_gantt \
bin/verilator_includer \
bin/verilator_profcfunc \
include/verilated.mk \
@ -272,12 +274,12 @@ internals.pdf: internals.pod Makefile
# See uninstall also - don't put wildcards in this variable, it might uninstall other stuff
VL_INST_BIN_FILES = verilator verilator_bin verilator_bin_dbg verilator_coverage_bin_dbg \
verilator_coverage verilator_includer verilator_profcfunc
verilator_coverage verilator_gantt verilator_includer verilator_profcfunc
# Some scripts go into both the search path and pkgdatadir,
# so they can be found by the user, and under $VERILATOR_ROOT.
# See uninstall also - don't put wildcards in this variable, it might uninstall other stuff
VL_INST_MAN_FILES = verilator.1 verilator_coverage.1 verilator_profcfunc.1
VL_INST_MAN_FILES = verilator.1 verilator_coverage.1 verilator_gantt.1 verilator_profcfunc.1
VL_INST_INC_BLDDIR_FILES = \
include/verilated_config.h \
@ -295,6 +297,7 @@ installbin:
$(SHELL) ${srcdir}/mkinstalldirs $(DESTDIR)$(bindir)
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator $(DESTDIR)$(bindir)/verilator )
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_coverage $(DESTDIR)$(bindir)/verilator_coverage )
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_gantt $(DESTDIR)$(bindir)/verilator_gantt )
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_profcfunc $(DESTDIR)$(bindir)/verilator_profcfunc )
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_bin $(DESTDIR)$(bindir)/verilator_bin )
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_bin_dbg $(DESTDIR)$(bindir)/verilator_bin_dbg )

View File

@ -338,6 +338,7 @@ detailed descriptions in L</"VERILATION ARGUMENTS"> for more information.
--pipe-filter <command> Filter all input through a script
--prefix <topname> Name of top level class
--prof-cfuncs Name functions for profiling
--prof-threads Enable generating gantt chart data for threads
--private Debugging; see docs
--public Debugging; see docs
-pvalue+<name>=<value> Overwrite toplevel parameter
@ -350,6 +351,9 @@ detailed descriptions in L</"VERILATION ARGUMENTS"> for more information.
--stats-vars Provide statistics on variables
-sv Enable SystemVerilog parsing
+systemverilogext+<ext> Synonym for +1800-2017ext+<ext>
--threads <threads> Enable multithreading
--threads-dpi <mode> Enable multithreaded DPI
--threads-max-mtasks <mtasks> Tune maximum mtask partitioning
--top-module <topname> Name of top level input module
--trace Enable waveform creation
--trace-depth <levels> Depth of tracing
@ -386,6 +390,9 @@ detailed descriptions in L</"RUNTIME ARGUMENTS"> for more information.
+verilator+debug Enable debugging
+verilator+debugi+<value> Enable debugging at a level
+verilator+help Display help
+verilator+prof+threads+file+I<filename> Set profile filename
+verilator+prof+threads+start+I<value> Set profile starting point
+verilator+prof+threads+window+I<value> Set profile duration
+verilator+rand+reset+<value> Set random reset technique
+verilator+V Verbose version and config
+verilator+version Show version and exit
@ -1080,6 +1087,18 @@ Verilog module and line number the statement came from. This allows gprof
or oprofile reports to be correlated with the original Verilog source
statements. See also L<verilator_profcfunc>.
=item --prof-threads
Enable gantt chart data collection for threaded builds.
Verilator will record the start and end time of each macro-task across a
number of calls to eval. (What is a macro-task? See the Verilator internals
document.)
When profiling is enabled, the runtime will emit a blurb of profiling data
in non-human-friendly form. The C<verilator_gantt> script will transform
this into a nicer visual format and produce some related statistics.
=item --private
Opposite of --public. Is the default; this option exists for backwards
@ -1134,7 +1153,10 @@ Enable including save and restore functions in the generated model.
The user code must create a VerilatedSerialize or VerilatedDeserialze
object then calling the << or >> operators on the generated model and any
other data the process needs saved/restored. For example:
other data the process needs saved/restored. These functions are not
thread safe, and are typically called only by a main thread.
For example:
void save_model(const char* filenamep) {
VerilatedSave os;
@ -1173,6 +1195,42 @@ compatibility with other simulators.
A synonym for C<+1800-2017ext+>I<ext>.
=item --threads I<threads>
=item --no-threads
With --threads 0 or --no-threads, the default, the generated model is not
thread safe. With --threads 1, the generated model is single threaded but
may run in a multithreaded environment. With --threads N, where N >= 2, the
model is generated to run multithreaded on up to N threads. See
L</"MULTITHREADING">.
=item --threads-dpi all
=item --threads-dpi none
=item --threads-dpi pure
When using --dpi with --threads, control what DPI tasks are thread safe.
With --threads-dpi all, enable Verilator to assume all DPI imports are
threadsafe, and to use thread-local storage for communication with DPI,
potentially improving performance. Any DPI libraries need appropriate
mutexes to avoid undefined behavior.
With --threads-dpi none, Verilator assumes DPI imports are not thread safe,
and Verilator will serialize calls to DPI imports by default, potentially
harming performance.
With --threads-dpi pure, the default, Verilator assumes DPI pure imports
are threadsafe, but non-pure DPI imports are not.
=item --threads-max-mtasks I<value>
Rarely needed. When using --threads, specify the number of mtasks the
model is to be partitioned into. If unspecified, Verilator approximates a
good value.
=item --top-module I<topname>
When the input Verilog contains more than one top level module, specifies
@ -1464,6 +1522,28 @@ Enable debugging at the provided level.
Display help and exit.
=item +verilator+prof+threads+file+I<filename>
When using --prof-threads, the filename to dump to. Defaults to
"profile_threads.dat".
=item +verilator+prof+threads+start+I<value>
When using --prof-threads, Verilator will wait until $time is at this
value, then start the profiling warmup, then capturing. Generally this
should be set to some time that is well within the normal operation of the
simulation, i.e. outside of reset. If 0, the dump is disabled. Defaults to
1.
=item +verilator+prof+threads+window+I<value>
When using --prof-threads, after $time reaches
+verilator+prof+threads+start, Verilator will warm up the profiling for
this number of eval() calls, then will capture the profiling of this number
of eval() calls. Defaults to 2, which makes sense for a
single-clock-domain module where it's typical to want to capture one
posedge eval() and one negedge eval().
=item +verilator+rand+reset+I<value>
When a model was Verilated using "--x-initial unique", sets the
@ -1635,6 +1715,9 @@ compile times, and --x-assign=fast --x-initial=fast may increase the risk
of reset bugs in trade for performance; see the above documentation for
these flags.
If using Verilated multithreaded, use C<numactl> to ensure you are using
non-conflicting hardware resources. See L</"MULTITHREADING">.
Minor Verilog code changes can also give big wins. You should not have any
UNOPTFLAT warnings from Verilator. Fixing these warnings can result in
huge improvements; one user fixed their one UNOPTFLAT warning by making a
@ -2176,6 +2259,89 @@ the names of the .cpp files to compile in from the make variables generated
in obj_dir/Vour_classes.mk.
=head1 MULTITHREADING
Verilator experimentally supports multithreading.
With --no-threads, the default, the model is not thread safe, and any use
of more than one thread calling into one or even different Verilated models
may result in unpredictable behavior. This gives the highest single thread
performance.
With --threads 1, the generated model is single threaded, however the
support libraries are multithread safe. This allows different
instantiations of model(s) to potentially each be run under a different
thread. All threading is the responsibility of the user's C++ testbench.
With --threads N, where N is at least 2, the generated model will be
designed to run in parallel on N threads. The thread calling eval()
provides one of those threads, and the generated model will create and
manage the other N-1 threads. It's the client's responsibility not to
oversubscribe the available CPU cores. Under CPU oversubscription, the
Verilated model should not livelock nor deadlock, however, you can expect
performance to be far worse than it would be with proper stoichiometry of
threads and CPU cores.
The remainder of this section describes behavior with --threads 1 or
--threads N (not --no-threads).
VL_THREADED is defined when compiling a threaded Verilated module, causing
the Verilated support classes to become threadsafe.
The thread used for constructing a model must be the same thread that
calls eval() into the model, this is called the "eval thread". The thread
used to perform certain global operations such as saving and tracing must
be done by a "main thread". In most cases the eval thread and main thread
are the same thread (i.e. the user's top C++ testbench runs on a single
thread), but this is not required.
When running a multithreaded model, the default Linux task scheduler often
works against the model, by assuming threads are short lived, and thus
often schedules threads using multiple hyperthreads within the same
physical core. For best performance use the C<numactl> program to (when the
threading count fits) select unique physical cores on the same socket. For
example, if a model was Verilated with "--threads 4", we consult
egrep 'processor|physical id|core id' /proc/cpuinfo
To select cores 0, 1, 2, and 3 that are all located on the same socket (0)
but different physical cores. (Also useful is "numactl --hardware", or
C<lscpu>, but those don't show Hyperthreading cores.) Then we execute
numactl -m 0 -C 0,1,2,3 -- verilated_executable_name
This will limit memory to socket 0, and threads to cores 0, 1, 2, 3,
(presumably on socket 0) optimizing performance. Of course this must be
adjusted if you want another simulator using e.g. socket 1, or if you
Verilated with a different number of threads. To see what CPUs are
actually used, use --prof-threads.
=head2 Multithreaded Verilog and Library Support
$display/$stop/$finish are delayed until the end of an eval() call in order
to maintain ordering between threads. This may result in additional tasks
completing after the $stop or $finish.
If using --coverage, the coverage routines are fully thread safe.
If using --dpi, Verilator assumes pure DPI imports are thread safe,
balancing performance versus safety. See --threads-dpi.
If using --savable, the save/restore classes are not multithreaded and
must be called only by the eval thread.
If using --sc, the SystemC kernel is not thread safe, therefore the eval
thread and main thread must be the same.
If using --trace, the tracing classes must be constructed and called from
the main thread.
If using --vpi, since SystemVerilog VPI was not architected by IEEE to be
multithreaded, Verilator requires all VPI calls are only made from the main
thread.
=back
=head1 CONFIGURATION FILES
In addition to the command line, warnings and other features may be
@ -3636,6 +3802,21 @@ section for more details.
Ignoring this warning will only slow simulations, it will simulate
correctly.
=item UNOPTTHREADS
Warns that the thread scheduler was unable to partition the design to fill
the requested number of threads.
One workaround is to request fewer threads with C<--threads>.
Another possible workaround is to allow more MTasks in the runtime, by
increasing the value of --threads-max-mtasks. More MTasks will result in
more communication and synchronization overhead at runtime; the scheduler
attempts to minimize the number of MTasks for this reason.
Ignoring this warning will only slow simulations, it will simulate
correctly.
=item UNPACKED
Warns that unpacked structs and unions are not supported.
@ -4185,6 +4366,8 @@ performance gain.
In 2009, major SystemVerilog and DPI language support was added.
In 2018, Verilator 4.000 was released with multithreaded support.
Currently, various language features and performance enhancements are added
as the need arises. Verilator is now about 3x faster than in 2002, and is
faster than many popular commercial simulators.
@ -4282,7 +4465,7 @@ License Version 2.0.
=head1 SEE ALSO
L<verilator_coverage>, L<verilator_profcfunc>, L<make>,
L<verilator_coverage>, L<verilator_gantt>, L<verilator_profcfunc>, L<make>,
L<verilator --help> which is the source for this document,

559
bin/verilator_gantt Executable file
View File

@ -0,0 +1,559 @@
: # -*-Mode: perl;-*- use perl, wherever it is
eval 'exec perl -wS $0 ${1+"$@"}'
    if 0;
# See copyright, etc in below POD section.
######################################################################

use strict;
use warnings;
use Getopt::Long;
use IO::File;   # Bugfix: required for IO::File->new and ->autoflush below
use Pod::Usage;

our $Debug = 0;  # Replaces obsolete "use vars qw($Debug)"

my $Opt_File;                  # Input data filename; default set below
my $Opt_Time_Per_Char = 0;     # rdtsc ticks per char in gantt chart, 0=auto
my $opt_vcd = "profile_threads.vcd";  # VCD output filename; undef = no VCD

# Parsed profile data, shared with the report/VCD subs:
our %Threads;  # {thread}{start_time} = {mtask=>, end=>, cpu=>}
our %Mtasks;   # {mtask} = {elapsed=>, predict=>, end=>}
our %Global;   # args/stats/rdtsc_cycle_time from the data file

STDOUT->autoflush(1);
STDERR->autoflush(1);

Getopt::Long::config("no_auto_abbrev");
if (! GetOptions(
          "help"    => \&usage,
          "scale=i" => \$Opt_Time_Per_Char,
          "debug"   => sub { $Debug = 1; },
          "vcd=s"   => \$opt_vcd,
          "no-vcd!" => sub { $opt_vcd = undef; },
          "<>"      => \&parameter,   # First bare argument is the filename
    )) {
    die "%Error: Bad usage, try 'verilator_gantt --help'\n";
}

$Opt_File = "profile_threads.dat" if !defined $Opt_File;

process($Opt_File);
write_vcd($opt_vcd) if defined $opt_vcd;
exit(0);
#######################################################################
sub usage {
    # Display the full embedded POD documentation, then exit.
    pod2usage(-exitval => 2, -output => \*STDOUT, -verbose => 2);
    exit(1);  # Not reached; pod2usage exits with status 2
}
sub parameter {
    # Getopt::Long "<>" callback: the first bare argument names the data
    # file; any additional bare argument is a usage error.
    my ($param) = @_;
    if (defined $Opt_File) {
        die "%Error: Unknown parameter: $param\n";
    }
    $Opt_File = $param;
}
#######################################################################
sub process {
    # Top-level driver: parse the profile data file, then print the report.
    my ($filename) = @_;
    read_data($filename);
    report();
}
#######################################################################
sub read_data {
    # Parse the VLPROF lines of a profile_threads.dat file, populating the
    # %Threads, %Mtasks and %Global package structures that report() and
    # write_vcd() later read.  Unrecognized lines are ignored (printed
    # only under --debug).
    my $filename = shift;
    %Global = (rdtsc_cycle_time => 0);
    my $fh = IO::File->new($filename) or die "%Error: $! $filename,";
    while (my $line = $fh->getline) {
        if ($line =~ m/VLPROF mtask\s(\d+)\sstart\s(\d+)\send\s(\d+)\selapsed\s(\d+)\spredict_time\s(\d+)\scpu\s(\d+)\son thread (\d+)/) {
            my $mtask = $1;
            my $start = $2;
            my $end = $3;
            my $elapsed_time = $4;
            my $predict_time = $5;
            my $cpu = $6;
            my $thread = $7;
            $Threads{$thread}{$start}{mtask} = $mtask;
            $Threads{$thread}{$start}{end} = $end;
            $Threads{$thread}{$start}{cpu} = $cpu;
            if (!exists $Mtasks{$mtask}{elapsed}) {
                $Mtasks{$mtask}{elapsed} = 0;
            }
            # An mtask executes once per eval(); accumulate across evals
            $Mtasks{$mtask}{elapsed} += $elapsed_time;
            $Mtasks{$mtask}{predict} = $predict_time;
            $Mtasks{$mtask}{end} = max($Mtasks{$mtask}{end}, $end);
        }
        elsif ($line =~ /^VLPROFTHREAD/) {}
        # Bugfix: the value class was "([0-9.])" with no '+' quantifier,
        # so any multi-digit argument value (e.g. "--threads 12") failed
        # to match and was silently dropped from the report.
        elsif ($line =~ m/VLPROF arg\s+(\S+)\+([0-9.]+)\s*$/
               || $line =~ m/VLPROF arg\s+(\S+)\s+([0-9.]+)\s*$/) {
            $Global{args}{$1} = $2;
        }
        elsif ($line =~ m/VLPROF stat\s+(\S+)\s+([0-9.]+)/) {
            $Global{stats}{$1} = $2;
        }
        elsif ($line =~ /^#/) {}
        elsif ($Debug) {
            chomp $line;
            print "Unk: $line\n";
        }
        # TODO -- this is parsing text printed by a client.
        # Really, verilator proper should generate this
        # if it's useful...
        if ($line =~ m/rdtsc time = (\d+) ticks/) {
            $Global{rdtsc_cycle_time} = $1;
        }
    }
}
sub report {
    # Print the human-readable report to STDOUT: argument settings, gantt
    # graph, utilization analysis, and predict-vs-elapsed statistics.
    # Reads %Global, %Threads and %Mtasks as filled in by read_data().
print "Verilator Gantt report\n";
print "\nArgument settings:\n";
foreach my $arg (sort keys %{$Global{args}}) {
my $plus = ($arg =~ /^\+/) ? "+" : " ";
printf "  %s%s%d\n", $arg, $plus, $Global{args}{$arg};
}
my $nthreads = scalar keys %Threads;
$Global{cpus}{cpu_time} = {};
foreach my $thread (keys %Threads) {
# Accumulate how long each CPU spent executing mtasks
foreach my $start (keys %{$Threads{$thread}}) {
my $cpu = $Threads{$thread}{$start}{cpu};
my $elapsed = $Threads{$thread}{$start}{end} - $start;
$Global{cpus}{cpu_time}{$cpu} += $elapsed;
}
}
# Aggregate per-mtask totals: sum of all mtask times, last finishing
# time, and the single longest mtask (critical-path lower bound)
my $mt_mtask_time = 0;
my $long_mtask_time = 0;
my $last_end = 0;
foreach my $mtask (keys %Mtasks) {
$mt_mtask_time += $Mtasks{$mtask}{elapsed};
$last_end = max($last_end, $Mtasks{$mtask}{end});
$long_mtask_time = max($long_mtask_time, $Mtasks{$mtask}{elapsed});
}
$Global{last_end} = $last_end;
report_graph();
# If we know cycle time in the same (rdtsc) units,
# this will give us an actual utilization number,
# (how effectively we keep the cores busy.)
#
# It also gives us a number we can compare against
# serial mode, to estimate the overhead of data sharing,
# which will show up in the total elapsed time. (Overhead
# of synchronization and scheduling should not.)
print "\nAnalysis:\n";
printf "  Total threads             = %d\n", $nthreads;
printf "  Total mtasks              = %d\n", scalar (keys %Mtasks);
printf "  Total cpus used           = %d\n", scalar (keys %{$Global{cpus}});
printf "  Total yields              = %d\n", $Global{stats}{yields};
printf "  Total eval time           = %d rdtsc ticks\n", $Global{last_end};
printf "  Longest mtask time        = %d rdtsc ticks\n", $long_mtask_time;
printf "  All-thread mtask time     = %d rdtsc ticks\n", $mt_mtask_time;
# NOTE(review): divides by last_end; an empty data file (last_end == 0)
# would die here -- TODO confirm that is acceptable for this script
my $long_efficiency = $long_mtask_time/($Global{last_end});
printf "  Longest-thread efficiency = %0.1f%%\n", $long_efficiency*100;
my $mt_efficiency = $mt_mtask_time/($Global{last_end}*$nthreads);
printf "  All-thread efficiency     = %0.1f%%\n", $mt_efficiency*100;
printf "  All-thread speedup        = %0.1f\n", $mt_efficiency*$nthreads;
if ($Global{rdtsc_cycle_time} > 0) {
my $ut = $mt_mtask_time / $Global{rdtsc_cycle_time};
print "tot_mtask_cpu=$mt_mtask_time cyc=$Global{rdtsc_cycle_time} ut=$ut\n";
}
# Compare scheduler-predicted vs measured cost per mtask, in log space,
# tracking the extreme over- and under-predictions
my @p2e_ratios;
my $min_p2e = 1000000;
my $min_mtask;
my $max_p2e = -1000000;
my $max_mtask;
foreach my $mtask (sort keys %Mtasks) {
if ($Mtasks{$mtask}{elapsed} > 0) {
if ($Mtasks{$mtask}{predict} == 0) {
$Mtasks{$mtask}{predict} = 1;  # don't log(0) below
}
my $p2e_ratio = log( $Mtasks{$mtask}{predict} / $Mtasks{$mtask}{elapsed} );
#print "log(p2e $mtask) = $p2e_ratio (predict $Mtasks{$mtask}{predict}, elapsed $Mtasks{$mtask}{elapsed})\n";
push @p2e_ratios, $p2e_ratio;
if ($p2e_ratio > $max_p2e) {
$max_p2e = $p2e_ratio;
$max_mtask = $mtask;
}
if ($p2e_ratio < $min_p2e) {
$min_p2e = $p2e_ratio;
$min_mtask = $mtask;
}
}
}
print "\nStatistics:\n";
print "  min log(p2e) = $min_p2e  from mtask $min_mtask (predict $Mtasks{$min_mtask}{predict}, elapsed $Mtasks{$min_mtask}{elapsed})\n";
print "  max log(p2e) = $max_p2e  from mtask $max_mtask (predict $Mtasks{$max_mtask}{predict}, elapsed $Mtasks{$max_mtask}{elapsed})\n";
my $stddev = stddev(\@p2e_ratios);
my $mean = mean(\@p2e_ratios);
print "  mean = " . ($mean) . "\n";
print "  stddev = " . ($stddev) . "\n";
print "  e ^ stddev = " . exp($stddev). "\n";
print "\n";
}
sub report_graph {
    # Pick a time-per-character scale for the ASCII gantt graph (halving
    # until no two mtasks collide in a column, unless --scale was given),
    # then print the graph, one row per thread.
my $time_per = $Opt_Time_Per_Char;
if ($time_per == 0) {
$time_per = ($Global{last_end} / 40);  # Start with 40 columns
while ($time_per > 10) {
my ($graph, $conflicts) = _make_graph($time_per);
last if !$conflicts;
$time_per = int($time_per/2);
}
# One more step so we can fit more labels
$time_per = int($time_per/2);
}
# Rebuild the graph at the final chosen scale
my ($graph, $conflicts) = _make_graph($time_per);
print "\nThread gantt graph:\n";
print "  Legend: One character width = $time_per rdtsc ticks\n";
print "  Legend: '&' = multiple mtasks in this period (character width)\n";
my $scale = "   <-".$Global{last_end}." rdtsc total";
for (my $col = length($scale);  # -2 for '->' below
$col < ($Global{last_end}/$time_per); ++$col) {
$scale .= "-";
}
print "  $scale->\n";
foreach my $thread (sort keys %{$graph}) {
print "  t: ";
_print_graph_line($graph->{$thread}, '');
}
}
sub _make_graph {
    # Build the gantt graph at a given scale ($time_per rdtsc ticks per
    # character column).  Returns ($graph, $conflicts): $graph maps
    # thread -> array of columns, each with a {char}; $conflicts counts
    # columns where two or more mtasks started (used by report_graph()
    # to decide whether to try a finer scale).
my $time_per = shift;
my $graph = {};  # {thread}{column}{char=>'x' or chars=>#}
my $conflicts = 0;
foreach my $thread (keys %Threads) {
# Make potentially multiple characters per column
foreach my $start (sort {$a <=> $b} keys %{$Threads{$thread}}) {
my $end = $Threads{$thread}{$start}{end};
my $mtask = $Threads{$thread}{$start}{mtask};
my $cpu = $Threads{$thread}{$start}{cpu};
my $startcol = _time_col($time_per, $start);
my $endcol = _time_col($time_per, $end);
# Build a "[<cpu>----]" label sized to the mtask's scaled duration
my $label = "[";
$label .= "$cpu";  # Maybe make optional in future
my $width = $endcol - $startcol + 1;
while (length($label) < ($width-1)) {  # -1 for ']'
$label .= "-";
}
$label .= "]";
$graph->{$thread}[$startcol]{char} .= $label;
}
if ($Debug) {
print "# Multicol: "; _print_graph_line($graph->{$thread}, '|');
}
# Expand line to one char per column
for (my $col = 0; $col <= $#{$graph->{$thread}}; ++$col) {
if (my $chars = $graph->{$thread}[$col]{char}) {
# $ok: the label's extra characters can spill into the following
# columns without overwriting another mtask's start column
my $ok = 1;
for (my $coladd = 1; $coladd<length($chars); ++$coladd) {
if ($graph->{$thread}[$col + $coladd]{char}) {
$ok = 0; last;
}
}
if (!$ok) {
if ($chars =~ /\[.*\[/) {  # Two begins or more
$conflicts++;
$graph->{$thread}[$col]{char} = "&";
} else {
$graph->{$thread}[$col]{char} = "[";
}
# Fill following free columns with 'x' up to the next used one
for (my $coladd = 1; $coladd<length($chars); ++$coladd) {
if ($graph->{$thread}[$col + $coladd]{char}) {
last;
} else {
$graph->{$thread}[$col + $coladd]{char} = 'x';
}
}
} else {
# Room available: spread the label one character per column
my $coladd = 0;
foreach my $char (split //, $chars) {
$graph->{$thread}[$col+$coladd]{char} = $char;
++$coladd;
}
}
}
}
if ($Debug) {
print "# Singlcol: "; _print_graph_line($graph->{$thread}, '|');
}
}
print "# Conflicts $conflicts\n" if $Debug;
return ($graph, $conflicts);
}
sub _print_graph_line {
    # Emit one thread's row of the gantt graph, printing $sep after each
    # column's character (blank for columns with no data).
    my ($graph_thread, $sep) = @_;
    foreach my $col (0 .. $#{$graph_thread}) {
        my $cell = $graph_thread->[$col]{char};
        $cell = ' ' unless defined $cell;
        print $cell, $sep;
    }
    print "\n";
}
sub _time_col {
    # Map an absolute rdtsc time onto a zero-based graph column index,
    # given the per-column tick width.
    my ($time_per, $time) = @_;
    return int($time / $time_per);
}
#######################################################################
sub write_vcd {
    # Dump the collected schedule as a VCD (value change dump) waveform
    # file, viewable in e.g. GTKWave.  Emitted signals: per-thread mtask
    # id, per-cpu thread id, per-mtask cpu id, and overall parallelism.
    my $filename = shift;
    print "Writing $filename\n";
    my $fh = IO::File->new(">$filename") or die "%Error: $! $filename,";
    my $vcd = {values => {},  # {<time>}{<code>} = value
               sigs => {},    # {<module>}{<sig>} = code
               code => 0,     # Next free VCD identifier code
    };
    my %parallelism;  # {<time>} = delta in number of active mtasks
    foreach my $thread (keys %Threads) {
        # Signal per thread: which mtask it is executing
        my $mcode = ($vcd->{sigs}{threads}{"thread${thread}_mtask"} ||= $vcd->{code}++);
        foreach my $start (sort {$a <=> $b} keys %{$Threads{$thread}}) {
            my $end = $Threads{$thread}{$start}{end};
            my $mtask = $Threads{$thread}{$start}{mtask};
            my $cpu = $Threads{$thread}{$start}{cpu};
            $vcd->{values}{$start}{$mcode} = $mtask;
            $vcd->{values}{$end}{$mcode} = undef;
            $parallelism{$start}++;
            $parallelism{$end}--;
            # Signal per CPU: which thread is running on it
            my $ccode = $vcd->{sigs}{cpus}{"cpu${cpu}_thread"} ||= $vcd->{code}++;
            $vcd->{values}{$start}{$ccode} = $thread;
            $vcd->{values}{$end}{$ccode} = undef;
            # Signal per mtask: which CPU it ran on.  Bugfix: this was a
            # second "my $mcode", masking the thread-signal code declared
            # above ('"my" variable masks earlier declaration' warning).
            my $mtcode = $vcd->{sigs}{mtasks}{"mtask${mtask}_cpu"} ||= $vcd->{code}++;
            $vcd->{values}{$start}{$mtcode} = $cpu;
            $vcd->{values}{$end}{$mtcode} = undef;
        }
    }
    {
        # Parallelism is the running sum of the start(+1)/end(-1) deltas
        my $pcode = ($vcd->{sigs}{Stats}{"parallelism"} ||= $vcd->{code}++);
        my $value = 0;
        foreach my $time (sort {$a<=>$b} keys %parallelism) {
            $value += $parallelism{$time};
            $vcd->{values}{$time}{$pcode} = $value;
        }
    }
    # VCD header: one scope per signal group, all signals 32-bit wires
    $fh->print('$version Generated by verilator_gantt $end'."\n");
    $fh->print('$timescale 1ns $end'."\n");
    $fh->print("\n");
    my %all_codes;
    $fh->print(' $scope module gantt $end'."\n");
    foreach my $module (sort keys %{$vcd->{sigs}}) {
        $fh->printf(' $scope module %s $end'."\n", $module);
        foreach my $sig (sort keys %{$vcd->{sigs}{$module}}) {
            my $code = $vcd->{sigs}{$module}{$sig};
            $fh->printf('  $var wire 32 v%x %s [31:0] $end'."\n",
                        $code, $sig);
            $all_codes{$code} = 1;
        }
        $fh->print(' $upscope $end'."\n");
    }
    $fh->print(' $upscope $end'."\n");
    $fh->print('$enddefinitions $end'."\n");
    $fh->print("\n");
    # Value changes, in time order
    my $first = 1;
    foreach my $time (sort {$a <=> $b} keys %{$vcd->{values}}) {
        if ($first) {
            $first = 0;
            # Start with Z for any signals without time zero data; storing
            # undef creates the key so the loop below emits "bz" for it
            foreach my $code (keys %all_codes) {
                if (!defined $vcd->{values}{$time}{$code}) {
                    $vcd->{values}{$time}{$code} = undef;
                }
            }
        }
        $fh->printf("#%d\n", $time);
        foreach my $code (sort keys %{$vcd->{values}{$time}}) {
            my $value = $vcd->{values}{$time}{$code};
            if (defined $value) {
                $fh->printf("b%b v%x\n", $value, $code);
            } else {
                $fh->printf("bz v%x\n", $code);
            }
        }
    }
}
#######################################################################
# Similar to Statistics::Basic functions, but avoid a package dependency
sub max {
    # Numeric maximum of the argument list.  Mirrors the original exactly:
    # scanning stops at the first undef argument after the initial one,
    # and a leading undef is replaced by the first defined value.
    my $best = shift;
    foreach my $val (@_) {
        last if !defined $val;
        $best = $val if !defined($best) || $val > $best;
    }
    return $best;
}
sub mean {
    # Arithmetic mean of an array ref's values; undef for an empty list.
    my ($values) = @_;
    my $count = scalar @$values;
    return undef if !$count;
    my $total = 0;
    $total += $_ foreach @$values;
    return $total / $count;
}
sub stddev {
    # Population standard deviation of an array ref's values, computed as
    # sqrt(E[x^2] - E[x]^2); undef for an empty list.
    my ($values) = @_;
    my $count = scalar @$values;
    return undef if !$count;
    my ($total, $total_sq) = (0, 0);
    for my $val (@$values) {
        $total += $val;
        $total_sq += $val ** 2;
    }
    return sqrt(($total_sq / $count) - ($total / $count) ** 2);
}
#######################################################################
__END__
=pod
=head1 NAME
verilator_gantt - Create Gantt chart of multi-threaded execution
=head1 SYNOPSIS
Creates a visual representation to help analyze Verilator multithreaded
simulation performance, by showing when each macro-task starts and ends,
and showing when each thread is busy or idle.
The generated Gantt chart has time on the X-axis. Times shown are to the
scale printed, i.e. a certain amount of time for each character width. The
Y-axis shows threads, each thread's execution is shown on one line. That
line shows "[" at the position in time when it executes.
Following the "[" is the cpu number the task executed on, followed by zero
or more "-" to make the width of the characters match the scaled execution
time, followed by a "]". If the scale is too small, the cpu number and
mtask number will not be printed. If the scale is very small, a "&"
indicates multiple mtasks started at that time position.
Also creates a value change dump (VCD) format dump file which may be viewed
in a waveform viewer (e.g. C<GTKWave>). See below.
=head1 USAGE
Build with --prof-threads.
Run a sim with +verilator+prof+threads+window 2.
This will create profile_threads.dat.
Then run:
verilator_gantt profile_threads.dat
The report will be printed on standard output, this also generates
profile_threads.vcd
View profile_threads.vcd in a waveform viewer.
=head1 VCD SIGNALS
In waveforms there are the following signals. Most signals the "decimal"
format will remove the leading zeros and make the traces easier to read.
parallelism: The number of mtasks active at this time, for best performance
this will match the thread count. You may want to use an "analog step"
format to view this signal.
cpu#_thread: For the given CPU number, the thread number executing.
mtask#_cpu: For the given mtask id, the CPU it is executing on.
thread#_mtask: For the given thread number, the mtask id executing.
=head1 ARGUMENTS
=over 4
=item I<filename>
The filename to read data from, defaults to "profile_threads.dat".
=item --help
Displays this message and program version and exits.
=item --scale I<n>
On the X-axis of the generated Gantt chart, each character represents this
many time units. (On x86, time units are rdtsc ticks.) Defaults to 0,
which will automatically compute a reasonable scale where no two mtasks
need to fit into same character width's worth of scaled time.
=item --no-vcd
=item --vcd I<filename>
Set output filename for vcd dump, or disable. Default is
profile_threads.vcd.
=back
=head1 DISTRIBUTION
The latest version is available from L<http://www.veripool.org/>.
Copyright 2018-2018 by Wilson Snyder. Verilator is free software; you can
redistribute it and/or modify it under the terms of either the GNU Lesser
General Public License Version 3 or the Perl Artistic License Version 2.0.
=head1 AUTHORS
Wilson Snyder <wsnyder@wsnyder.org>
=head1 SEE ALSO
C<verilator>
=cut
######################################################################
### Local Variables:
### compile-command: "$V4/bin/verilator_gantt $V4/test_regress/obj_vltmt/t_gantt/vlt_sim.log"
### End:

View File

@ -38,6 +38,7 @@ VerilatedVoidCb Verilated::s_flushCb = NULL;
// Keep below together in one cache line
Verilated::Serialized Verilated::s_s;
Verilated::NonSerialized Verilated::s_ns;
VL_THREAD_LOCAL Verilated::ThreadLocal Verilated::t_s;
Verilated::CommandArgValues Verilated::s_args;
@ -196,6 +197,17 @@ Verilated::Serialized::Serialized() {
s_fatalOnVpiError = true; // retains old default behaviour
}
// Defaults match the documented +verilator+prof+threads+* runtime args
Verilated::NonSerialized::NonSerialized() {
s_profThreadsStart = 1;
s_profThreadsWindow = 2;
s_profThreadsFilenamep = strdup("profile_threads.dat");
}
Verilated::NonSerialized::~NonSerialized() {
if (s_profThreadsFilenamep) {
// Filename was strdup'ed (here or in profThreadsFilenamep()); free and
// null to avoid a dangling pointer
free(const_cast<char*>(s_profThreadsFilenamep)); s_profThreadsFilenamep=NULL;
}
}
//===========================================================================
// Random reset -- Only called at init time, so don't inline.
@ -1648,6 +1660,20 @@ void Verilated::fatalOnVpiError(bool flag) VL_MT_SAFE {
VerilatedLockGuard lock(m_mutex);
s_s.s_fatalOnVpiError = flag;
}
// Set profiling start time (see +verilator+prof+threads+start+); the API
// mutex serializes writers against other Verilated:: setters
void Verilated::profThreadsStart(vluint64_t flag) VL_MT_SAFE {
VerilatedLockGuard lock(m_mutex);
s_ns.s_profThreadsStart = flag;
}
// Set profiling window size in eval() calls (+verilator+prof+threads+window+)
void Verilated::profThreadsWindow(vluint64_t flag) VL_MT_SAFE {
VerilatedLockGuard lock(m_mutex);
s_ns.s_profThreadsWindow = flag;
}
// Set profiling output filename (+verilator+prof+threads+file+); takes a
// copy of the caller's string, freeing any previously-set name
void Verilated::profThreadsFilenamep(const char* flagp) VL_MT_SAFE {
VerilatedLockGuard lock(m_mutex);
if (s_ns.s_profThreadsFilenamep) free(const_cast<char*>(s_ns.s_profThreadsFilenamep));
s_ns.s_profThreadsFilenamep = strdup(flagp);
}
const char* Verilated::catName(const char* n1, const char* n2) VL_MT_SAFE {
// Returns new'ed data
@ -1800,6 +1826,15 @@ void VerilatedImp::commandArgVl(const std::string& arg) {
VL_PRINTF_MT("For help, please see 'verilator --help'\n");
VL_FATAL_MT("COMMAND_LINE", 0, "", "Exiting due to command line argument (not an error)");
}
else if (commandArgVlValue(arg, "+verilator+prof+threads+start+", value/*ref*/)) {
Verilated::profThreadsStart(atoll(value.c_str()));
}
else if (commandArgVlValue(arg, "+verilator+prof+threads+window+", value/*ref*/)) {
Verilated::profThreadsWindow(atol(value.c_str()));
}
else if (commandArgVlValue(arg, "+verilator+prof+threads+file+", value/*ref*/)) {
Verilated::profThreadsFilenamep(value.c_str());
}
else if (commandArgVlValue(arg, "+verilator+rand+reset+", value/*ref*/)) {
Verilated::randReset(atoi(value.c_str()));
}

View File

@ -344,6 +344,17 @@ class Verilated {
~Serialized() {}
} s_s;
static struct NonSerialized {  // Non-serialized information
// These are reloaded from command-line settings, so do not need to persist
// Fast path
vluint64_t s_profThreadsStart;  ///< +prof+threads starting time
vluint32_t s_profThreadsWindow;  ///< +prof+threads window size
// Slow path
const char* s_profThreadsFilenamep;  ///< +prof+threads filename (strdup'ed, freed in destructor)
NonSerialized();
~NonSerialized();
} s_ns;
// no need to be save-restored (serialized) the
// assumption is that the restore is allowed to pass different arguments
static struct CommandArgValues {
@ -409,6 +420,14 @@ public:
/// Enable/disable vpi fatal
static void fatalOnVpiError(bool flag) VL_MT_SAFE;
static bool fatalOnVpiError() VL_MT_SAFE { return s_s.s_fatalOnVpiError; }
/// --prof-threads related settings
/// Setters are defined in verilated.cpp (mutex-guarded);
/// getters read the non-serialized state directly.
static void profThreadsStart(vluint64_t flag) VL_MT_SAFE;
static vluint64_t profThreadsStart() VL_MT_SAFE { return s_ns.s_profThreadsStart; }
static void profThreadsWindow(vluint64_t flag) VL_MT_SAFE;
static vluint32_t profThreadsWindow() VL_MT_SAFE { return s_ns.s_profThreadsWindow; }
static void profThreadsFilenamep(const char* flagp) VL_MT_SAFE;
static const char* profThreadsFilenamep() VL_MT_SAFE { return s_ns.s_profThreadsFilenamep; }
/// Flush callback for VCD waves
static void flushCb(VerilatedVoidCb cb) VL_MT_SAFE;
static void flushCall() VL_MT_SAFE;

View File

@ -0,0 +1,229 @@
// -*- mode: C++; c-file-style: "cc-mode" -*-
//=============================================================================
//
// THIS MODULE IS PUBLICLY LICENSED
//
// Copyright 2012-2018 by Wilson Snyder. This program is free software;
// you can redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License Version 2.0.
//
// This is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
//=============================================================================
///
/// \file
/// \brief Thread pool for verilated modules
///
//=============================================================================
#include "verilatedos.h"
#include "verilated_threads.h"
#include <cstdio>
std::atomic<vluint64_t> VlNotification::s_yields;
VL_THREAD_LOCAL VlThreadPool::ProfileTrace* VlThreadPool::t_profilep = NULL;
//=============================================================================
// VlMTaskVertex
// Construct with the number of upstream dependencies that must
// complete before this mtask becomes ready (see signalUpstreamDone)
VlMTaskVertex::VlMTaskVertex(vluint32_t upstreamDepCount)
: m_upstreamDepsDone(0),
m_upstreamDepCount(upstreamDepCount) {
// The dependency counter is hammered on the fast path; require a
// lock-free atomic rather than silently falling back to a mutex
assert(atomic_is_lock_free(&m_upstreamDepsDone));
}
//=============================================================================
// VlWorkerThread
// Construct a worker bound to 'poolp'; spawns the underlying thread,
// which immediately enters workerLoop()
VlWorkerThread::VlWorkerThread(VlThreadPool* poolp, bool profiling)
: m_poolp(poolp)
, m_profiling(profiling)
, m_exiting(false)
// Must init this last -- after setting up fields that it might read:
, m_cthread(startWorker, this) {}
// Destroy the worker: signal exit, wake it if asleep, and join its thread
VlWorkerThread::~VlWorkerThread() {
m_exiting.store(true, std::memory_order_release);
{
// Wake under the lock so we cannot race workerLoop's
// sleep-decision critical section
VerilatedLockGuard lk(m_mutex);
if (sleeping()) {
wakeUp();
}
}
// The thread should exit; join it.
m_cthread.join();
}
// Worker thread main loop: repeatedly dequeue and execute work records
// until m_exiting is observed, sleeping on a VlNotification when idle
void VlWorkerThread::workerLoop() {
if (VL_UNLIKELY(m_profiling)) {
m_poolp->setupProfilingClientThread();
}
VlNotification alarm;
ExecRec work;
work.m_fnp = NULL;
while (1) {
bool sleep = false;
if (VL_UNLIKELY(!work.m_fnp)) {
// Look for work
VerilatedLockGuard lk(m_mutex);
if (VL_LIKELY(!m_ready.empty())) {
dequeWork(&work);
} else {
// No work available, prepare to sleep. Pass alarm/work
// into m_sleepAlarm so wakeUp will tell this function.
//
// Must modify m_sleepAlarm in the same critical section as
// the check for ready work, otherwise we could race with
// another thread enqueueing work and never be awoken.
m_sleepAlarm.first = &alarm;
m_sleepAlarm.second = &work;
sleep = true;
}
}
// Do this here, not above, to avoid a race with the destructor.
if (VL_UNLIKELY(m_exiting.load(std::memory_order_acquire)))
break;
if (VL_UNLIKELY(sleep)) {
// addTask() fills *m_sleepAlarm.second (our 'work') before
// notifying, so 'work' is valid when we wake
alarm.waitForNotification(); // ZZZzzzzz
alarm.reset();
}
if (VL_LIKELY(work.m_fnp)) {
// Execute the task outside any lock, then mark the slot empty
work.m_fnp(work.m_evenCycle, work.m_sym);
work.m_fnp = NULL;
}
}
if (VL_UNLIKELY(m_profiling)) {
m_poolp->tearDownProfilingClientThread();
}
}
// Thread entry point handed to std::thread in the constructor
void VlWorkerThread::startWorker(VlWorkerThread* workerp) {
workerp->workerLoop();
}
//=============================================================================
// VlThreadPool
// Construct a pool of 'nThreads' dedicated worker threads.
// Also sets up a profiling buffer for the calling ("main") thread,
// since it may be donated to run mtasks during eval().
VlThreadPool::VlThreadPool(int nThreads, bool profiling)
    : m_profiling(profiling) {
    // --threads N passes nThreads=N-1, as the "main" thread counts as 1
    unsigned cpus = std::thread::hardware_concurrency();
    // hardware_concurrency() returns 0 when the CPU count is not
    // computable; only warn when we have a real count to compare.
    // Cast avoids a signed/unsigned comparison warning.
    if (cpus != 0 && cpus < static_cast<unsigned>(nThreads+1)) {
        VL_PRINTF_MT("%%Warning: System has %u CPUs but model Verilated with"
                     " --threads %d; may run slow.\n", cpus, nThreads+1);
    }
    // Create'em
    for (int i=0; i<nThreads; ++i) {
        m_workers.push_back(new VlWorkerThread(this, profiling));
    }
    // Set up a profile buffer for the current thread too -- on the
    // assumption that it's the same thread that calls eval and may be
    // donated to run mtasks during the eval.
    if (VL_UNLIKELY(m_profiling)) {
        setupProfilingClientThread();
    }
}
// Destroy the pool: joins and deletes every worker, then frees the
// main thread's profiling buffer (set up in the constructor).
VlThreadPool::~VlThreadPool() {
    // Use the container's size type to avoid a signed/unsigned compare
    for (std::vector<VlWorkerThread*>::size_type i = 0; i < m_workers.size(); ++i) {
        // Each ~WorkerThread will wait for its thread to exit.
        delete m_workers[i];
    }
    if (VL_UNLIKELY(m_profiling)) {
        tearDownProfilingClientThread();
    }
}
// Free the calling thread's profile trace; inverse of
// setupProfilingClientThread().
// NOTE(review): the trace pointer is not removed from m_allProfiles,
// so it looks like profileDump must not run after a thread tears
// down -- confirm against callers.
void VlThreadPool::tearDownProfilingClientThread() {
assert(t_profilep);
delete t_profilep;
t_profilep = NULL;
}
// Allocate and register the calling thread's profile trace.
// Must be called once per executing thread before profileAppend().
void VlThreadPool::setupProfilingClientThread() {
assert(!t_profilep);
t_profilep = new ProfileTrace;
// Reserve some space in the thread-local profiling buffer;
// try not to malloc while collecting profiling.
t_profilep->reserve(4096);
{
// Registration in the shared set needs the lock; the reserve
// above deliberately happens outside it
VerilatedLockGuard lk(m_mutex);
m_allProfiles.insert(t_profilep);
}
}
// Append 'rec' to every registered thread's profile trace (e.g. a
// barrier record); serialized by m_mutex
void VlThreadPool::profileAppendAll(const VlProfileRec& rec) {
VerilatedLockGuard lk(m_mutex);
for (ProfileSet::iterator it = m_allProfiles.begin();
it != m_allProfiles.end(); ++it) {
// Every thread's profile trace gets a copy of rec.
(*it)->emplace_back(rec);
}
}
// Write all collected profile traces to 'filenamep' in VLPROFTHREAD
// text format (consumed by verilator_gantt); serialized by m_mutex
void VlThreadPool::profileDump(const char* filenamep, vluint64_t ticksElapsed) {
VerilatedLockGuard lk(m_mutex);
VL_DEBUG_IF(VL_DBG_MSGF("+prof+threads writing to '%s'\n", filenamep););
FILE* fp = fopen(filenamep, "w");
if (VL_UNLIKELY(!fp)) {
VL_FATAL_MT(filenamep, 0, "", "+prof+threads+file file not writable");
return;
}
// TODO Perhaps merge with verilated_coverage output format, so can
// have a common merging and reporting tool, etc.
// Header: format version, then the arguments/stats the run used
fprintf(fp, "VLPROFTHREAD 1.0 # Verilator thread profile dump version 1.0\n");
fprintf(fp, "VLPROF arg --threads %" VL_PRI64 "u\n",
vluint64_t(m_workers.size()+1));
fprintf(fp, "VLPROF arg +verilator+prof+threads+start+%" VL_PRI64 "u\n",
Verilated::profThreadsStart());
fprintf(fp, "VLPROF arg +verilator+prof+threads+window+%u\n",
Verilated::profThreadsWindow());
fprintf(fp, "VLPROF stat yields %" VL_PRI64 "u\n",
VlNotification::yields());
vluint32_t thread_id = 0;
for (ProfileSet::iterator pit = m_allProfiles.begin();
pit != m_allProfiles.end(); ++pit) {
// Thread ids are 1-based: incremented before the first record
++thread_id;
bool printing = false; // False while in warmup phase
for (ProfileTrace::iterator eit = (*pit)->begin();
eit != (*pit)->end(); ++eit) {
switch (eit->m_type) {
case VlProfileRec::TYPE_BARRIER:
// Barrier marks the end of warmup; records after it are emitted
printing = true;
break;
case VlProfileRec::TYPE_MTASK_RUN:
if (!printing) break;
fprintf(fp, "VLPROF mtask %d"
" start %" VL_PRI64"u end %" VL_PRI64"u elapsed %" VL_PRI64 "u"
" predict_time %u cpu %u on thread %u\n",
eit->m_mtaskId,
eit->m_startTime,
eit->m_endTime,
(eit->m_endTime - eit->m_startTime),
eit->m_predictTime,
eit->m_cpu,
thread_id);
break;
default: assert(false);
break;
}
}
}
fprintf(fp, "VLPROF stat ticks %" VL_PRI64 "u\n",
ticksElapsed);
fclose(fp);
}

313
include/verilated_threads.h Normal file
View File

@ -0,0 +1,313 @@
// -*- mode: C++; c-file-style: "cc-mode" -*-
//=============================================================================
//
// THIS MODULE IS PUBLICLY LICENSED
//
// Copyright 2012-2018 by Wilson Snyder. This program is free software;
// you can redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License Version 2.0.
//
// This is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
//=============================================================================
///
/// \file
/// \brief Thread pool and profiling for Verilated modules
///
//=============================================================================
#ifndef _VERILATED_THREADS_H_
#define _VERILATED_THREADS_H_
#include "verilatedos.h"
#include <atomic>
#include <thread>
#include <vector>
#include <set>
#include <sched.h> // For sched_getcpu()
#include "verilated.h" // for VerilatedMutex and clang annotations
// VlMTaskVertex and VlThreadpool will work with multiple symbol table types.
// Since the type is opaque to VlMTaskVertex and VlThreadPool, represent it
// as a void* here.
typedef void* VlThrSymTab;
/// One-shot notification flag used to put worker threads to sleep and
/// wake them; spin-waits with yield fallback rather than blocking in
/// the kernel.
class VlNotification {
// MEMBERS
std::atomic<bool> m_notified; // Notification pending
static std::atomic<vluint64_t> s_yields; // Statistics
public:
// CONSTRUCTORS
VlNotification()
: m_notified(false) {
assert(atomic_is_lock_free(&m_notified));
}
~VlNotification() {}
// METHODS
// Total times any waiter fell back to std::this_thread::yield()
static vluint64_t yields() { return s_yields; }
// Block until notify() has occurred, then return.
// If notify() has already occurred, return immediately.
//
// This is logically const: the object will remain in notified state
// after waitForNotification() returns, so you could notify more than
// one thread of the same event.
inline void waitForNotification() {
// Spin VL_LOCK_SPINS times, then yield the CPU and repeat
unsigned ct = 0;
while (VL_UNLIKELY(!notified())) {
VL_CPU_RELAX();
ct++;
if (VL_UNLIKELY(ct > VL_LOCK_SPINS)) {
ct = 0;
++s_yields; // Statistics
std::this_thread::yield();
}
}
}
// The 'inline' keyword here means nothing to the compiler, it's
// implicit on methods defined within the class body anyway.
//
// 'inline' is attached to this method, and others in this file,
// to remind humans that some routines in this file are called many
// times per cycle in threaded mode. Such routines should be
// inlinable; that's why they're declared in the .h and not the .cpp.
inline bool notified() {
return m_notified.load(std::memory_order_acquire);
}
// Set notified state. If state is already notified,
// it remains so.
inline void notify() {
m_notified.store(true, std::memory_order_release);
}
// Reset the state to un-notified state, which is also the
// state of a new Notification object.
inline void reset() {
m_notified.store(false, std::memory_order_relaxed);
}
};
typedef void (*VlExecFnp)(bool, VlThrSymTab);
/// Track dependencies for a single MTask.
/// Track dependencies for a single MTask.
class VlMTaskVertex {
// MEMBERS
// On even cycles, _upstreamDepsDone increases as upstream
// dependencies complete. When it reaches _upstreamDepCount,
// this MTaskVertex is ready.
//
// On odd cycles, _upstreamDepsDone decreases as upstream
// dependencies complete, and when it reaches zero this MTaskVertex
// is ready.
//
// An atomic is smaller than a mutex, and lock-free.
//
// (Why does the size of this class matter? If an mtask has many
// downstream mtasks to notify, we hope these will pack into a
// small number of cache lines to reduce the cost of pointer chasing
// during done-notification. Nobody's quantified that cost though.
// If we were really serious about shrinking this class, we could
// use 16-bit types here...)
std::atomic<vluint32_t> m_upstreamDepsDone;
const vluint32_t m_upstreamDepCount;
public:
// CONSTRUCTORS
// 'upstreamDepCount' is the number of upstream MTaskVertex's
// that must notify this MTaskVertex before it will become ready
// to run.
explicit VlMTaskVertex(vluint32_t upstreamDepCount);
~VlMTaskVertex() {}
// Upstream mtasks must call this when they complete.
// Returns true when the current MTaskVertex becomes ready to execute,
// false while it's still waiting on more dependencies.
// Counts up on even cycles, down on odd cycles, so the counter
// never needs resetting between cycles.
inline bool signalUpstreamDone(bool evenCycle) {
if (evenCycle) {
vluint32_t upstreamDepsDone
= 1 + m_upstreamDepsDone.fetch_add(1, std::memory_order_release);
assert(upstreamDepsDone <= m_upstreamDepCount);
return (upstreamDepsDone == m_upstreamDepCount);
} else {
vluint32_t upstreamDepsDone_prev
= m_upstreamDepsDone.fetch_sub(1, std::memory_order_release);
assert(upstreamDepsDone_prev > 0);
return (upstreamDepsDone_prev == 1);
}
}
// True when all upstream dependencies for this cycle have completed
inline bool areUpstreamDepsDone(bool evenCycle) const {
vluint32_t target = evenCycle ? m_upstreamDepCount : 0;
return m_upstreamDepsDone.load(std::memory_order_acquire) == target;
}
// Spin until areUpstreamDepsDone() holds
inline void waitUntilUpstreamDone(bool evenCycle) const {
while (VL_UNLIKELY(!areUpstreamDepsDone(evenCycle))) {
VL_CPU_RELAX();
}
}
};
// Profiling support
/// One profiling event: either an mtask execution (start/end/cpu) or a
/// barrier marking the end of the warmup phase. Appended to per-thread
/// traces with minimal overhead; see VlThreadPool::profileDump.
class VlProfileRec {
protected:
    friend class VlThreadPool;
    enum VlProfileE {
        TYPE_MTASK_RUN,
        TYPE_BARRIER
    };
    VlProfileE m_type;         // Record type
    vluint32_t m_mtaskId;      // Mtask we're logging
    vluint32_t m_predictTime;  // How long scheduler predicted would take
    vluint64_t m_startTime;    // Tick at start of execution
    vluint64_t m_endTime;      // Tick at end of execution
    unsigned m_cpu;            // Execution CPU number (at start anyways)
public:
    class Barrier {};
    // Default-construct an uninitialized record; caller must fill it
    // via startRecord()/endRecord() (see VlThreadPool::profileAppend)
    VlProfileRec() {}
    // Construct a barrier record
    explicit VlProfileRec(Barrier) {
        m_type = TYPE_BARRIER;
        m_mtaskId = 0;
        m_predictTime = 0;
        m_startTime = 0;
        m_endTime = 0;  // Was previously left uninitialized; zero for determinism
        m_cpu = sched_getcpu();
    }
    // Begin logging an mtask execution; 'time' in profiler ticks
    void startRecord(vluint64_t time, uint32_t mtask, uint32_t predict) {
        m_type = VlProfileRec::TYPE_MTASK_RUN;
        m_mtaskId = mtask;
        m_predictTime = predict;
        m_startTime = time;
        m_cpu = sched_getcpu();
    }
    // Finish logging an mtask execution
    void endRecord(vluint64_t time) {
        m_endTime = time;
    }
};
class VlThreadPool;
/// A single pooled worker thread: owns a ready-queue of ExecRecs and a
/// sleep/wake handshake (m_sleepAlarm) with enqueuers.
class VlWorkerThread {
private:
// TYPES
struct ExecRec {
VlExecFnp m_fnp; // Function to execute
VlThrSymTab m_sym; // Symbol table to execute
bool m_evenCycle; // Even/odd for flag alternation
ExecRec() : m_fnp(NULL), m_sym(NULL), m_evenCycle(false) {}
ExecRec(VlExecFnp fnp, bool evenCycle, VlThrSymTab sym)
: m_fnp(fnp), m_sym(sym), m_evenCycle(evenCycle) {}
};
// MEMBERS
VerilatedMutex m_mutex;
// Why a vector? We expect the pending list to be very short, typically
// 0 or 1 or 2, so popping from the front shouldn't be
// expensive. Revisit if we ever have longer queues...
std::vector<ExecRec> m_ready VL_GUARDED_BY(m_mutex);
VlThreadPool* m_poolp; // Our associated thread pool
// If values stored are non-NULL, the thread is asleep pending new
// work. If the thread is not asleep, both parts of m_sleepAlarm must
// be NULL.
std::pair<VlNotification*, ExecRec*> m_sleepAlarm VL_GUARDED_BY(m_mutex);
bool m_profiling; // Is profiling enabled?
std::atomic<bool> m_exiting; // Worker thread should exit
std::thread m_cthread; // Underlying C++ thread record
VL_UNCOPYABLE(VlWorkerThread);
public:
// CONSTRUCTORS
explicit VlWorkerThread(VlThreadPool* poolp, bool profiling);
~VlWorkerThread();
// METHODS
// Pop the front of the ready list into *workp; caller holds m_mutex
inline void dequeWork(ExecRec* workp) VL_REQUIRES(m_mutex) {
// As noted above this is inefficient if our ready list is ever
// long (but it shouldn't be)
*workp = m_ready.front();
m_ready.erase(m_ready.begin());
}
// Clear the sleep handshake and notify the sleeping worker;
// caller holds m_mutex and must have filled *m_sleepAlarm.second first
inline void wakeUp() VL_REQUIRES(m_mutex) {
VlNotification* notifyp = m_sleepAlarm.first;
m_sleepAlarm.first = NULL; // NULL+NULL means wake
m_sleepAlarm.second = NULL;
notifyp->notify();
}
// True if the worker is parked in workerLoop waiting for work
inline bool sleeping() VL_REQUIRES(m_mutex) {
return (m_sleepAlarm.first != NULL);
}
// Enqueue a task; if the worker is asleep, hand the task to it
// directly and wake it
inline void addTask(VlExecFnp fnp, bool evenCycle, VlThrSymTab sym) {
VerilatedLockGuard lk(m_mutex);
m_ready.emplace_back(fnp, evenCycle, sym);
if (VL_LIKELY(sleeping())) { // Generally queue is waiting for work
// Awaken thread
dequeWork(m_sleepAlarm.second);
wakeUp();
}
}
void workerLoop();
static void startWorker(VlWorkerThread* workerp);
};
/// Pool of VlWorkerThreads plus per-thread profiling trace management.
class VlThreadPool {
    // TYPES
    typedef std::vector<VlProfileRec> ProfileTrace;
    typedef std::set<ProfileTrace*> ProfileSet;
    // MEMBERS
    std::vector<VlWorkerThread*> m_workers;  // our workers
    bool m_profiling;  // is profiling enabled?
    // Support profiling -- we can append records of profiling events
    // to this vector with very low overhead, and then dump them out
    // later. This prevents the overhead of printf/malloc/IO from
    // corrupting the profiling data. It's super cheap to append
    // a VlProfileRec struct on the end of a pre-allocated vector;
    // this is the only cost we pay in real-time during a profiling cycle.
    static VL_THREAD_LOCAL ProfileTrace* t_profilep;
    ProfileSet m_allProfiles VL_GUARDED_BY(m_mutex);
    VerilatedMutex m_mutex;
public:
    // CONSTRUCTORS
    // Construct a thread pool with 'nThreads' dedicated threads. The thread
    // pool will create these threads and make them available to execute tasks
    // via this->workerp(index)->addTask(...)
    VlThreadPool(int nThreads, bool profiling);
    ~VlThreadPool();
    // METHODS
    /// Number of pooled workers (excludes the "main" eval thread)
    inline int numThreads() const {
        return static_cast<int>(m_workers.size());
    }
    /// Return worker at 'index'; requires 0 <= index < numThreads()
    inline VlWorkerThread* workerp(int index) {
        assert(index >= 0);
        // Cast avoids a signed/unsigned comparison warning
        assert(index < static_cast<int>(m_workers.size()));
        return m_workers[index];
    }
    /// Append an empty record to this thread's trace and return it for
    /// the caller to fill; requires setupProfilingClientThread() first
    inline VlProfileRec* profileAppend() {
        t_profilep->emplace_back();
        return &(t_profilep->back());
    }
    void profileAppendAll(const VlProfileRec& rec);
    void profileDump(const char* filenamep, vluint64_t ticksElapsed);
    // In profiling mode, each executing thread must call
    // this once to setup profiling state:
    void setupProfilingClientThread();
    void tearDownProfilingClientThread();
private:
    VL_UNCOPYABLE(VlThreadPool);
};
#endif

View File

@ -155,6 +155,221 @@ provided and documented in C<V3GraphAlg.cpp>.
=back
=head2 Multithreaded Mode
In --threads mode, the frontend of the Verilator pipeline is the same as
serial mode, up until V3Order.
V3Order builds a fine-grained, statement-level dependency graph that governs
the ordering of code within a single eval() call. In serial mode, that
dependency graph is used to order all statements into a total serial order.
In parallel mode, the same dependency graph is the starting point for a
partitioner (V3Partition).
The partitioner's goal is to coarsen the fine-grained DAG into a coarser
DAG, while maintaining as much available parallelism as possible. Often the
partitioner can transform an input graph with millions of nodes into a
coarsened execution graph with a few dozen nodes, while maintaining enough
parallelism to take advantage of a modern multicore CPU. Runtime
synchronization cost is not prohibitive with so few nodes.
=head3 Partitioning
Our partitioner is similar to the one Vivek Sarkar described in his 1989
paper "Partitioning and Scheduling Parallel Programs for Multiprocessors".
Let's define some terms:
=over 4
=item C<Par Factor>
The available parallelism or "par-factor" of a DAG is the total cost to
execute all nodes, divided by the cost to execute the longest critical path
through the graph. This is the speedup you would get from running the graph
in parallel, if given infinite CPU cores available and communication and
synchronization are zero.
=item C<Macro Task>
When the partitioner coarsens the graph, it combines nodes together. Each
fine-grained node represents an atomic "task"; combined nodes in the
coarsened graph are "macro-tasks". This term comes from Sarkar. Each
macro-task executes from start to end on one processor, without any
synchronization to any other macro-task during its
execution. (Synchronization only happens before the macro-task begins or
after it ends.)
=item C<Edge Contraction>
Our partitioner, like Sarkar's, primarily relies on "edge contraction" to
coarsen the graph. It starts with one macro-task per atomic task and
iteratively combines pairs of edge-connected macro-tasks.
=item C<Local Critical Path>
Each node in the graph has a "local" critical path. That's the critical
path from the start of the graph to the start of the node, plus the node's
cost, plus the critical path from the end of the node to the end of the
graph.
=back
Sarkar calls out an important trade-off: coarsening the graph reduces
runtime synchronization overhead among the macro-tasks, but it tends to
increase the critical path through the graph and thus reduces par-factor.
Sarkar's partitioner, and ours, chooses pairs of macro-tasks to merge such
that the growth in critical path is minimized. Each candidate merge would
result in a new node, which would have some local critical path. We choose
the candidate that would produce the shortest local critical path. Repeat
until par-factor falls to a target threshold. It's a greedy algorithm, and
it's not guaranteed to produce the best partition (which Sarkar proves is
NP-hard).
=head3 Estimating Logic Costs
To compute the cost of any given path through the graph, Verilator
estimates an execution cost for each task. Each macro-task has an execution
cost which is simply the sum of its tasks' costs. We assume that
communication overhead and synchronization overhead are zero, so the cost
of any given path through the graph is simply the sum of macro-task
execution costs. Sarkar does almost the same thing, except that he has
nonzero estimates for synchronization costs.
Verilator's cost estimates are assigned by the InstrCountCostVisitor. This
class is perhaps the most fragile piece of the multithread implementation.
It's easy to have a bug where you count something cheap (eg. accessing one
element of a huge array) as if it were expensive (eg. by counting it as if
it were an access to the entire array.) Even without such gross bugs, the
estimates this produces are only loosely predictive of actual runtime cost.
Multithreaded performance would be better with better runtime cost
estimates. This is an area to improve.
=head3 Scheduling Macro-Tasks at Runtime
After coarsening the graph, we must schedule the macro-tasks for runtime.
Sarkar describes two options: you can dynamically schedule tasks at
runtime, with a runtime graph follower. Sarkar calls this the
"macro-dataflow model." Verilator does not support this; early experiments
with this approach had poor performance.
The other option is to statically assign macro-tasks to threads, with each
thread running its macro-tasks in a static order. Sarkar describes this in
Chapter 5. Verilator takes this static approach. The only dynamic aspect is
that each macro task may block before starting, to wait until its
prerequisites on other threads have finished.
The synchronization cost is cheap if the prereqs are done. If they're not,
fragmentation (idle CPU cores waiting) is possible. This is the major
source of overhead in this approach. The --prof-threads switch and the
C<verilator_gantt> script can visualize the time lost to such
fragmentation.
=head3 Locating Variables for Best Spatial Locality
After scheduling all code, we attempt to locate variables in memory such
that variables accessed by a single macro-task are close together in
memory. This provides "spatial locality" -- when we pull in a 64-byte
cache line to access a 2-byte variable, we want the other 62 bytes to be
ones we'll also likely access soon, for best cache performance.
This turns out to be critical for performance. It should allow Verilator
to scale to very large models. We don't rely on our working set fitting
in any CPU cache; instead we essentially "stream" data into caches from
memory. It's not literally streaming, where the address increases
monotonically, but it should have similar performance characteristics,
so long as each macro-task's dataset fits in one core's local caches.
To achieve spatial locality, we tag each variable with the set of
macro-tasks that access it. Let's call this set the "footprint" of that
variable. The variables in a given module have a set of footprints. We can
order those footprints to minimize the distance between them (distance is
the number of macro-tasks that are different across any two footprints) and
then emit all variables into the struct in ordered-footprint order.
The footprint ordering is literally the traveling salesman problem, and we
use a TSP-approximation algorithm to get close to an optimal sort.
This is an old idea. Simulators designed at DEC in the early 1990s used
similar techniques to optimize both single-thread and multi-thread modes.
(Verilator does not optimize variable placement for spatial locality in
serial mode; that is a possible area for improvement.)
=head3 Improving Multithreaded Performance Further (a TODO list)
=over 4
=item C<Wave Scheduling>
To allow the verilated model to run in parallel with the testbench, it
might be nice to support "wave" scheduling, in which work on a cycle begins
before eval() is called or continues after eval() returns. For now all
work on a cycle happens during the eval() call, leaving Verilator's threads
idle while the testbench (everything outside eval()) is working. This would
involve fundamental changes within the partitioner, however, it's probably
the best bet for hiding testbench latency.
=item C<Efficient Dynamic Scheduling>
To scale to more than a few threads, we may revisit a fully dynamic
scheduler. For large (>16 core) systems it might make sense to dedicate an
entire core to scheduling, so that scheduler data structures would fit in
its L1 cache and thus the cost of traversing priority-ordered ready lists
would not be prohibitive.
=item C<Static Scheduling with Runtime Repack>
We could modify the static scheduling approach by gathering actual
macro-task execution times at run time, and dynamically re-packing the
macro-tasks into the threads also at run time. Say, re-pack once every
10,000 cycles or something. This has the potential to do better than our
static estimates about macro-task run times. It could potentially react to
CPU cores that aren't performing equally, due to NUMA or thermal throttling
or nonuniform competing memory traffic or whatever.
=item C<Clock Domain Balancing>
Right now Verilator makes no attempt to balance clock domains across
macro-tasks. For a multi-domain model, that could lead to bad gantt chart
fragmentation. This could be improved if it's a real problem in practice.
=item C<Other Forms of MTask Balancing>
The largest source of runtime overhead is idle CPUs, which happens due to
variance between our predicted runtime for each MTask and its actual
runtime. That variance is magnified if MTasks are homogeneous, containing
similar repeating logic which was generally close together in source code
and which is still packed together even after going through Verilator's
digestive tract.
If Verilator could avoid doing that, and instead would take source logic
that was close together and distribute it across MTasks, that would
increase the diversity of any given MTask, and this should reduce variance
in the cost estimates.
One way to do that might be to make various "tie breaker" comparison
routines in the sources to rely more heavily on randomness, and generally
try harder not to keep input nodes together when we have the option to
scramble things.
=item C<Performance Regression>
It would be nice if we had a regression of large designs, with some
diversity of design styles, to test on both single- and multi-threaded
modes. This would help to avoid performance regressions, and also to
evaluate the optimizations while minimizing the impact of parasitic noise.
=item C<Per-Instance Classes>
If we have multiple instances of the same module, and they partition
differently (likely; we make no attempt to partition them the same) then
the variable sort will be suboptimal for either instance. A possible
improvement would be to emit a unique class for each instance of a module,
and sort its variables optimally for that instance's code stream.
=back
=head2 Verilated Flow
The evaluation loop outputted by Verilator is designed to allow a single

View File

@ -64,6 +64,7 @@ sub test {
run("test -e $prefix/bin/verilator");
run("test -e $prefix/bin/verilator_bin");
run("test -e $prefix/bin/verilator_bin_dbg");
run("test -e $prefix/bin/verilator_gantt");
run("test -e $prefix/bin/verilator_profcfunc");
}

View File

@ -217,6 +217,7 @@ RAW_OBJS = \
V3Order.o \
V3Os.o \
V3Param.o \
V3Partition.o \
V3PreShell.o \
V3Premit.o \
V3Reloop.o \

View File

@ -29,16 +29,24 @@
#include <vector>
#include <cmath>
#include <map>
#include VL_INCLUDE_UNORDERED_SET
#include "V3Ast__gen_classes.h" // From ./astgen
// Things like:
// class V3AstNode;
// Forward declarations
class V3Graph;
class ExecMTask;
// Hint class so we can choose constructors
class VFlagLogicPacked {};
class VFlagBitPacked {};
class VFlagChildDType {}; // Used by parser.y to select constructor that sets childDType
// Used as key for another map, needs operator<, hence not an unordered_set
typedef std::set<int> MTaskIdSet; // Set of mtaskIds for Var sorting
//######################################################################
// For broken() function, return error string if have a match

View File

@ -31,6 +31,8 @@
#include "V3Ast.h"
#include "V3File.h"
#include "V3Global.h"
#include "V3Graph.h"
#include "V3PartitionGraph.h" // Just for mtask dumping
//======================================================================
// Special methods
@ -151,22 +153,26 @@ AstNodeBiop* AstEqWild::newTyped(FileLine* fl, AstNode* lhsp, AstNode* rhsp) {
}
}
// Construct; allocates the owned dependency graph (freed in destructor)
AstExecGraph::AstExecGraph(FileLine* fileline)
: AstNode(fileline) {
m_depGraphp = new V3Graph;
}
// Destruct; frees the owned dependency graph allocated in the constructor
AstExecGraph::~AstExecGraph() {
delete m_depGraphp; VL_DANGLING(m_depGraphp);
}
// True if signal is public: explicitly marked, or --public-all set
// (temporaries and genvars are never forced public)
bool AstVar::isSigPublic() const {
return (m_sigPublic || (v3Global.opt.allPublic() && !isTemp() && !isGenVar()));
}
// SystemC quad-word signal (not represented as sc_bv/sc_biguint)
bool AstVar::isScQuad() const {
return (isSc() && isQuad() && !isScBv() && !isScBigUint());
}
// SystemC signal wide enough (or attributed) to use sc_bv
bool AstVar::isScBv() const {
return ((isSc() && width() >= v3Global.opt.pinsBv()) || m_attrScBv);
}
// SystemC signal emitted as sc_uint (2..64 bits, when enabled, not sc_bv)
bool AstVar::isScUint() const {
return ((isSc() && v3Global.opt.pinsScUint() && width() >= 2 && width() <= 64) && !isScBv());
}
// SystemC signal emitted as sc_biguint (65..512 bits, when enabled, not sc_bv)
bool AstVar::isScBigUint() const {
return ((isSc() && v3Global.opt.pinsScBigUint() && width() >= 65 && width() <= 512) && !isScBv());
}
@ -441,6 +447,16 @@ AstVar* AstVar::scVarRecurse(AstNode* nodep) {
return NULL;
}
// Return a debug string listing all mtask ids that access this var
string AstVar::mtasksString() const {
std::ostringstream os;
os<<" all: ";
for (MTaskIdSet::const_iterator it = m_mtaskIds.begin();
it != m_mtaskIds.end(); ++it) {
os<<*it<<" ";
}
return os.str();
}
AstNodeDType* AstNodeDType::dtypeDimensionp(int dimension) {
// dimension passed from AstArraySel::dimension
// Dimension 0 means the VAR itself, 1 is the closest SEL to the AstVar,
@ -970,6 +986,11 @@ void AstSliceSel::dump(std::ostream& str) {
str<<" decl"<<declRange();
}
}
// Dump node plus its associated ExecMTask's info for debug
void AstMTaskBody::dump(std::ostream& str) {
this->AstNode::dump(str);
str<<" ";
m_execMTaskp->dump(str);
}
void AstTypeTable::dump(std::ostream& str) {
this->AstNode::dump(str);
for (int i=0; i<(int)(AstBasicDTypeKwd::_ENUM_MAX); ++i) {

View File

@ -1124,6 +1124,7 @@ private:
bool m_noSubst:1; // Do not substitute out references
bool m_trace:1; // Trace this variable
AstVarAttrClocker m_attrClocker;
MTaskIdSet m_mtaskIds; // MTaskID's that read or write this var
void init() {
m_input=false; m_output=false; m_tristate=false; m_declOutput=false;
@ -1323,6 +1324,10 @@ public:
if (varType()==AstVarType::INPUT || varType()==AstVarType::OUTPUT) m_varType = AstVarType::WIRE;
}
static AstVar* scVarRecurse(AstNode* nodep);
void addProducingMTaskId(int id) { m_mtaskIds.insert(id); }
void addConsumingMTaskId(int id) { m_mtaskIds.insert(id); }
const MTaskIdSet& mtaskIds() const { return m_mtaskIds; }
string mtasksString() const;
};
class AstDefParam : public AstNode {
@ -5698,6 +5703,44 @@ public:
AstNode* bodysp() const { return op1p(); } // op1= expressions to print
};
class AstMTaskBody : public AstNode {
// Hold statements for each MTask
private:
ExecMTask* m_execMTaskp; // Back-pointer to the ExecMTask this body implements
public:
explicit AstMTaskBody(FileLine* flp)
: AstNode(flp)
, m_execMTaskp(NULL) {}
ASTNODE_NODE_FUNCS(MTaskBody);
// Broken until execMTaskp() has been set (it starts NULL)
virtual const char* broken() const { BROKEN_RTN(!m_execMTaskp); return NULL; }
AstNode* stmtsp() const { return op1p(); } // op1 = statement list
void addStmtsp(AstNode* nodep) { addOp1p(nodep); }
ExecMTask* execMTaskp() const { return m_execMTaskp; }
void execMTaskp(ExecMTask* execMTaskp) { m_execMTaskp = execMTaskp; }
virtual void dump(std::ostream& str=std::cout);
};
class AstExecGraph : public AstNode {
// For parallel execution, this node contains a dependency graph. Each
// node in the graph is an ExecMTask, which contains a body for the
// mtask, which contains a set of AstActive's, each of which calls a
// leaf AstCFunc. whew!
//
// The mtask bodies are also children of this node, so we can visit
// them without traversing the graph (it's not always needed to
// traverse the graph.)
private:
V3Graph *m_depGraphp; // contains ExecMTask's; owned (new'ed in ctor, deleted in dtor)
public:
explicit AstExecGraph(FileLine* fileline);
ASTNODE_NODE_FUNCS_NO_DTOR(ExecGraph)
virtual ~AstExecGraph();
// Broken if the owned graph pointer was lost
virtual const char* broken() const { BROKEN_RTN(!m_depGraphp); return NULL; }
const V3Graph* depGraphp() const { return m_depGraphp; }
V3Graph* mutableDepGraphp() { return m_depGraphp; }
void addMTaskBody(AstMTaskBody* bodyp) { addOp1p(bodyp); } // op1 = mtask bodies
};
class AstSplitPlaceholder : public AstNode {
public:
// Dummy node used within V3Split; never exists outside of V3Split.
@ -5749,12 +5792,14 @@ private:
AstTypeTable* m_typeTablep; // Reference to top type table, for faster lookup
AstPackage* m_dollarUnitPkgp;
AstCFunc* m_evalp; // The '_eval' function
AstExecGraph* m_execGraphp; // Execution MTask graph for threads>1 mode
public:
AstNetlist()
: AstNode(new FileLine("AstRoot",0))
, m_typeTablep(NULL)
, m_dollarUnitPkgp(NULL)
, m_evalp(NULL) { }
, m_evalp(NULL)
, m_execGraphp(NULL) { }
ASTNODE_NODE_FUNCS(Netlist)
virtual const char* broken() const {
BROKEN_RTN(m_dollarUnitPkgp && !m_dollarUnitPkgp->brokeExists());
@ -5784,6 +5829,8 @@ public:
return m_dollarUnitPkgp; }
AstCFunc* evalp() const { return m_evalp; }
void evalp(AstCFunc* evalp) { m_evalp = evalp; }
AstExecGraph* execGraphp() const { return m_execGraphp; }
void execGraphp(AstExecGraph* graphp) { m_execGraphp = graphp; }
};
//######################################################################

View File

@ -68,6 +68,7 @@ private:
AstCFunc* m_settleFuncp; // Top settlement function we are creating
AstSenTree* m_lastSenp; // Last sensitivity match, so we can detect duplicates.
AstIf* m_lastIfp; // Last sensitivity if active to add more under
AstMTaskBody* m_mtaskBodyp; // Current mtask body
// METHODS
VL_DEBUG_FUNC; // Declare debug()
@ -338,6 +339,30 @@ private:
// Only empty blocks should be leftover on the non-top. Killem.
if (nodep->stmtsp()) nodep->v3fatalSrc("Non-empty lower active");
nodep->unlinkFrBack()->deleteTree(); VL_DANGLING(nodep);
} else if (m_mtaskBodyp) {
UINFO(4," TR ACTIVE "<<nodep<<endl);
AstNode* stmtsp = nodep->stmtsp()->unlinkFrBackWithNext();
if (nodep->hasClocked()) {
if (nodep->hasInitial()) nodep->v3fatalSrc("Initial block should not have clock sensitivity");
if (m_lastSenp && nodep->sensesp()->sameTree(m_lastSenp)) {
UINFO(4," sameSenseTree\n");
} else {
clearLastSen();
m_lastSenp = nodep->sensesp();
// Make a new if statement
m_lastIfp = makeActiveIf(m_lastSenp);
m_mtaskBodyp->addStmtsp(m_lastIfp);
}
// Move statements to if
m_lastIfp->addIfsp(stmtsp);
} else if (nodep->hasInitial() || nodep->hasSettle()) {
nodep->v3fatalSrc("MTask should not include initial/settle logic.");
} else {
// Combo logic. Move statements to mtask func.
clearLastSen();
m_mtaskBodyp->addStmtsp(stmtsp);
}
nodep->unlinkFrBack()->deleteTree(); VL_DANGLING(nodep);
} else {
UINFO(4," ACTIVE "<<nodep<<endl);
AstNode* stmtsp = nodep->stmtsp()->unlinkFrBackWithNext();
@ -372,6 +397,20 @@ private:
nodep->unlinkFrBack()->deleteTree(); VL_DANGLING(nodep);
}
}
// Process each MTask body under the ExecGraph, then relocate the graph
// into the _eval loop. While m_mtaskBodyp is set, the AstActive visitor
// routes statements into the current mtask body instead of the serial
// eval/initial/settle functions.
virtual void visit(AstExecGraph* nodep) {
for (m_mtaskBodyp = VN_CAST(nodep->op1p(), MTaskBody);
m_mtaskBodyp;
m_mtaskBodyp = VN_CAST(m_mtaskBodyp->nextp(), MTaskBody)) {
// Reset duplicate-sensitivity tracking so an AstIf from one mtask
// body is never reused in another
clearLastSen();
iterate(m_mtaskBodyp);
}
clearLastSen();
// Move the ExecGraph into _eval. Its location marks the
// spot where the graph will execute, relative to other
// (serial) logic in the cycle.
nodep->unlinkFrBack();
addToEvalLoop(nodep);
}
//--------------------
// Default: Just iterate
@ -391,6 +430,7 @@ public:
m_lastSenp = NULL;
m_lastIfp = NULL;
m_scopep = NULL;
m_mtaskBodyp = NULL;
//
iterate(nodep);
// Allow downstream modules to find _eval()

View File

@ -34,6 +34,8 @@
#include "V3EmitC.h"
#include "V3EmitCBase.h"
#include "V3Number.h"
#include "V3PartitionGraph.h"
#include "V3TSP.h"
#define VL_VALUE_STRING_MAX_WIDTH 8192 // We use a static char array in VL_VALUE_STRING
@ -103,7 +105,13 @@ public:
puts("["+cvtToStr(arrayp->elementsConst())+"]");
}
}
void emitVarCmtChg(const AstVar* varp, string* curVarCmtp) {
string newVarCmt = varp->mtasksString();
if (*curVarCmtp != newVarCmt) {
*curVarCmtp = newVarCmt;
puts("// Begin mtask footprint "+*curVarCmtp+"\n");
}
}
void emitTypedefs(AstNode* firstp) {
bool first = true;
for (AstNode* loopp=firstp; loopp; loopp = loopp->nextp()) {
@ -783,6 +791,50 @@ public:
virtual ~EmitCStmts() {}
};
//######################################################################
// Establish mtask variable sort order in mtasks mode
// TSP sort state wrapping one MTaskIdSet footprint. The TSP solver orders
// these states so that variables touched by similar mtask sets end up
// adjacent in memory; cost() is the symmetric set difference between two
// footprints.
class EmitVarTspSorter : public V3TSP::TspStateBase {
private:
// MEMBERS
const MTaskIdSet& m_mtaskIds; // Mtask we're ordering
static unsigned m_serialNext; // Unique ID to establish serial order
unsigned m_serial; // Serial ordering
public:
// CONSTRUCTORS
explicit EmitVarTspSorter(const MTaskIdSet& mtaskIds)
: m_mtaskIds(mtaskIds),
m_serial(++m_serialNext) {}
virtual ~EmitVarTspSorter() {}
// METHODS
// Deterministic tie-break ordering by creation serial number
bool operator<(const TspStateBase& other) const {
return operator<(dynamic_cast<const EmitVarTspSorter&>(other));
}
bool operator<(const EmitVarTspSorter& other) const {
return m_serial < other.m_serial;
}
const MTaskIdSet& mtaskIds() const { return m_mtaskIds; }
virtual int cost(const TspStateBase* otherp) const {
return cost(dynamic_cast<const EmitVarTspSorter*>(otherp));
}
// Symmetric difference of the two footprints: elements only in ours
// plus elements only in theirs
virtual int cost(const EmitVarTspSorter* otherp) const {
int cost = diffs(m_mtaskIds, otherp->m_mtaskIds);
cost += diffs(otherp->m_mtaskIds, m_mtaskIds);
return cost;
}
// Returns the number of elements in set_a that don't appear in set_b
static int diffs(const MTaskIdSet& set_a, const MTaskIdSet& set_b) {
int diffs = 0;
for (MTaskIdSet::iterator it = set_a.begin();
it != set_a.end(); ++it) {
if (set_b.find(*it) == set_b.end()) ++diffs;
}
return diffs;
}
};

unsigned EmitVarTspSorter::m_serialNext = 0;
//######################################################################
// Internal EmitC implementation
@ -873,6 +925,91 @@ class EmitCImp : EmitCStmts {
return ofp;
}
// Returns the number of cross-thread dependencies into mtaskp.
// If >0, mtaskp must test whether its prereqs are done before starting,
// and may need to block.
static uint32_t packedMTaskMayBlock(const ExecMTask* mtaskp) {
uint32_t result = 0;
for (V3GraphEdge* edgep = mtaskp->inBeginp(); edgep; edgep = edgep->inNextp()) {
const ExecMTask* prevp = dynamic_cast<ExecMTask*>(edgep->fromp());
if (prevp->thread() != mtaskp->thread()) {
++result;
}
}
return result;
}
// Emit the C++ body for one mtask: an optional blocking wait on upstream
// cross-thread mtasks, optional per-thread profiling bookkeeping, the
// mtask's statements, downstream notifications, then a tail-recursive emit
// of the next mtask packed on the same thread (or a signal to the fake
// "final" mtask when this is the last one).
void emitMTaskBody(AstMTaskBody* nodep) {
ExecMTask* curExecMTaskp = nodep->execMTaskp();
// Only block if some upstream mtask runs on a different thread
if (packedMTaskMayBlock(curExecMTaskp)) {
puts("vlTOPp->__Vm_mt_" + cvtToStr(curExecMTaskp->id())
+ ".waitUntilUpstreamDone(even_cycle);\n");
}
string recName;
if (v3Global.opt.profThreads()) {
recName = "__Vprfthr_" + cvtToStr(curExecMTaskp->id());
puts("VlProfileRec* " + recName + " = NULL;\n");
// Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
puts("if (VL_UNLIKELY(vlTOPp->__Vm_profile_cycle_start)) {\n");
puts( recName + " = vlTOPp->__Vm_threadPoolp->profileAppend();\n");
puts( recName + "->startRecord(VL_RDTSC_Q() - vlTOPp->__Vm_profile_cycle_start,");
puts( " "+cvtToStr(curExecMTaskp->id())+ ",");
puts( " "+cvtToStr(curExecMTaskp->cost())+");\n");
puts("}\n");
}
puts("Verilated::mtaskId(" + cvtToStr(curExecMTaskp->id()) + ");\n");

// The actual body of calls to leaf functions
iterateAndNextNull(nodep->stmtsp());

if (v3Global.opt.profThreads()) {
// Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
puts("if (VL_UNLIKELY("+recName+")) {\n");
puts( recName + "->endRecord(VL_RDTSC_Q() - vlTOPp->__Vm_profile_cycle_start);\n");
puts("}\n");
}

// Flush message queue
puts("Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);\n");

// For any downstream mtask that's on another thread, bump its
// counter and maybe notify it.
for (V3GraphEdge* edgep = curExecMTaskp->outBeginp();
edgep; edgep = edgep->outNextp()) {
const ExecMTask* nextp = dynamic_cast<ExecMTask*>(edgep->top());
if (nextp->thread() != curExecMTaskp->thread()) {
puts("vlTOPp->__Vm_mt_"+cvtToStr(nextp->id())
+ ".signalUpstreamDone(even_cycle);\n");
}
}

// Run the next mtask inline
const ExecMTask* nextp = curExecMTaskp->packNextp();
if (nextp) {
emitMTaskBody(nextp->bodyp());
} else {
// Unblock the fake "final" mtask
puts("vlTOPp->__Vm_mt_final.signalUpstreamDone(even_cycle);\n");
}
}
// Emit the function definition for a thread-root mtask. The function is
// named after the root mtask; emitMTaskBody() then inlines all the mtasks
// packed onto the same thread into this single function.
virtual void visit(AstMTaskBody* nodep) {
ExecMTask* mtp = nodep->execMTaskp();
puts("\n");
puts("void ");
puts(modClassName(m_modp)+"::"+mtp->cFuncName());
puts("(bool even_cycle, void* symtab) {\n");

// Declare and set vlSymsp
puts(EmitCBaseVisitor::symClassVar() + " = ("
+ EmitCBaseVisitor::symClassName() + "*)symtab;\n");
puts(EmitCBaseVisitor::symTopAssign()+"\n");

emitMTaskBody(nodep);
puts("}\n");
}
//---------------------------------------
// VISITORS
using EmitCStmts::visit; // Suppress hidden overloaded virtual function warning
@ -973,6 +1110,54 @@ class EmitCImp : EmitCStmts {
emitVarReset(varp);
}
// Emit the code, inside _eval(), that launches the mtask graph: flip the
// even/odd cycle flag, dispatch each thread-root mtask (all but the last
// go to the thread pool; the last runs inline on the eval thread), then
// wait on the synthetic "final" mtask until the whole graph is done.
virtual void visit(AstExecGraph* nodep) {
if (nodep != v3Global.rootp()->execGraphp()) {
nodep->v3fatalSrc("ExecGraph should be a singleton!");
}
// The location of the AstExecGraph within the containing _eval()
// function is where we want to invoke the graph and wait for it to
// complete. Do that now.
//
// Don't recurse to children -- this isn't the place to emit
// function definitions for the nested CFuncs. We'll do that at the
// end.
puts("vlTOPp->__Vm_even_cycle = !vlTOPp->__Vm_even_cycle;\n");

// Build the list of initial mtasks to start
std::vector<const ExecMTask*> execMTasks;

// Start each root mtask
for (const V3GraphVertex* vxp = nodep->depGraphp()->verticesBeginp();
vxp; vxp = vxp->verticesNextp()) {
// NOTE(review): dynamic_cast result is used unchecked; presumably
// every vertex in depGraphp is an ExecMTask -- confirm upstream.
const ExecMTask* etp = dynamic_cast<const ExecMTask*>(vxp);
if (etp->threadRoot()) execMTasks.push_back(etp);
}
if (execMTasks.size() >
static_cast<unsigned>(v3Global.opt.threads())) {
nodep->v3fatalSrc("More root mtasks than available threads");
}

if (!execMTasks.empty()) {
for (uint32_t i = 0; i < execMTasks.size(); ++i) {
bool runInline = (i == execMTasks.size() - 1);
if (runInline) {
// The thread calling eval() will run this mtask inline,
// along with its packed successors.
puts(execMTasks[i]->cFuncName()
+ "(vlTOPp->__Vm_even_cycle, vlSymsp);\n");
puts("Verilated::mtaskId(0);\n");
} else {
// The other N-1 go to the thread pool.
puts("vlTOPp->__Vm_threadPoolp->workerp("
+ cvtToStr(i)+")->addTask("
+ execMTasks[i]->cFuncName()
+ ", vlTOPp->__Vm_even_cycle, vlSymsp);\n");
}
}
puts("vlTOPp->__Vm_mt_final.waitUntilUpstreamDone(vlTOPp->__Vm_even_cycle);\n");
}
}
//---------------------------------------
// ACCESSORS
@ -995,6 +1180,8 @@ class EmitCImp : EmitCStmts {
void emitStaticDecl(AstNodeModule* modp);
void emitSettleLoop(const std::string& eval_call, bool initial);
void emitWrapEval(AstNodeModule* modp);
void emitMTaskState();
void emitMTaskVertexCtors(bool* firstp);
void emitInt(AstNodeModule* modp);
void maybeSplit(AstNodeModule* modp);
@ -1534,6 +1721,36 @@ void EmitCImp::emitCoverageDecl(AstNodeModule* modp) {
}
}
// Emit constructor-initializer entries for the per-mtask VlMTaskVertex
// members of the top module: one "__Vm_mt_<id>(<inCt>)" entry for each
// mtask that may block on cross-thread upstream dependencies, plus the
// synthetic "__Vm_mt_final" vertex (which depends on every mtask that has
// no packed successor), the thread pool pointer, optional profiling state,
// and the even/odd cycle flag.
//   firstp - in/out: whether the next emitCtorSep() is the first separator
void EmitCImp::emitMTaskVertexCtors(bool* firstp) {
    AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
    if (!execGraphp) v3Global.rootp()->v3fatalSrc("Should have an execGraphp");
    const V3Graph* depGraphp = execGraphp->depGraphp();

    unsigned finalEdgesInCt = 0;
    for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
         vxp; vxp = vxp->verticesNextp()) {
        const ExecMTask* mtp = dynamic_cast<const ExecMTask*>(vxp);
        // Compute once; the original recomputed packedMTaskMayBlock() in
        // the condition, walking the in-edge list a second time.
        const unsigned edgesInCt = packedMTaskMayBlock(mtp);
        if (edgesInCt > 0) {
            emitCtorSep(firstp);
            puts("__Vm_mt_"+cvtToStr(mtp->id())+"("+cvtToStr(edgesInCt)+")");
        }
        // Each mtask with no packed successor will become a dependency
        // for the final node:
        if (!mtp->packNextp()) ++finalEdgesInCt;
    }

    emitCtorSep(firstp);
    puts("__Vm_mt_final(" + cvtToStr(finalEdgesInCt) + ")");

    // This will flip to 'true' before the start of the 0th cycle.
    emitCtorSep(firstp); puts("__Vm_threadPoolp(NULL)");
    if (v3Global.opt.profThreads()) {
        emitCtorSep(firstp); puts("__Vm_profile_cycle_start(0)");
    }
    emitCtorSep(firstp); puts("__Vm_even_cycle(false)");
}
void EmitCImp::emitCtorImp(AstNodeModule* modp) {
puts("\n");
bool first = true;
@ -1544,6 +1761,9 @@ void EmitCImp::emitCtorImp(AstNodeModule* modp) {
first = false; // VL_CTOR_IMP includes the first ':'
}
emitVarCtors(&first);
if (modp->isTop() && v3Global.opt.mtasks()) {
emitMTaskVertexCtors(&first);
}
puts(" {\n");
emitCellCtors(modp);
emitSensitives();
@ -1556,6 +1776,39 @@ void EmitCImp::emitCtorImp(AstNodeModule* modp) {
putsDecoration("// Reset structure values\n");
puts("_ctor_var_reset();\n");
emitTextSection(AstType::atScCtor);
if (modp->isTop() && v3Global.opt.mtasks()) {
// TODO-- For now each top module creates its own ThreadPool here,
// and deletes it in the destructor. If A and B are each top level
// modules, each creates a separate thread pool. This allows
// A.eval() and B.eval() to run concurrently without any
// interference -- so long as the physical machine has enough cores
// to support both pools and all testbench threads.
//
// In the future, we might want to let the client provide a
// threadpool to the constructor. This would allow two or more
// models to share a single threadpool.
//
// For example: suppose models A and B are each compiled to run on
// 4 threads. The client might create a single thread pool with 3
// threads and pass it to both models. If the client can ensure that
// A.eval() and B.eval() do NOT run concurrently, there will be no
// contention for the threads. This mode is missing for now. (Is
// there demand for such a setup?)
puts("__Vm_threadPoolp = new VlThreadPool("
// Note we create N-1 threads in the thread pool. The thread
// that calls eval() becomes the final Nth thread for the
// duration of the eval call.
+ cvtToStr(v3Global.opt.threads() - 1)
+ ", " + cvtToStr(v3Global.opt.profThreads())
+ ");\n");
if (v3Global.opt.profThreads()) {
puts("__Vm_profile_cycle_start = 0;\n");
puts("__Vm_profile_time_finished = 0;\n");
puts("__Vm_profile_window_ct = 0;");
}
}
puts("}\n");
}
@ -1597,6 +1850,9 @@ void EmitCImp::emitCoverageImp(AstNodeModule* modp) {
void EmitCImp::emitDestructorImp(AstNodeModule* modp) {
puts("\n");
puts(modClassName(modp)+"::~"+modClassName(modp)+"() {\n");
if (modp->isTop() && v3Global.opt.mtasks()) {
puts("delete __Vm_threadPoolp; __Vm_threadPoolp = NULL;\n");
}
emitTextSection(AstType::atScDtor);
if (modp->isTop()) puts("delete __VlSymsp; __VlSymsp=NULL;\n");
puts("}\n");
@ -1796,9 +2052,47 @@ void EmitCImp::emitWrapEval(AstNodeModule* modp) {
if (v3Global.opt.threads() == 1) {
uint32_t mtaskId = 0;
putsDecoration("// MTask "+cvtToStr(mtaskId)+" start\n");
puts("VL_DEBUG_IF(VL_DBG_MSGF(\"MTask starting, mtaskId="+cvtToStr(mtaskId)+"\\n\"););\n");
puts("VL_DEBUG_IF(VL_DBG_MSGF(\"MTask"+cvtToStr(mtaskId)+" starting\\n\"););\n");
puts("Verilated::mtaskId("+cvtToStr(mtaskId)+");\n");
}
if (v3Global.opt.mtasks()
&& v3Global.opt.profThreads()) {
puts("if (VL_UNLIKELY((Verilated::profThreadsStart() != __Vm_profile_time_finished)\n");
puts( " && (VL_TIME_Q() > Verilated::profThreadsStart())\n");
puts( " && (Verilated::profThreadsWindow() >= 1))) {\n");
// Within a profile (either starting, middle, or end)
puts( "if (vlTOPp->__Vm_profile_window_ct == 0) {\n"); // Opening file?
// Start profile on this cycle. We'll capture a window worth, then
// only analyze the next window worth. The idea is that the first window
// capture will hit some cache-cold stuff (eg printf) but it'll be warm
// by the time we hit the second window, we hope.
puts( "vlTOPp->__Vm_profile_cycle_start = VL_RDTSC_Q();\n");
// "* 2" as first half is warmup, second half is collection
puts( "vlTOPp->__Vm_profile_window_ct = Verilated::profThreadsWindow() * 2 + 1;\n");
puts( "}\n");
puts( "--vlTOPp->__Vm_profile_window_ct;\n");
puts( "if (vlTOPp->__Vm_profile_window_ct == (Verilated::profThreadsWindow())) {\n");
// This barrier record in every threads' profile demarcates the
// cache-warm-up cycles before the barrier from the actual profile
// cycles afterward.
puts( "vlTOPp->__Vm_threadPoolp->profileAppendAll(");
puts( "VlProfileRec(VlProfileRec::Barrier()));\n");
puts( "vlTOPp->__Vm_profile_cycle_start = VL_RDTSC_Q();\n");
puts( "}\n");
puts( "else if (vlTOPp->__Vm_profile_window_ct == 0) {\n");
// Ending file.
puts( "vluint64_t elapsed = VL_RDTSC_Q() - vlTOPp->__Vm_profile_cycle_start;\n");
puts( "vlTOPp->__Vm_threadPoolp->profileDump(Verilated::profThreadsFilenamep(), elapsed);\n");
// This turns off the test to enter the profiling code, but still
// allows the user to collect another profile by changing
// profThreadsStart
puts( "__Vm_profile_time_finished = Verilated::profThreadsStart();\n");
puts( "vlTOPp->__Vm_profile_cycle_start = 0;\n");
puts( "}\n");
puts("}\n");
}
emitSettleLoop(
(string("VL_DEBUG_IF(VL_DBG_MSGF(\"+ Clock loop\\n\"););\n")
+ (v3Global.opt.trace() ? "vlSymsp->__Vm_activity = true;\n" : "")
@ -1832,10 +2126,13 @@ void EmitCStmts::emitVarList(AstNode* firstp, EisWhich which, const string& pref
// Put out a list of signal declarations
// in order of 0:clocks, 1:vluint8, 2:vluint16, 4:vluint32, 5:vluint64, 6:wide, 7:arrays
// This aids cache packing and locality
// Largest->smallest reduces the number of pad variables.
// But for now, Smallest->largest makes it more likely a small offset will allow access to the signal.
// TODO: Move this sort to an earlier visitor stage.
//
// Largest->smallest reduces the number of pad variables. Also
// experimented with alternating between large->small and small->large
// on successive Mtask groups, but then when a new mtask gets added may
// cause a huge delta.
//
// TODO: Move this sort to an earlier visitor stage.
VarSortMap varAnonMap;
VarSortMap varNonanonMap;
@ -1891,8 +2188,9 @@ void EmitCStmts::emitVarList(AstNode* firstp, EisWhich which, const string& pref
void EmitCStmts::emitVarSort(const VarSortMap& vmap, VarVec* sortedp) {
UASSERT(sortedp->empty(), "Sorted should be initially empty");
{
// Plain old serial mode. Sort by size, from small to large.
if (!v3Global.opt.mtasks()) {
// Plain old serial mode. Sort by size, from small to large,
// to optimize for both packing and small offsets in code.
for (VarSortMap::const_iterator it = vmap.begin();
it != vmap.end(); ++it) {
for (VarVec::const_iterator jt = it->second.begin();
@ -1900,12 +2198,52 @@ void EmitCStmts::emitVarSort(const VarSortMap& vmap, VarVec* sortedp) {
sortedp->push_back(*jt);
}
}
return;
}
// MacroTask mode. Sort by MTask-affinity group first, size second.
typedef std::map<MTaskIdSet, VarSortMap> MTaskVarSortMap;
MTaskVarSortMap m2v;
for (VarSortMap::const_iterator it = vmap.begin(); it != vmap.end(); ++it) {
int size_class = it->first;
const VarVec& vec = it->second;
for (VarVec::const_iterator jt = vec.begin(); jt != vec.end(); ++jt) {
const AstVar* varp = *jt;
m2v[varp->mtaskIds()][size_class].push_back(varp);
}
}
// Create a TSP sort state for each MTaskIdSet footprint
V3TSP::StateVec states;
for (MTaskVarSortMap::iterator it = m2v.begin(); it != m2v.end(); ++it) {
states.push_back(new EmitVarTspSorter(it->first));
}
// Do the TSP sort
V3TSP::StateVec sorted_states;
V3TSP::tspSort(states, &sorted_states);
for (V3TSP::StateVec::iterator it = sorted_states.begin();
it != sorted_states.end(); ++it) {
const EmitVarTspSorter* statep = dynamic_cast<const EmitVarTspSorter*>(*it);
const VarSortMap& localVmap = m2v[statep->mtaskIds()];
// use rbegin/rend to sort size large->small
for (VarSortMap::const_reverse_iterator jt = localVmap.rbegin();
jt != localVmap.rend(); ++jt) {
const VarVec& vec = jt->second;
for (VarVec::const_iterator kt = vec.begin();
kt != vec.end(); ++kt) {
sortedp->push_back(*kt);
}
}
delete statep; VL_DANGLING(statep);
}
}
void EmitCStmts::emitSortedVarList(const VarVec& anons,
const VarVec& nonanons,
const string& prefixIfImp) {
string curVarCmt = "";
// Output anons
{
int anonMembers = anons.size();
@ -1933,6 +2271,7 @@ void EmitCStmts::emitSortedVarList(const VarVec& anons,
if (anonL1s != 1) puts("struct {\n");
for (int l0=0; l0<lim && it != anons.end(); ++l0) {
const AstVar* varp = *it;
emitVarCmtChg(varp, &curVarCmt);
emitVarDecl(varp, prefixIfImp);
++it;
}
@ -1945,12 +2284,14 @@ void EmitCStmts::emitSortedVarList(const VarVec& anons,
// Leftovers, just in case off by one error somewhere above
for (; it != anons.end(); ++it) {
const AstVar* varp = *it;
emitVarCmtChg(varp, &curVarCmt);
emitVarDecl(varp, prefixIfImp);
}
}
// Output nonanons
for (VarVec::const_iterator it = nonanons.begin(); it != nonanons.end(); ++it) {
const AstVar* varp = *it;
emitVarCmtChg(varp, &curVarCmt);
emitVarDecl(varp, prefixIfImp);
}
}
@ -1986,6 +2327,59 @@ void EmitCImp::emitIntFuncDecls(AstNodeModule* modp) {
if (funcp->ifdef()!="") puts("#endif // "+funcp->ifdef()+"\n");
}
}
if (modp->isTop() && v3Global.opt.mtasks()) {
// Emit the mtask func prototypes.
AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
if (!execGraphp) v3Global.rootp()->v3fatalSrc("Root should have an execGraphp");
const V3Graph* depGraphp = execGraphp->depGraphp();
for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
vxp; vxp = vxp->verticesNextp()) {
const ExecMTask* mtp = dynamic_cast<const ExecMTask*>(vxp);
if (mtp->threadRoot()) {
// Emit function declaration for this mtask
ofp()->putsPrivate(true);
puts("static void "); puts(mtp->cFuncName());
puts("(bool even_cycle, void* symtab);\n");
}
}
// No AstCFunc for this one, as it's synthetic. Just write it:
puts("static void __Vmtask__final(bool even_cycle, void* symtab);\n");
}
}
// Emit the private member declarations the top module needs for
// multithreaded execution: a VlMTaskVertex per mtask that may block, the
// synthetic final vertex, the thread pool pointer, optional thread-profiling
// counters, and the even/odd cycle flag.
void EmitCImp::emitMTaskState() {
ofp()->putsPrivate(true);
AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
if (!execGraphp) v3Global.rootp()->v3fatalSrc("Root should have an execGraphp");

const V3Graph* depGraphp = execGraphp->depGraphp();
for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
vxp; vxp = vxp->verticesNextp()) {
const ExecMTask* mtp = dynamic_cast<const ExecMTask*>(vxp);
// Only mtasks that can block need a vertex to wait on
if (packedMTaskMayBlock(mtp) > 0) {
puts("VlMTaskVertex __Vm_mt_" + cvtToStr(mtp->id()) + ";\n");
}
}
// This fake mtask depends on all the real ones. We use it to block
// eval() until all mtasks are done.
//
// In the future we might allow _eval() to return before the graph is
// fully done executing, for "half wave" scheduling. For now we wait
// for all mtasks though.
puts("VlMTaskVertex __Vm_mt_final;\n");
puts("VlThreadPool* __Vm_threadPoolp;\n");

if (v3Global.opt.profThreads()) {
// rdtsc() at current cycle start
puts("vluint64_t __Vm_profile_cycle_start;\n");
// Time we finished analysis
puts("vluint64_t __Vm_profile_time_finished;\n");
// Track our position in the cache warmup and actual profile window
puts("vluint32_t __Vm_profile_window_ct;\n");
}

puts("bool __Vm_even_cycle;\n");
}
void EmitCImp::emitInt(AstNodeModule* modp) {
@ -2000,6 +2394,9 @@ void EmitCImp::emitInt(AstNodeModule* modp) {
} else {
puts("#include \"verilated.h\"\n");
}
if (v3Global.opt.mtasks()) {
puts("#include \"verilated_threads.h\"\n");
}
if (v3Global.opt.savable()) {
puts("#include \"verilated_save.h\"\n");
}
@ -2084,6 +2481,9 @@ void EmitCImp::emitInt(AstNodeModule* modp) {
puts("bool __Vm_inhibitSim; ///< Set true to disable evaluation of module\n");
}
}
if (modp->isTop() && v3Global.opt.mtasks()) {
emitMTaskState();
}
emitCoverageDecl(modp); // may flip public/private
puts("\n// PARAMETERS\n");
@ -2291,6 +2691,24 @@ void EmitCImp::main(AstNodeModule* modp, bool slow, bool fast) {
}
}
if (fast && modp->isTop() && v3Global.opt.mtasks()) {
// Make a final pass and emit function definitions for the mtasks
// in the ExecGraph
AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
const V3Graph* depGraphp = execGraphp->depGraphp();
for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
vxp; vxp = vxp->verticesNextp()) {
const ExecMTask* mtaskp = dynamic_cast<const ExecMTask*>(vxp);
if (mtaskp->threadRoot()) {
maybeSplit(modp);
// Only define one function for all the mtasks packed on
// a given thread. We'll name this function after the
// root mtask though it contains multiple mtasks' worth
// of logic.
iterate(mtaskp->bodyp());
}
}
}
delete m_ofp; m_ofp=NULL;
}

View File

@ -94,6 +94,9 @@ public:
putMakeClassEntry(of, "verilated_vcd_sc.cpp");
}
}
if (v3Global.opt.mtasks()) {
putMakeClassEntry(of, "verilated_threads.cpp");
}
}
else if (support==2 && slow) {
}

View File

@ -131,7 +131,7 @@ public:
"ALWCOMBORDER", "ASSIGNDLY", "ASSIGNIN",
"BLKANDNBLK", "BLKLOOPINIT", "BLKSEQ", "BSSPACE",
"CASEINCOMPLETE", "CASEOVERLAP", "CASEWITHX", "CASEX", "CDCRSTLOGIC", "CLKDATA",
"CMPCONST", "COLONPLUS", "COMBDLY", "DEFPARAM", "DECLFILENAME",
"CMPCONST", "COLONPLUS", "COMBDLY", "DEFPARAM", "DECLFILENAME",
"ENDLABEL", "GENCLK",
"IFDEPTH", "IMPERFECTSCH", "IMPLICIT", "IMPURE",
"INCABSPATH", "INFINITELOOP", "INITIALDLY",

View File

@ -37,6 +37,8 @@
#include VL_INCLUDE_UNORDERED_MAP
#include "V3Global.h"
#include "V3PartitionGraph.h"
#include "V3GraphPathChecker.h"
#include "V3LifePost.h"
#include "V3Stats.h"
#include "V3Ast.h"
@ -78,6 +80,11 @@ private:
iterate(nodep->funcp());
}
}
// Visitor override: the mtask bodies are children of the ExecGraph node.
virtual void visit(AstExecGraph* nodep) {
// Can just iterate across the MTask bodies in any order.  Order
// isn't important for LifePostElimVisitor's simple substitution.
iterateChildren(nodep);
}
virtual void visit(AstCFunc* nodep) {
if (!m_tracingCall && !nodep->entryPoint()) return;
m_tracingCall = false;
@ -101,11 +108,17 @@ public:
// and a sequence number within the mtask:
// Location of a statement: the mtask that executes it (NULL for serial
// code) and a sequence number within the mtask. The span previously
// carried leftover pre-change constructors (a default ctor that left
// mtaskp uninitialized and a one-arg uint32_t ctor) alongside the new
// ones -- duplicate/conflicting declarations; only the coherent new
// version is kept.
struct LifeLocation {
    const ExecMTask* mtaskp;  // Executing mtask, or NULL for serial code
    uint32_t sequence;        // Order within the mtask (or serial stream)
public:
    LifeLocation() : mtaskp(NULL), sequence(0) {}
    LifeLocation(const ExecMTask* mtaskp_, uint32_t sequence_)
        : mtaskp(mtaskp_), sequence(sequence_) {}
    // Strict weak ordering: primarily by mtask id (NULL ranks as id 0),
    // secondarily by sequence number.
    bool operator< (const LifeLocation& b) const {
        const unsigned a_id = mtaskp ? mtaskp->id() : 0;
        const unsigned b_id = b.mtaskp ? b.mtaskp->id() : 0;
        if (a_id != b_id) return a_id < b_id;
        return sequence < b.sequence;
    }
};
@ -130,6 +143,9 @@ private:
// STATE
uint32_t m_sequence; // Sequence number of assigns/varrefs,
// // local to the current MTask.
const ExecMTask* m_execMTaskp; // Current ExecMTask being processed,
// // or NULL for serial code.
V3Double0 m_statAssnDel; // Statistic tracking
bool m_tracingCall; // Currently tracing a CCall to a CFunc
@ -143,11 +159,15 @@ private:
typedef vl_unordered_map<const AstVarScope*, LifePostLocation> PostLocMap;
PostLocMap m_assignposts; // AssignPost dly var locations
const V3Graph* m_mtasksGraphp; // Mtask tracking graph
vl_unique_ptr<GraphPathChecker> m_checker;
// METHODS
VL_DEBUG_FUNC; // Declare debug()
static bool before(const LifeLocation& a, const LifeLocation& b) {
return a.sequence < b.sequence;
bool before(const LifeLocation& a, const LifeLocation& b) {
if (a.mtaskp == b.mtaskp) return a.sequence < b.sequence;
return m_checker->pathExistsFrom(a.mtaskp, b.mtaskp);
}
bool outsideCriticalArea(LifeLocation loc,
const std::set<LifeLocation>& dlyVarAssigns,
@ -159,6 +179,13 @@ private:
// Otherwise, loc could fall in the "critical" area where the
// substitution affects the result of the operation at loc, so
// return false.
if (!loc.mtaskp && assignPostLoc.mtaskp) {
// This is threaded mode; 'loc' is something that happens at
// initial/settle time, or perhaps in _eval() but outside of
// the mtask graph.
// In either case, it's not in the critical area.
return true;
}
if (before(assignPostLoc, loc)) return true;
for (std::set<LifeLocation>::iterator it = dlyVarAssigns.begin();
it != dlyVarAssigns.end(); ++it) {
@ -239,6 +266,17 @@ private:
// within the mtask) where each varscope is read, and written.
iterateChildren(nodep);
if (v3Global.opt.mtasks()) {
if (!m_mtasksGraphp) {
nodep->v3fatalSrc("Should have initted m_mtasksGraphp by now");
}
m_checker.reset(new GraphPathChecker(m_mtasksGraphp));
} else {
if (m_mtasksGraphp) {
nodep->v3fatalSrc("Did not expect any m_mtasksGraphp in serial mode");
}
}
// Find all assignposts. Determine which ones can be
// eliminated. Remove those, and mark their dly vars' user4 field
// to indicate we should replace these dly vars with their original
@ -252,7 +290,8 @@ private:
// Consumption/generation of a variable,
AstVarScope* vscp = nodep->varScopep();
if (!vscp) nodep->v3fatalSrc("Scope not assigned");
LifeLocation loc(++m_sequence);
LifeLocation loc(m_execMTaskp, ++m_sequence);
if (nodep->lvalue()) {
m_writes[vscp].insert(loc);
} else {
@ -275,7 +314,7 @@ private:
if (m_assignposts.find(dlyVarp) != m_assignposts.end()) {
nodep->v3fatalSrc("LifePostLocation attempted duplicate dlyvar map addition");
}
LifeLocation loc(++m_sequence);
LifeLocation loc(m_execMTaskp, ++m_sequence);
m_assignposts[dlyVarp] = LifePostLocation(loc, nodep);
}
}
@ -291,6 +330,18 @@ private:
iterate(nodep->funcp());
}
}
// Record read/write locations inside each mtask body. The sequence counter
// restarts at zero per mtask, and m_execMTaskp tags every LifeLocation
// created while iterating that body.
virtual void visit(AstExecGraph* nodep) {
// Treat the ExecGraph like a call to each mtask body
m_mtasksGraphp = nodep->depGraphp();
for (V3GraphVertex* mtaskVxp = m_mtasksGraphp->verticesBeginp();
mtaskVxp; mtaskVxp = mtaskVxp->verticesNextp()) {
ExecMTask* mtaskp = dynamic_cast<ExecMTask*>(mtaskVxp);
m_execMTaskp = mtaskp;
m_sequence = 0;
iterate(mtaskp->bodyp());
}
// Back to serial code (NULL mtask) after the graph
m_execMTaskp = NULL;
}
virtual void visit(AstCFunc* nodep) {
if (!m_tracingCall && !nodep->entryPoint()) return;
m_tracingCall = false;
@ -305,7 +356,9 @@ public:
// CONSTRUCTORS
explicit LifePostDlyVisitor(AstNetlist* nodep)
: m_sequence(0)
, m_tracingCall(false) {
, m_execMTaskp(NULL)
, m_tracingCall(false)
, m_mtasksGraphp(NULL) {
iterate(nodep);
}
virtual ~LifePostDlyVisitor() {

View File

@ -661,6 +661,9 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
else if ( !strcmp (sw, "-debug-abort") ) { abort(); } // Undocumented, see also --debug-sigsegv
else if ( onoff (sw, "-debug-check", flag/*ref*/) ){ m_debugCheck = flag; }
else if ( onoff (sw, "-debug-leak", flag/*ref*/) ){ m_debugLeak = flag; }
else if ( onoff (sw, "-debug-nondeterminism", flag/*ref*/) ){ m_debugNondeterminism = flag; }
else if ( onoff (sw, "-debug-partition", flag/*ref*/) ){ m_debugPartition = flag; } // Undocumented
else if ( onoff (sw, "-debug-self-test", flag/*ref*/) ){ m_debugSelfTest = flag; } // Undocumented
else if ( !strcmp (sw, "-debug-sigsegv") ) { throwSigsegv(); } // Undocumented, see also --debug-abort
else if ( !strcmp (sw, "-debug-fatalsrc") ) { v3fatalSrc("--debug-fatal-src"); } // Undocumented, see also --debug-abort
else if ( onoff (sw, "-decoration", flag/*ref*/) ) { m_decoration = flag; }
@ -678,6 +681,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
else if ( !strcmp (sw, "-private") ) { m_public = false; }
else if ( onoff (sw, "-prof-cfuncs", flag/*ref*/) ) { m_profCFuncs = flag; }
else if ( onoff (sw, "-profile-cfuncs", flag/*ref*/) ) { m_profCFuncs = flag; } // Undocumented, for backward compat
else if ( onoff (sw, "-prof-threads", flag/*ref*/) ) { m_profThreads = flag; }
else if ( onoff (sw, "-public", flag/*ref*/) ) { m_public = flag; }
else if ( !strncmp(sw, "-pvalue+", strlen("-pvalue+"))) { addParameter(string(sw+strlen("-pvalue+")), false); }
else if ( onoff (sw, "-relative-cfuncs", flag/*ref*/) ) { m_relativeCFuncs = flag; }
@ -689,6 +693,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
else if ( onoff (sw, "-stats", flag/*ref*/) ) { m_stats = flag; }
else if ( onoff (sw, "-stats-vars", flag/*ref*/) ) { m_statsVars = flag; m_stats |= flag; }
else if ( !strcmp (sw, "-sv") ) { m_defaultLanguage = V3LangCode::L1800_2005; }
else if ( onoff (sw, "-threads-coarsen", flag/*ref*/)) { m_threadsCoarsen = flag; } // Undocumented, debug
else if ( onoff (sw, "-trace", flag/*ref*/) ) { m_trace = flag; }
else if ( onoff (sw, "-trace-dups", flag/*ref*/) ) { m_traceDups = flag; }
else if ( onoff (sw, "-trace-params", flag/*ref*/) ) { m_traceParams = flag; }
@ -1013,6 +1018,20 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
shift; m_threads = atoi(argv[i]);
if (m_threads < 0) fl->v3fatal("--threads must be >= 0: "<<argv[i]);
}
else if ( !strcmp (sw, "-threads-dpi") && (i+1)<argc) {
shift;
if (!strcmp(argv[i], "all")) { m_threadsDpiPure=true; m_threadsDpiUnpure=true; }
else if (!strcmp(argv[i], "none")) { m_threadsDpiPure=false; m_threadsDpiUnpure=false; }
else if (!strcmp(argv[i], "pure")) { m_threadsDpiPure=true; m_threadsDpiUnpure=false; }
else {
fl->v3fatal("Unknown setting for --threads-dpi: "<<argv[i]);
}
}
else if ( !strcmp (sw, "-threads-max-mtasks") ) {
shift; m_threadsMaxMTasks = atoi(argv[i]);
if (m_threadsMaxMTasks < 1)
fl->v3fatal("--threads-max-mtasks must be >= 1: "<<argv[i]);
}
else if ( !strcmp (sw, "-top-module") && (i+1)<argc ) {
shift; m_topModule = argv[i];
}
@ -1223,6 +1242,9 @@ V3Options::V3Options() {
m_coverageUser = false;
m_debugCheck = false;
m_debugLeak = true;
m_debugNondeterminism = false;
m_debugPartition = false;
m_debugSelfTest = false;
m_decoration = true;
m_exe = false;
m_ignc = false;
@ -1237,6 +1259,7 @@ V3Options::V3Options() {
m_pinsScBigUint = false;
m_pinsUint8 = false;
m_profCFuncs = false;
m_profThreads = false;
m_preprocOnly = false;
m_preprocNoLine = false;
m_public = false;
@ -1249,6 +1272,10 @@ V3Options::V3Options() {
m_statsVars = false;
m_systemC = false;
m_threads = 0;
m_threadsDpiPure = true;
m_threadsDpiUnpure = false;
m_threadsCoarsen = true;
m_threadsMaxMTasks = 0;
m_trace = false;
m_traceDups = false;
m_traceParams = true;

View File

@ -75,7 +75,10 @@ class V3Options {
bool m_coverageUnderscore;// main switch: --coverage-underscore
bool m_coverageUser; // main switch: --coverage-func
bool m_debugCheck; // main switch: --debug-check
bool m_debugLeak; // main switch: --debug-leak
bool m_debugLeak; // main switch: --debug-leak
bool m_debugNondeterminism; // main switch: --debug-nondeterminism
bool m_debugPartition; // main switch: --debug-partition
bool m_debugSelfTest; // main switch: --debug-self-test
bool m_decoration; // main switch: --decoration
bool m_exe; // main switch: --exe
bool m_ignc; // main switch: --ignc
@ -87,6 +90,7 @@ class V3Options {
bool m_pinsScBigUint;// main switch: --pins-sc-biguint
bool m_pinsUint8; // main switch: --pins-uint8
bool m_profCFuncs; // main switch: --prof-cfuncs
bool m_profThreads; // main switch: --prof-threads
bool m_public; // main switch: --public
bool m_relativeCFuncs; // main switch: --relative-cfuncs
bool m_relativeIncludes; // main switch: --relative-includes
@ -96,6 +100,9 @@ class V3Options {
bool m_skipIdentical;// main switch: --skip-identical
bool m_stats; // main switch: --stats
bool m_statsVars; // main switch: --stats-vars
bool m_threadsCoarsen; // main switch: --threads-coarsen
bool m_threadsDpiPure; // main switch: --threads-dpi all/pure
bool m_threadsDpiUnpure; // main switch: --threads-dpi all
bool m_trace; // main switch: --trace
bool m_traceDups; // main switch: --trace-dups
bool m_traceParams; // main switch: --trace-params
@ -117,6 +124,7 @@ class V3Options {
int m_outputSplitCTrace;// main switch: --output-split-ctrace
int m_pinsBv; // main switch: --pins-bv
int m_threads; // main switch: --threads (0 == --no-threads)
int m_threadsMaxMTasks; // main switch: --threads-max-mtasks
int m_traceDepth; // main switch: --trace-depth
int m_traceMaxArray;// main switch: --trace-max-array
int m_traceMaxWidth;// main switch: --trace-max-width
@ -232,8 +240,14 @@ class V3Options {
bool coverageUser() const { return m_coverageUser; }
bool debugCheck() const { return m_debugCheck; }
bool debugLeak() const { return m_debugLeak; }
bool debugNondeterminism() const { return m_debugNondeterminism; }
bool debugPartition() const { return m_debugPartition; }
bool debugSelfTest() const { return m_debugSelfTest; }
bool decoration() const { return m_decoration; }
bool exe() const { return m_exe; }
bool threadsDpiPure() const { return m_threadsDpiPure; }
bool threadsDpiUnpure() const { return m_threadsDpiUnpure; }
bool threadsCoarsen() const { return m_threadsCoarsen; }
bool trace() const { return m_trace; }
bool traceDups() const { return m_traceDups; }
bool traceParams() const { return m_traceParams; }
@ -246,6 +260,7 @@ class V3Options {
bool pinsScBigUint() const { return m_pinsScBigUint; }
bool pinsUint8() const { return m_pinsUint8; }
bool profCFuncs() const { return m_profCFuncs; }
bool profThreads() const { return m_profThreads; }
bool allPublic() const { return m_public; }
bool lintOnly() const { return m_lintOnly; }
bool ignc() const { return m_ignc; }
@ -267,6 +282,7 @@ class V3Options {
int outputSplitCTrace() const { return m_outputSplitCTrace; }
int pinsBv() const { return m_pinsBv; }
int threads() const { return m_threads; }
int threadsMaxMTasks() const { return m_threadsMaxMTasks; }
bool mtasks() const { return (m_threads > 1); }
int traceDepth() const { return m_traceDepth; }
int traceMaxArray() const { return m_traceMaxArray; }

View File

@ -89,19 +89,22 @@
#include <sstream>
#include <memory>
#include "V3Global.h"
#include "V3File.h"
#include "V3Ast.h"
#include "V3Const.h"
#include "V3EmitCBase.h"
#include "V3EmitV.h"
#include "V3File.h"
#include "V3Global.h"
#include "V3Graph.h"
#include "V3GraphStream.h"
#include "V3List.h"
#include "V3Partition.h"
#include "V3PartitionGraph.h"
#include "V3SenTree.h"
#include "V3Stats.h"
#include "V3EmitCBase.h"
#include "V3Const.h"
#include "V3Order.h"
#include "V3OrderGraph.h"
#include "V3EmitV.h"
#include VL_INCLUDE_UNORDERED_MAP
#include VL_INCLUDE_UNORDERED_SET
@ -423,10 +426,15 @@ class ProcessMoveBuildGraph {
// OrderVisitor. It produces a slightly coarsened graph to drive the
// code scheduling.
//
// * The new graph contains nodes of type OrderMoveVertex.
// * For the serial code scheduler, the new graph contains
// nodes of type OrderMoveVertex.
//
// * For the threaded code scheduler, the new graph contains
// nodes of type MTaskMoveVertex.
//
// * The difference in output type is abstracted away by the
// 'T_MoveVertex' template parameter.
// 'T_MoveVertex' template parameter; ProcessMoveBuildGraph otherwise
// works the same way for both cases.
// TYPES
typedef std::pair<const V3GraphVertex*, const AstSenTree*> VxDomPair;
@ -563,7 +571,7 @@ private:
};
//######################################################################
// OrderMoveVertexMaker
// OrderMoveVertexMaker and related
class OrderMoveVertexMaker
: public ProcessMoveBuildGraph<OrderMoveVertex>::MoveVertexMaker {
@ -595,6 +603,64 @@ private:
VL_UNCOPYABLE(OrderMoveVertexMaker);
};
// Vertex factory for the threaded scheduler's move graph: produces
// MTaskMoveVertex's, skipping initial/settle logic (which is emitted
// serially at time zero rather than partitioned into mtasks).
class OrderMTaskMoveVertexMaker
    : public ProcessMoveBuildGraph<MTaskMoveVertex>::MoveVertexMaker {
    V3Graph* m_pomGraphp;  // Graph that owns the created vertices
public:
    explicit OrderMTaskMoveVertexMaker(V3Graph* pomGraphp)
        : m_pomGraphp(pomGraphp) {}
    // Create a vertex for the given logic/var node, or return NULL
    // to exclude it from the mtasks graph.
    MTaskMoveVertex* makeVertexp(OrderLogicVertex* lvertexp,
                                 const OrderEitherVertex* varVertexp,
                                 const AstScope* scopep,
                                 const AstSenTree* domainp) {
        // Exclude initial/settle logic from the mtasks graph.
        // We'll output time-zero logic separately.
        if (domainp->hasInitial()) return NULL;
        if (domainp->hasSettle()) return NULL;
        return new MTaskMoveVertex(m_pomGraphp, lvertexp, varVertexp,
                                   scopep, domainp);
    }
    void freeVertexp(MTaskMoveVertex* freeMep) {
        freeMep->unlinkDelete(m_pomGraphp);
    }
private:
    VL_UNCOPYABLE(OrderMTaskMoveVertexMaker);
};
// Sort predicate for GraphStream: orders vertices first by domain, then by
// scope. Pointer values are mapped through a PartPtrIdMap to stable serial
// IDs (assigned in order of first appearance), so the resulting order is
// deterministic across runs rather than dependent on allocator addresses.
class OrderVerticesByDomainThenScope {
    PartPtrIdMap m_ids;  // Pointer -> stable serial ID, built lazily during sorting
public:
    // Returns true if lhsp sorts before rhsp.
    // NOTE(review): both vertices are assumed to be MTaskMoveVertex's; the
    // dynamic_cast results are dereferenced without a NULL check — confirm
    // callers only use this on MTaskMoveVertex graphs.
    virtual bool operator()(const V3GraphVertex* lhsp,
                            const V3GraphVertex* rhsp) const {
        const MTaskMoveVertex* l_vxp = dynamic_cast<const MTaskMoveVertex*>(lhsp);
        const MTaskMoveVertex* r_vxp = dynamic_cast<const MTaskMoveVertex*>(rhsp);
        // Primary key: domain ID
        vluint64_t l_id = m_ids.findId(l_vxp->domainp());
        vluint64_t r_id = m_ids.findId(r_vxp->domainp());
        if (l_id < r_id) return true;
        if (l_id > r_id) return false;
        // Tiebreaker: scope ID
        l_id = m_ids.findId(l_vxp->scopep());
        r_id = m_ids.findId(r_vxp->scopep());
        return l_id < r_id;
    }
};
// Sort predicate for GraphStream: orders mtask vertices by their unique
// serial ID, giving a deterministic emission order.
class MTaskVxIdLessThan {
public:
    MTaskVxIdLessThan() {}
    virtual ~MTaskVxIdLessThan() {}
    // Sort vertex's, which must be AbstractMTask's, into a deterministic
    // order by comparing their serial IDs.
    virtual bool operator()(const V3GraphVertex* lhsp,
                            const V3GraphVertex* rhsp) const {
        // Cast to the documented base type, AbstractMTask, rather than the
        // narrower AbstractLogicMTask: any mtask vertex has an id(), so the
        // comparator works on all AbstractMTask graphs as the contract says.
        const AbstractMTask* lmtaskp =
            dynamic_cast<const AbstractMTask*>(lhsp);
        const AbstractMTask* rmtaskp =
            dynamic_cast<const AbstractMTask*>(rhsp);
        return lmtaskp->id() < rmtaskp->id();
    }
};
//######################################################################
// Order class functions
@ -701,6 +767,7 @@ private:
void processDomainsIterate(OrderEitherVertex* vertexp);
void processEdgeReport();
// processMove* routines schedule serial execution
void processMove();
void processMoveClear();
void processMoveBuildGraph();
@ -711,6 +778,18 @@ private:
AstActive* processMoveOneLogic(const OrderLogicVertex* lvertexp,
AstCFunc*& newFuncpr, int& newStmtsr);
// processMTask* routines schedule threaded execution
struct MTaskState {
typedef std::list<const OrderLogicVertex*> Logics;
AstMTaskBody* m_mtaskBodyp;
Logics m_logics;
ExecMTask* m_execMTaskp;
MTaskState() : m_mtaskBodyp(NULL), m_execMTaskp(NULL) {}
};
void processMTasks();
typedef enum {LOGIC_INITIAL, LOGIC_SETTLE} InitialLogicE;
void processMTasksInitial(InitialLogicE logic_type);
string cfuncName(AstNodeModule* modp, AstSenTree* domainp, AstScope* scopep, AstNode* forWhatp) {
modp->user3Inc();
int funcnum = modp->user3();
@ -1726,6 +1805,173 @@ AstActive* OrderVisitor::processMoveOneLogic(const OrderLogicVertex* lvertexp,
return activep;
}
// Emit the serial time-zero logic of the given kind (initial or settle),
// packaging it into CFuncs that never cross scope boundaries.
void OrderVisitor::processMTasksInitial(InitialLogicE logic_type) {
    // Emit initial/settle logic. Initial blocks won't be part of the
    // mtask partition, so they aren't eligible for parallelism.
    //
    int initStmts = 0;           // Statement count accumulated into current cfunc
    AstCFunc* initCFunc = NULL;  // CFunc being filled; NULL forces a new one
    AstScope* lastScopep = NULL; // Scope of the previously emitted logic vertex
    for (V3GraphVertex* initVxp = m_graph.verticesBeginp();
         initVxp; initVxp = initVxp->verticesNextp()) {
        OrderLogicVertex* initp = dynamic_cast<OrderLogicVertex*>(initVxp);
        if (!initp) continue;  // Skip var/input vertices, keep only logic
        // Filter to just the requested logic kind
        if ((logic_type == LOGIC_INITIAL)
            && !initp->domainp()->hasInitial()) continue;
        if ((logic_type == LOGIC_SETTLE)
            && !initp->domainp()->hasSettle()) continue;
        if (initp->scopep() != lastScopep) {
            // Start new cfunc, don't let the cfunc cross scopes
            initCFunc = NULL;
            lastScopep = initp->scopep();
        }
        AstActive* newActivep = processMoveOneLogic(initp, initCFunc/*ref*/, initStmts/*ref*/);
        if (newActivep) m_scopetopp->addActivep(newActivep);
    }
}
// Schedule threaded execution: emit time-zero logic serially, partition
// the remaining logic into mtasks, and build the persistent ExecMTask
// graph plus AstMTaskBody's consumed later by code generation.
void OrderVisitor::processMTasks() {
    // For nondeterminism debug:
    V3Partition::hashGraphDebug(&m_graph, "V3Order's m_graph");

    // Initial/settle logic runs once at time zero; it is emitted serially
    // here and excluded from the mtask partition below.
    processMTasksInitial(LOGIC_INITIAL);
    processMTasksInitial(LOGIC_SETTLE);

    // We already produced a graph of every var, input, logic, and settle
    // block and all dependencies; this is 'm_graph'.
    //
    // Now, starting from m_graph, make a slightly-coarsened graph representing
    // only logic, and discarding edges we know we can ignore.
    // This is quite similar to the 'm_pomGraph' of the serial code gen:
    V3Graph logicGraph;
    OrderMTaskMoveVertexMaker create_mtask_vertex(&logicGraph);
    ProcessMoveBuildGraph<MTaskMoveVertex> mtask_pmbg(
        &m_graph, &logicGraph, &create_mtask_vertex);
    mtask_pmbg.build();

    // Needed? We do this for m_pomGraph in serial mode, so do it here too:
    logicGraph.removeRedundantEdges(&V3GraphEdge::followAlwaysTrue);

    // Partition logicGraph into LogicMTask's. The partitioner will annotate
    // each vertex in logicGraph with a 'color' which is really an mtask ID
    // in this context.
    V3Partition partitioner(&logicGraph);
    V3Graph mtasks;
    partitioner.go(&mtasks);

    // Per-mtask bookkeeping, keyed by mtask ID (== vertex color)
    vl_unordered_map<unsigned /*mtask id*/, MTaskState> mtaskStates;

    // Iterate through the entire logicGraph. For each logic node,
    // attach it to a per-MTask ordered list of logic nodes.
    // This is the order we'll execute logic nodes within the MTask.
    //
    // MTasks may span scopes and domains, so sort by both here:
    GraphStream<OrderVerticesByDomainThenScope> emit_logic(&logicGraph);
    const V3GraphVertex* moveVxp;
    while ((moveVxp = emit_logic.nextp())) {
        const MTaskMoveVertex* movep =
            dynamic_cast<const MTaskMoveVertex*>(moveVxp);
        // color() carries the mtask ID assigned by the partitioner
        unsigned mtaskId = movep->color();
        UASSERT(mtaskId > 0,
                "Every MTaskMoveVertex should have an mtask assignment >0");
        if (movep->logicp()) {
            // Add this logic to the per-mtask order
            mtaskStates[mtaskId].m_logics.push_back(movep->logicp());

            // Since we happen to be iterating over every logic node,
            // take this opportunity to annotate each AstVar with the id's
            // of mtasks that consume it and produce it. We'll use this
            // information in V3EmitC when we lay out var's in memory.
            const OrderLogicVertex* logicp = movep->logicp();
            for (const V3GraphEdge* edgep = logicp->inBeginp();
                 edgep; edgep = edgep->inNextp()) {
                const OrderVarVertex* pre_varp =
                    dynamic_cast<const OrderVarVertex*>(edgep->fromp());
                if (!pre_varp) continue;
                AstVar* varp = pre_varp->varScp()->varp();
                // varp depends on logicp, so logicp produces varp,
                // and vice-versa below
                varp->addProducingMTaskId(mtaskId);
            }
            for (const V3GraphEdge* edgep = logicp->outBeginp();
                 edgep; edgep = edgep->outNextp()) {
                const OrderVarVertex* post_varp
                    = dynamic_cast<const OrderVarVertex*>(edgep->top());
                if (!post_varp) continue;
                AstVar* varp = post_varp->varScp()->varp();
                varp->addConsumingMTaskId(mtaskId);
            }
            // TODO? We ignore IO vars here, so those will have empty mtask
            // signatures. But we could also give those mtask signatures.
        }
    }

    // Create the AstExecGraph node which represents the execution
    // of the MTask graph.
    FileLine* rootFlp = new FileLine("AstRoot", 0);
    AstExecGraph* execGraphp = new AstExecGraph(rootFlp);
    m_scopetopp->addActivep(execGraphp);
    v3Global.rootp()->execGraphp(execGraphp);

    // Create CFuncs and bodies for each MTask, in deterministic ID order.
    GraphStream<MTaskVxIdLessThan> emit_mtasks(&mtasks);
    const V3GraphVertex* mtaskVxp;
    while ((mtaskVxp = emit_mtasks.nextp())) {
        const AbstractLogicMTask* mtaskp =
            dynamic_cast<const AbstractLogicMTask*>(mtaskVxp);

        // Create a body for this mtask
        AstMTaskBody* bodyp = new AstMTaskBody(rootFlp);
        MTaskState& state = mtaskStates[mtaskp->id()];
        state.m_mtaskBodyp = bodyp;

        // Create leaf CFunc's to run this mtask's logic,
        // and create a set of AstActive's to call those CFuncs.
        // Add the AstActive's into the AstMTaskBody.
        const AstSenTree* last_domainp = NULL;
        AstCFunc* leafCFuncp = NULL;
        int leafStmts = 0;
        for (MTaskState::Logics::iterator it = state.m_logics.begin();
             it != state.m_logics.end(); ++it) {
            const OrderLogicVertex* logicp = *it;
            if (logicp->domainp() != last_domainp) {
                // Start a new leaf function; don't mix domains in one cfunc.
                leafCFuncp = NULL;
            }
            last_domainp = logicp->domainp();
            AstActive* newActivep = processMoveOneLogic(logicp, leafCFuncp/*ref*/, leafStmts/*ref*/);
            if (newActivep) bodyp->addStmtsp(newActivep);
        }

        // Translate the LogicMTask graph into the corresponding ExecMTask
        // graph, which will outlive V3Order and persist for the remainder
        // of verilator's processing.
        // - The LogicMTask graph points to MTaskMoveVertex's
        //   and OrderLogicVertex's which are ephemeral to V3Order.
        // - The ExecMTask graph and the AstMTaskBody's produced here
        //   persist until code generation time.
        state.m_execMTaskp =
            new ExecMTask(execGraphp->mutableDepGraphp(),
                          bodyp, mtaskp->id());
        // Cross-link each ExecMTask and MTaskBody
        //  Q: Why even have two objects?
        //  A: One is an AstNode, the other is a GraphVertex,
        //     to combine them would involve multiple inheritance...
        state.m_mtaskBodyp->execMTaskp(state.m_execMTaskp);
        // Mirror each dependency edge of the logic mtask graph into the
        // persistent ExecMTask graph.
        for (V3GraphEdge* inp = mtaskp->inBeginp();
             inp; inp = inp->inNextp()) {
            const V3GraphVertex* fromVxp = inp->fromp();
            const AbstractLogicMTask* fromp =
                dynamic_cast<const AbstractLogicMTask*>(fromVxp);
            MTaskState& fromState = mtaskStates[fromp->id()];
            new V3GraphEdge(execGraphp->mutableDepGraphp(),
                            fromState.m_execMTaskp, state.m_execMTaskp, 1);
        }
        execGraphp->addMTaskBody(bodyp);
    }
}
//######################################################################
// OrderVisitor - Top processing
@ -1762,7 +2008,7 @@ void OrderVisitor::process() {
if (debug() && v3Global.opt.dumpTree()) processEdgeReport();
{
if (!v3Global.opt.mtasks()) {
UINFO(2," Construct Move Graph...\n");
processMoveBuildGraph();
if (debug()>=4) m_pomGraph.dumpDotFilePrefixed("ordermv_start"); // Different prefix (ordermv) as it's not the same graph
@ -1771,6 +2017,9 @@ void OrderVisitor::process() {
UINFO(2," Move...\n");
processMove();
} else {
UINFO(2," Set up mtasks...\n");
processMTasks();
}
// Any SC inputs feeding a combo domain must be marked, so we can make them sc_sensitive

View File

@ -21,6 +21,7 @@
//
// V3GraphVertex
// OrderMoveVertex
// MTaskMoveVertex
// OrderEitherVertex
// OrderInputsVertex
// OrderSettleVertex
@ -47,6 +48,7 @@
#include "verilatedos.h"
#include "V3Ast.h"
#include "V3Graph.h"
#include VL_INCLUDE_UNORDERED_MAP
class OrderVisitor;
class OrderMoveVertex;
@ -363,6 +365,57 @@ public:
void domScopep(OrderMoveDomScope* ds) { m_domScopep=ds; }
};
// Similar to OrderMoveVertex, but modified for threaded code generation.
// Each vertex represents either a logic node or a var node from V3Order's
// fine-grained graph; after partitioning, color() carries its mtask ID.
class MTaskMoveVertex : public V3GraphVertex {
    // This could be more compact, since we know m_varp and m_logicp
    // cannot both be set. Each MTaskMoveVertex represents a logic node
    // or a var node, it can't be both.
    OrderLogicVertex* m_logicp;  // Logic represented by this vertex (or NULL)
    const OrderEitherVertex* m_varp;  // Var represented by this vertex (or NULL)
    const AstScope* m_scopep;    // Scope of the represented node
    const AstSenTree* m_domainp; // Sensitivity domain of the represented node
protected:
    friend class OrderVisitor;
    // NOTE(review): no class named MTaskMoveVertexMaker is visible here
    // (only OrderMTaskMoveVertexMaker) — confirm this friend is needed.
    friend class MTaskMoveVertexMaker;
public:
    MTaskMoveVertex(V3Graph* graphp, OrderLogicVertex* logicp,
                    const OrderEitherVertex* varp,
                    const AstScope* scopep, const AstSenTree* domainp)
        : V3GraphVertex(graphp), m_logicp(logicp),
          m_varp(varp), m_scopep(scopep), m_domainp(domainp) {
        UASSERT(!(logicp && varp),
                "MTaskMoveVertex: logicp and varp may not both be set!\n");
    }
    virtual ~MTaskMoveVertex() {}
    // Cloning is unsupported; vertices are created only via their maker.
    virtual MTaskMoveVertex* clone(V3Graph* graphp) const {
        v3fatalSrc("Unsupported"); return NULL; }
    virtual OrderVEdgeType type() const { return OrderVEdgeType::VERTEX_MOVE; }
    // Dot-graph color for debug dumps: logic keeps its own color, vars yellow
    virtual string dotColor() const {
        if (logicp()) return logicp()->dotColor();
        else return "yellow";
    }
    // Debug label: logic name plus domain/scope pointers and mtask ID
    virtual string name() const {
        string nm;
        if (logicp()) {
            nm = logicp()->name();
            nm += (string("\\nMV:")
                   +" d="+cvtToStr((void*)logicp()->domainp())
                   +" s="+cvtToStr((void*)logicp()->scopep())
                   // "color()" represents the mtask ID.
                   +"\\nt="+cvtToStr(color()));
        } else {
            nm = "nolog\\nt="+cvtToStr(color());
        }
        return nm;
    }
    // ACCESSORS
    OrderLogicVertex* logicp() const { return m_logicp; }
    const OrderEitherVertex* varp() const { return m_varp; }
    const AstScope* scopep() const { return m_scopep; }
    const AstSenTree* domainp() const { return m_domainp; }
};
//######################################################################
// Edge types

2759
src/V3Partition.cpp Normal file

File diff suppressed because it is too large Load Diff

99
src/V3Partition.h Normal file
View File

@ -0,0 +1,99 @@
// -*- mode: C++; c-file-style: "cc-mode" -*-
//*************************************************************************
// DESCRIPTION: Verilator: Threading's logic to mtask partitioner
//
// Code available from: http://www.veripool.org/verilator
//
//*************************************************************************
//
// Copyright 2003-2018 by Wilson Snyder. This program is free software; you can
// redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License
// Version 2.0.
//
// Verilator is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
//*************************************************************************
#ifndef _V3PARTITION_H_
#define _V3PARTITION_H_
#include "config_build.h"
#include "verilatedos.h"
#include <list>
#include "V3Graph.h"
#include "V3OrderGraph.h"
class LogicMTask;
typedef vl_unordered_map<const MTaskMoveVertex*, LogicMTask*> Vx2MTaskMap;
//*************************************************************************
/// V3Partition takes the fine-grained logic graph from V3Order and
/// collapses it into a coarse-grained graph of AbstractLogicMTask's, each
/// of which contains a set of the logic nodes from the fine-grained
/// graph.
class V3Partition {
    // MEMBERS
    V3Graph* m_fineDepsGraphp;  // Fine-grained dependency graph (not owned)
public:
    // CONSTRUCTORS
    explicit V3Partition(V3Graph* fineDepsGraphp)
        : m_fineDepsGraphp(fineDepsGraphp) {}
    ~V3Partition() {}

    // METHODS
    // Fill in the provided empty graph with AbstractLogicMTask's and their
    // interdependencies.
    void go(V3Graph* mtasksp);

    // Internal consistency checks, run only under --debug-self-test.
    static void selfTest();

    // Print out a hash of the shape of graphp. Only needed to debug the
    // origin of some nondeterminism; otherwise this is pretty useless.
    static void hashGraphDebug(const V3Graph* graphp, const char* debugName);

    // Print debug stats about graphp whose nodes must be AbstractMTask's.
    static void debugMTaskGraphStats(const V3Graph* graphp, const string& name);

    // Operate on the final ExecMTask graph, immediately prior to code
    // generation time.
    static void finalize();

private:
    static void finalizeCosts(V3Graph* execMTaskGraphp);
    static void setupMTaskDeps(V3Graph* mtasksp, const Vx2MTaskMap* vx2mtaskp);

    VL_DEBUG_FUNC;  // Declare debug()
    VL_UNCOPYABLE(V3Partition);
};
//*************************************************************************
// Map a pointer into an ID, for e.g. nodep to mtask mappings
// Assigns a stable small integer ID to each distinct pointer, in order of
// first appearance. Gives deterministic orderings that do not depend on
// allocator-assigned pointer values.
class PartPtrIdMap {
private:
    // TYPES
    typedef vl_unordered_map <const void*, vluint64_t> PtrMap;
    // MEMBERS
    // Both mutable so findId() can assign IDs while remaining const
    mutable vluint64_t m_nextId;  // Next unassigned ID
    mutable PtrMap m_id;          // Pointer -> assigned ID
public:
    // CONSTRUCTORS
    PartPtrIdMap() : m_nextId(0) {}
    // METHODS
    // Return the ID for ptrp, assigning the next free ID on first sight.
    vluint64_t findId(const void* ptrp) const {
        // Single hash lookup (the original find-then-insert did two):
        // speculatively insert m_nextId; if the key already existed the
        // map is unchanged and the previously assigned ID is returned.
        std::pair<PtrMap::iterator, bool> inserted
            = m_id.insert(PtrMap::value_type(ptrp, m_nextId));
        if (inserted.second) ++m_nextId;  // New pointer consumed an ID
        return inserted.first->second;
    }
};
#endif // Guard

108
src/V3PartitionGraph.h Normal file
View File

@ -0,0 +1,108 @@
// -*- mode: C++; c-file-style: "cc-mode" -*-
//*************************************************************************
// DESCRIPTION: Verilator: Threading's graph structures
//
// Code available from: http://www.veripool.org/verilator
//
//*************************************************************************
//
// Copyright 2003-2018 by Wilson Snyder. This program is free software; you can
// redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License
// Version 2.0.
//
// Verilator is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
//*************************************************************************
#ifndef _V3PARTITIONGRAPH_H_
#define _V3PARTITIONGRAPH_H_
#include "config_build.h"
#include "verilatedos.h"
#include <list>
#include "V3Graph.h"
#include "V3OrderGraph.h"
//*************************************************************************
// MTasks and graph structures
// Abstract base for one unit of work (an "mtask") in the coarse-grained
// execution graph. Concrete mtasks provide a unique serial ID and an
// estimated cost.
class AbstractMTask : public V3GraphVertex {
public:
    // CONSTRUCTORS
    // 'explicit' to match the file's convention for single-argument
    // constructors and avoid accidental V3Graph*-to-vertex conversions.
    explicit AbstractMTask(V3Graph* graphp) : V3GraphVertex(graphp) {}
    virtual ~AbstractMTask() {}
    // METHODS
    virtual uint32_t id() const = 0;    // Unique id of this mtask
    virtual uint32_t cost() const = 0;  // Estimated cost, abstract time units
};
// An mtask that owns a list of logic vertices from the fine-grained
// move graph; produced by the V3Partition partitioner.
class AbstractLogicMTask : public AbstractMTask {
public:
    // TYPES
    typedef std::list<MTaskMoveVertex*> VxList;
    // CONSTRUCTORS
    // 'explicit' to match the file's convention for single-argument
    // constructors and avoid accidental V3Graph*-to-vertex conversions.
    explicit AbstractLogicMTask(V3Graph* graphp) : AbstractMTask(graphp) {}
    virtual ~AbstractLogicMTask() {}
    // METHODS
    // Set of logic vertices in this mtask. Order is not significant.
    virtual const VxList* vertexListp() const = 0;
    virtual uint32_t id() const = 0;  // Unique id of this mtask.
    virtual uint32_t cost() const = 0;
};
// An mtask in the final, persistent execution graph that survives V3Order
// and drives code generation. Holds the AstMTaskBody plus static-scheduling
// metadata (priority, cost, thread assignment, pack order).
class ExecMTask : public AbstractMTask {
private:
    AstMTaskBody* m_bodyp;  // Task body
    uint32_t m_id;  // Unique id of this mtask.
    uint32_t m_priority;  // Predicted critical path from the start of
    // this mtask to the ends of the graph that are reachable from this
    // mtask. In abstract time units.
    uint32_t m_cost;  // Predicted runtime of this mtask, in the same
    // abstract time units as priority().
    uint32_t m_thread;  // Thread for static (pack_mtasks) scheduling,
    // or 0xffffffff if not yet assigned.
    const ExecMTask* m_packNextp;  // Next for static (pack_mtasks) scheduling
    bool m_threadRoot;  // Is root thread
    VL_UNCOPYABLE(ExecMTask);
public:
    ExecMTask(V3Graph* graphp, AstMTaskBody* bodyp, uint32_t id)
        : AbstractMTask(graphp),
          m_bodyp(bodyp),
          m_id(id),
          m_priority(0),
          m_cost(0),
          m_thread(0xffffffff),  // 0xffffffff == not yet assigned to a thread
          m_packNextp(NULL),
          m_threadRoot(false) {}
    // ACCESSORS
    AstMTaskBody* bodyp() const { return m_bodyp; }
    virtual uint32_t id() const { return m_id; }
    uint32_t priority() const { return m_priority; }
    void priority(uint32_t pri) { m_priority = pri; }
    virtual uint32_t cost() const { return m_cost; }
    void cost(uint32_t cost) { m_cost = cost; }
    void thread(uint32_t thread) { m_thread = thread; }
    uint32_t thread() const { return m_thread; }
    void packNextp(const ExecMTask* nextp) { m_packNextp = nextp; }
    const ExecMTask* packNextp() const { return m_packNextp; }
    bool threadRoot() const { return m_threadRoot; }
    void threadRoot(bool threadRoot) { m_threadRoot = threadRoot; }
    string cFuncName() const {
        // If this MTask maps to a C function, this should be the name
        return string("__Vmtask")+"__"+cvtToStr(m_id);
    }
    string name() const { return string("mt")+cvtToStr(id()); }
    // Dump debug info: name, address, then only the fields that are set
    void dump(std::ostream& str) const {
        str <<name()<<"."<<((void*)this);
        if (priority() || cost()) str <<" [pr="<<priority()<<" c="<<cvtToStr(cost())<<"]";
        if (thread() != 0xffffffff) str <<" th="<<thread();
        if (threadRoot()) str <<" [ROOT]";
        if (packNextp()) str <<" nx="<<packNextp()->name();
    }
};
// Stream insertion for debug output; delegates to ExecMTask::dump().
inline std::ostream& operator<<(std::ostream& os, const ExecMTask& rhs) {
    rhs.dump(os);
    return os;
}
#endif // Guard

View File

@ -182,6 +182,7 @@ private:
AstNode* m_chgSubParentp;// Which node has call to m_chgSubFuncp
int m_chgSubStmts; // Statements under function being built
AstVarScope* m_activityVscp; // Activity variable
uint32_t m_activityNumber; // Count of fields in activity variable
uint32_t m_code; // Trace ident code# being assigned
V3Graph m_graph; // Var/CFunc tracking
TraceActivityVertex* m_alwaysVtxp; // "Always trace" vertex
@ -297,7 +298,7 @@ private:
void assignActivity() {
// Select activity numbers and put into each CFunc vertex
uint32_t activityNumber = 1; // Note 0 indicates "slow"
m_activityNumber = 1; // Note 0 indicates "slow"
for (V3GraphVertex* itp = m_graph.verticesBeginp(); itp; itp=itp->verticesNextp()) {
if (TraceActivityVertex* vvertexp = dynamic_cast<TraceActivityVertex*>(itp)) {
if (!vvertexp->activityCodeValid()) {
@ -306,17 +307,39 @@ private:
// This makes us need less activityNumbers and so speeds up the fast path.
vvertexp->activityCode(TraceActivityVertex::ACTIVITY_SLOW);
} else {
vvertexp->activityCode(activityNumber++);
vvertexp->activityCode(m_activityNumber++);
}
}
}
}
// Insert global variable
if (!activityNumber) activityNumber++; // For simplicity, always create it
int activityBits = VL_WORDS_I(activityNumber)*VL_WORDSIZE; // For tighter code; round to next 32 bit point.
AstVar* newvarp = new AstVar (m_chgFuncp->fileline(), AstVarType::MODULETEMP,
"__Vm_traceActivity", VFlagBitPacked(), activityBits);
AstVar* newvarp;
if (v3Global.opt.mtasks()) {
// Create a vector of bytes, not bits, for the tracing vector,
// so that we can set them atomically without locking.
//
// TODO: It would be slightly faster to have a bit vector per
// chain of packed MTasks, but we haven't packed the MTasks yet.
// If we support fully threaded tracing in the future, it would
// make sense to improve this at that time.
AstNodeDType* newScalarDtp
= new AstBasicDType(m_chgFuncp->fileline(), VFlagLogicPacked(), 1);
v3Global.rootp()->typeTablep()->addTypesp(newScalarDtp);
AstNodeDType* newArrDtp = new AstUnpackArrayDType(
m_chgFuncp->fileline(),
newScalarDtp,
new AstRange(m_chgFuncp->fileline(),
VNumRange(m_activityNumber-1, 0, false)));
v3Global.rootp()->typeTablep()->addTypesp(newArrDtp);
newvarp = new AstVar(m_chgFuncp->fileline(),
AstVarType::MODULETEMP,
"__Vm_traceActivity", newArrDtp);
} else {
// For tighter code; round to next 32 bit point.
int activityBits = VL_WORDS_I(m_activityNumber)*VL_WORDSIZE;
newvarp = new AstVar(m_chgFuncp->fileline(), AstVarType::MODULETEMP,
"__Vm_traceActivity", VFlagBitPacked(), activityBits);
}
m_topModp->addStmtp(newvarp);
AstVarScope* newvscp = new AstVarScope(newvarp->fileline(), m_highScopep, newvarp);
m_highScopep->addVarp(newvscp);
@ -329,15 +352,23 @@ private:
FileLine* fl = vvertexp->insertp()->fileline();
uint32_t acode = vvertexp->activityCode();
vvertexp->insertp()->addNextHere
(new AstAssign (fl,
new AstSel (fl, new AstVarRef(fl, m_activityVscp, true),
acode, 1),
new AstConst (fl, AstConst::LogicTrue())));
(new AstAssign(fl, selectActivity(fl, acode, true),
new AstConst(fl, AstConst::LogicTrue())));
}
}
}
}
AstNode* selectActivity(FileLine* flp, uint32_t acode, bool lvalue) {
if (v3Global.opt.mtasks()) {
return new AstArraySel(
flp, new AstVarRef(flp, m_activityVscp, lvalue), acode);
} else {
return new AstSel(
flp, new AstVarRef(flp, m_activityVscp, lvalue), acode, 1);
}
}
AstCFunc* newCFunc(AstCFuncType type, const string& name, AstCFunc* basep) {
AstCFunc* funcp = new AstCFunc(basep->fileline(), name, basep->scopep());
funcp->slow(basep->slow());
@ -453,8 +484,7 @@ private:
AstNode* condp = NULL;
for (ActCodeSet::const_iterator csit = actset.begin(); csit!=actset.end(); ++csit) {
uint32_t acode = *csit;
AstNode* selp = new AstSel (fl, new AstVarRef(fl, m_activityVscp, false),
acode, 1);
AstNode* selp = selectActivity(fl, acode, false);
if (condp) condp = new AstOr (fl, condp, selp);
else condp = selp;
}
@ -473,11 +503,19 @@ private:
// Clear activity after tracing completes
FileLine* fl = m_chgFuncp->fileline();
AstNode* clrp = new AstAssign (fl,
new AstVarRef(fl, m_activityVscp, true),
new AstConst(fl, V3Number(fl, m_activityVscp->width())));
m_fullFuncp->addFinalsp(clrp->cloneTree(true));
m_chgFuncp->addFinalsp(clrp);
if (v3Global.opt.mtasks()) {
for (uint32_t i = 0; i < m_activityNumber; ++i) {
AstNode* clrp = new AstAssign(fl, selectActivity(fl, i, true),
new AstConst(fl, AstConst::LogicFalse()));
m_fullFuncp->addFinalsp(clrp->cloneTree(true));
m_chgFuncp->addFinalsp(clrp);
}
} else {
AstNode* clrp = new AstAssign(fl, new AstVarRef(fl, m_activityVscp, true),
new AstConst(fl, V3Number(fl, m_activityVscp->width())));
m_fullFuncp->addFinalsp(clrp->cloneTree(true));
m_chgFuncp->addFinalsp(clrp);
}
}
uint32_t assignDeclCode(AstTraceDecl* nodep) {
@ -699,6 +737,7 @@ public:
m_chgSubFuncp = NULL;
m_chgSubParentp = NULL;
m_chgSubStmts = 0;
m_activityNumber = 0;
m_code = 0;
m_finding = false;
m_funcNum = 0;

View File

@ -73,6 +73,7 @@
#include "V3Param.h"
#include "V3Parse.h"
#include "V3ParseSym.h"
#include "V3Partition.h"
#include "V3PreShell.h"
#include "V3Premit.h"
#include "V3Reloop.h"
@ -524,6 +525,14 @@ void process () {
V3EmitC::emitcSyms();
V3EmitC::emitcTrace();
}
if (!v3Global.opt.xmlOnly()
&& v3Global.opt.mtasks()) {
// Finalize our MTask cost estimates and pack the mtasks into
// threads. Must happen pre-EmitC which relies on the packing
// order. Must happen post-V3LifePost which changes the relative
// costs of mtasks.
V3Partition::finalize();
}
if (!v3Global.opt.xmlOnly()) { // Unfortunately we have some lint checks in emitc.
V3EmitC::emitc();
}
@ -607,8 +616,11 @@ int main(int argc, char** argv, char** env) {
VHashSha1::selfTest();
AstBasicDTypeKwd::selfTest();
V3Graph::selfTest();
V3TSP::selfTest();
V3ScoreboardBase::selfTest();
if (v3Global.opt.debugSelfTest()) {
V3TSP::selfTest();
V3ScoreboardBase::selfTest();
V3Partition::selfTest();
}
// Read first filename
v3Global.readFiles();

View File

@ -44,7 +44,7 @@ endif
.PHONY: test
test:
$(PERL) driver.pl $(DRIVER_FLAGS) --vlt --dist
$(PERL) driver.pl $(DRIVER_FLAGS) --vlt --vltmt --dist
######################################################################
@ -61,6 +61,9 @@ nc:
vlt:
$(PERL) driver.pl $(DRIVER_FLAGS) --vlt --stop
vltmt:
$(PERL) driver.pl $(DRIVER_FLAGS) --vltmt --stop
######################################################################
random:

View File

@ -45,6 +45,7 @@ our %All_Scenarios
nc => ["simulator", "nc"],
vcs => ["simulator", "vcs"],
vlt => ["simulator", "vlt_all", "vlt"],
vltmt => ["simulator", "vlt_all", "vltmt"],
);
#======================================================================
@ -104,6 +105,7 @@ if (! GetOptions (
"ms!" => sub { $opt_scenarios{ms} = $_[1]; },
"nc!" => sub { $opt_scenarios{nc} = $_[1]; },
"vlt!" => sub { $opt_scenarios{vlt} = $_[1]; },
"vltmt!" => sub { $opt_scenarios{vltmt} = $_[1]; },
"vcs!" => sub { $opt_scenarios{vcs} = $_[1]; },
"<>" => \&parameter,
)) {
@ -322,6 +324,7 @@ sub new {
$self->{scenario} ||= "ghdl" if $self->{ghdl};
$self->{scenario} ||= "vcs" if $self->{vcs};
$self->{scenario} ||= "vlt" if $self->{vlt};
$self->{scenario} ||= "vltmt" if $self->{vltmt};
$self->{scenario} ||= "nc" if $self->{nc};
$self->{scenario} ||= "ms" if $self->{ms};
$self->{scenario} ||= "iv" if $self->{iv};
@ -407,6 +410,7 @@ sub new {
ms_run_flags => [split(/\s+/,"-lib $self->{obj_dir}/work -c -do 'run -all;quit' ")],
# Verilator
vlt => 0,
vltmt => 0,
verilator_flags => ["-cc",
"-Mdir $self->{obj_dir}",
"-OD", # As currently disabled unless -O3
@ -420,7 +424,7 @@ sub new {
%$self};
bless $self, $class;
$self->{vlt_all} = $self->{vlt}; # Any Verilator scenario
$self->{vlt_all} = $self->{vlt} || $self->{vltmt}; # Any Verilator scenario
$self->{VM_PREFIX} ||= "V".$self->{name};
$self->{stats} ||= "$self->{obj_dir}/V".$self->{name}."__stats.txt";
@ -593,6 +597,8 @@ sub compile_vlt_flags {
unshift @verilator_flags, "--gdbbt" if $opt_gdbbt;
unshift @verilator_flags, "--x-assign unique"; # More likely to be buggy
unshift @verilator_flags, "--trace" if $opt_trace;
unshift @verilator_flags, "--threads 3" if $param{vltmt};
unshift @verilator_flags, "--debug-partition" if $param{vltmt};
if (defined $opt_optimize) {
my $letters = "";
if ($opt_optimize =~ /[a-zA-Z]/) {
@ -746,6 +752,11 @@ sub compile {
return 1;
}
if ($self->{vltmt} && !$self->cfg_with_threaded) {
$self->skip("Test requires Verilator configured with threads\n");
return 1;
}
if (!$param{fails} && $param{verilator_make_gcc}
&& $param{make_main}) {
$self->_make_main();
@ -2045,7 +2056,11 @@ Run Synopsys VCS simulator tests.
=item --vlt
Run Verilator tests. Default unless another scenario flag is provided.
Run Verilator tests in single-threaded mode. Default unless another scenario flag is provided.
=item --vltmt
Run Verilator tests in multithreaded mode.
=back

22
test_regress/t/t_a_selftest.pl Executable file
View File

@ -0,0 +1,22 @@
#!/usr/bin/perl
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2003 by Wilson Snyder. This program is free software; you can
# redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
# Runs Verilator's built-in C++ self-tests via --debug-self-test; no
# generated model is compiled or executed.  Runs under both single- and
# multithreaded scenarios (vlt_all).
scenarios(vlt_all => 1);
top_filename("t/t_EXAMPLE.v");
compile(
# Invoke Verilator only; skip building a shell/main and skip gcc entirely,
# since the self-tests run inside the verilator binary itself.
verilator_flags2 => ['--debug-self-test'],
verilator_make_gcc => 0,
make_top_shell => 0,
make_main => 0,
);
ok(1);
1;

View File

@ -15,7 +15,8 @@ compile(
if ($Self->{vlt_all}) {
file_grep ($Self->{stats}, qr/Optimizations, Tables created\s+(\d+)/i, 10);
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i, 8);
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i,
($Self->{vltmt} ? 0 : 8));
}
execute(

21
test_regress/t/t_dpi_threads.pl Executable file
View File

@ -0,0 +1,21 @@
#!/usr/bin/perl
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2018 by Wilson Snyder. This program is free software; you can
# redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
# Multithreaded-only: confirms Verilator's DPI-call serialization keeps the
# two DPI-calling always blocks (see comments in t/t_dpi_threads.v) from
# running concurrently.  --no-threads-coarsen disables MTask coarsening so
# the hazard-resolved MTask graph carries through to the generated code.
scenarios(vltmt => 1);
compile(
v_flags2 => ["t/t_dpi_threads_c.cpp --no-threads-coarsen"],
);
execute(
# The C side counts collisions; the design $finishes cleanly when none.
check_finished => 1,
);
ok(1);
1;

View File

@ -0,0 +1,62 @@
// DESCRIPTION: Verilator: Verilog Test module
//
// Copyright 2018 by Wilson Snyder. This program is free software; you can
// redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License
// Version 2.0.
// DPI imports implemented in t_dpi_threads_c.cpp: $dpii_sys marks a
// (deliberately slow) critical section; $dpii_failure reports whether two
// calls ever overlapped.
import "DPI-C" dpii_sys_task = function void \$dpii_sys ();
import "DPI-C" dpii_failure = function int \$dpii_failure ();
module t (clk);
input clk;
integer cyc;
integer failure;
initial cyc = 0;
`ifndef verilator
`error "Only Verilator supports PLI-ish DPI calls."
`endif
// Checker: on cycle 2, ask the DPI side whether a collision was seen;
// $stop on failure, otherwise finish normally.
always @ (posedge clk) begin
if (cyc == 2) begin
failure = $dpii_failure();
$write("* failure = %0d\n", failure);
if (failure > 0) begin
$stop;
end
$write("*-* All Finished *-*\n");
$finish;
end
cyc <= cyc + 1;
end
// The purpose of this test is to confirm that the DPI-call serialization
// code in V3Partition does ensure that these DPI calls do not run
// concurrently.
//
// Alternatively, the test may be run with "--threads-dpi all" in which case
// it should confirm that the calls do run concurrently and do detect a
// collision (they should, if the test is set up right.) This is
// t_dpi_threads_collide.pl.
//
// Q) Is it a risk that the partitioner will merge or serialize these always
// blocks, just by luck, even if the DPI-call serialization code fails?
//
// A) Yes, that's why t_dpi_threads_collide.pl also passes
// --no-threads-coarsen to disable MTask coarsening. This ensures that
// the MTask graph at the end of FixDataHazards (where we resolve DPI
// hazards) is basically the final MTasks graph, and that data hazards
// which persist beyond FixDataHazards should persist in the final
// generated C code.
always @ (posedge clk) begin
$dpii_sys();
end
always @ (posedge clk) begin
$dpii_sys();
end
endmodule

View File

@ -0,0 +1,78 @@
// -*- mode: C++; c-file-style: "cc-mode" -*-
//*************************************************************************
//
// Copyright 2018-2018 by Wilson Snyder. This program is free software; you can
// redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License.
// Version 2.0.
//
// Verilator is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
//*************************************************************************
#include <atomic>
#include <cstdio>
#include <iostream>
#include <unistd.h>
#include "svdpi.h"
//======================================================================
#if defined(VERILATOR)
# ifdef T_DPI_THREADS_COLLIDE
# include "Vt_dpi_threads_collide__Dpi.h"
# else
# include "Vt_dpi_threads__Dpi.h"
# endif
#elif defined(VCS)
# include "../vc_hdrs.h"
#elif defined(CADENCE)
# define NEED_EXTERNS
#else
# error "Unknown simulator for DPI test"
#endif
#ifdef NEED_EXTERNS
extern "C" {
extern void dpii_sys_task();
extern int dpii_failure();
}
#endif
//======================================================================
// Shared collision-detection flags for the DPI test.
struct state {
    std::atomic<bool> task_is_running;  // true while a dpii_sys_task() call is active
    std::atomic<int> failure;           // nonzero once an overlap has been observed
    state()
        : task_is_running(false)
        , failure(0) {}
};

static state st;

// DPI entry point: mark ourselves running, linger, then clear the flag.
// If the flag was already set on entry, another thread is inside this
// routine at the same time -- record that as a failure.
void dpii_sys_task() {
    const bool was_running = st.task_is_running.exchange(true);
    if (was_running) {
        // Another task is running. This is a collision.
        st.failure.store(1);
        std::cerr << "t_dpi_threads_c.cpp dpii_sys_task() saw threads collide.\n";
    } else {
        std::cerr << "t_dpi_threads_c.cpp dpii_sys_task() no collision. @" << &st.task_is_running << "\n";
    }
    // Spend some time in the DPI call so that a possible collision almost
    // certainly happens.  Not a hard guarantee of catching every race, but
    // one second dwarfs the expected runtime of everything else in the test.
    sleep(1);
    st.task_is_running.exchange(false);
}

// DPI entry point: report whether any collision has been seen so far.
int dpii_failure() { return st.failure; }

View File

@ -0,0 +1,28 @@
#!/usr/bin/perl
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2018 by Wilson Snyder. This program is free software; you can
# redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
# Multithreaded-only companion to t_dpi_threads: with --threads-dpi all
# the DPI imports may run concurrently, so the C side should observe a
# collision and the design should $stop (hence fails => 1 below).
scenarios(vltmt => 1);
top_filename("t/t_dpi_threads.v");
compile(
v_flags2 => ["t/t_dpi_threads_c.cpp --threads-dpi all --no-threads-coarsen"],
);
# Similar to t_dpi_threads, which confirms that Verilator can prevent a
# race between DPI import calls, this test confirms that the race exists
# and that the DPI C code can detect it under --threads-dpi all
# mode.
#
execute(
fails => 1,
);
ok(1);
1;

View File

@ -43,7 +43,10 @@ gen($Self->{top_filename}, 6000);
compile(
verilator_flags2=>["-x-assign fast --x-initial fast",
"-Wno-UNOPTTHREADS",
],
# The slow V3Partition asserts are just too slow
# in this test. They're disabled just for performance
# reasons:
"--no-debug-partition"],
);
execute(

74
test_regress/t/t_gantt.pl Executable file
View File

@ -0,0 +1,74 @@
#!/usr/bin/perl
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2003 by Wilson Snyder. This program is free software; you can
# redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
use IO::File;

# Exercises bin/verilator_gantt; only meaningful for the multithreaded
# regression, so restrict to the vltmt scenario.
scenarios(vltmt => 1);

# Any design works here, as long as it simulates for enough cycles that
# thread profiling actually collects data.
top_filename("t/t_gen_alw.v");

compile(
    v_flags2 => ["--prof-threads"]
    );

execute(
    all_run_flags => ["+verilator+prof+threads+start+2",
                      " +verilator+prof+threads+window+2",
                      " +verilator+prof+threads+file+$Self->{obj_dir}/profile_threads.dat",
    ],
    check_finished => 1,
    );

# For now, verilator_gantt still reads from STDIN
# (probably it should take a file, gantt.dat like verilator_profcfunc)
# The profiling data still goes direct to the runtime's STDOUT
# (maybe that should go to a separate file - gantt.dat?)
run(cmd => ["$ENV{VERILATOR_ROOT}/bin/verilator_gantt",
            "$Self->{obj_dir}/profile_threads.dat",
            "--vcd $Self->{obj_dir}/profile_threads.vcd",
            "> $Self->{obj_dir}/gantt.log"]);

# Expect three rows of gantt chart (one per thread), each containing an
# even number of mtask bars such as "[123--]".
my $gantt_line_ct = 0;
my $global_mtask_ct = 0;
{
    my $log_fh = IO::File->new("<$Self->{obj_dir}/gantt.log")
        or error("$! $Self->{obj_dir}/gantt.log");
    while (my $row = ($log_fh && $log_fh->getline)) {
        next if $row !~ m/^ t:/;  # keep only the per-thread timeline rows
        $gantt_line_ct++;
        # Each '[' opens one mtask bar; the digits, hyphens, or closing ']'
        # may be absent depending on exact timing.
        my @bars = split(/\[/, $row);
        shift @bars;  # discard the text preceding the first '['
        my $row_mtask_ct = scalar @bars;
        $global_mtask_ct += $row_mtask_ct;
        error("odd number of mtasks found") if $row_mtask_ct % 2 != 0;
    }
}
error("wrong number of gantt lines") if $gantt_line_ct != 3;
error("wrong number of mtasks, should be > 0") if $global_mtask_ct == 0;
print "Found $gantt_line_ct lines of gantt data with $global_mtask_ct mtasks\n"
    if $Self->{verbose};

# Diff the VCD against itself, purely to check that it parses.
vcd_identical("$Self->{obj_dir}/profile_threads.vcd", "$Self->{obj_dir}/profile_threads.vcd");

ok(1);
1;

View File

@ -117,6 +117,10 @@ compile(
);
execute(
all_run_flags => ["+verilator+prof+threads+start+100",
" +verilator+prof+threads+window+2",
" +verilator+prof+threads+file+$Self->{obj_dir}/profile_threads.dat",
],
check_finished => 1,
);

View File

@ -13,6 +13,7 @@ foreach my $prog (
"../bin/verilator",
"../bin/verilator_coverage",
"../bin/verilator_difftree",
"../bin/verilator_gantt",
"../bin/verilator_profcfunc",
) {
run(fails => 1,

View File

@ -38,7 +38,8 @@ sub checkRelativeRefs {
if ($Self->{vlt_all}) {
# We expect to combine sequent functions across multiple instances of
# l2, l3, l4, l5. If this number drops, please confirm this has not broken.
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i, 52);
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i,
($Self->{vltmt} ? 84 : 52));
# Expect absolute refs in CFuncs for t (top module) and l1 (because it
# has only one instance)

View File

@ -18,7 +18,8 @@ compile(
if ($Self->{vlt_all}) {
# Fewer optimizations than t_inst_tree_inl0_pub1 which allows
# relative CFuncs:
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i, 31);
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i,
($Self->{vltmt} ? 0 : 31));
# Should not find any 'this->' except some 'this->__VlSymsp'
my @files = `ls $Self->{obj_dir}/*.cpp`;

View File

@ -7,8 +7,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
scenarios(simulator => 1);
$Self->cfg_with_threaded or skip("No thread support");
scenarios(vltmt => 1);
top_filename("t/t_threads_counter.v");

View File

@ -7,8 +7,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
scenarios(simulator => 1);
$Self->cfg_with_threaded or skip("No thread support");
scenarios(vltmt => 1);
top_filename("t/t_threads_counter.v");

View File

@ -0,0 +1,23 @@
#!/usr/bin/perl
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2003-2009 by Wilson Snyder. This program is free software; you can
# redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
# Multithreaded-only: build the shared counter test with 4 threads and
# check the model runs to completion.
# NOTE(review): assumes this --threads 4 overrides the driver's default
# "--threads 3" for vltmt -- confirm flag precedence in the driver.
scenarios(vltmt => 1);
top_filename("t/t_threads_counter.v");
compile(
verilator_flags2 => ['--cc --threads 4'],
);
execute(
check_finished => 1,
);
ok(1);
1;

View File

@ -0,0 +1,25 @@
#!/usr/bin/perl
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2003-2009 by Wilson Snyder. This program is free software; you can
# redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
# Multithreaded-only: build with --debug-nondeterminism, which presumably
# logs "hash of shape" lines during verilation (verified by the grep on
# the compile log below) -- confirm against the option's implementation.
scenarios(vltmt => 1);
top_filename("t/t_threads_counter.v");
compile(
verilator_flags2 => ['--cc --threads 2 --debug-nondeterminism'],
);
execute(
check_finished => 1,
);
# The instrumentation must actually have run during the compile.
file_grep("$Self->{obj_dir}/vlt_compile.log", qr/hash of shape/i);
ok(1);
1;

View File

@ -13,7 +13,12 @@ my $root = "..";
compile(
# Can't use --coverage and --savable together, so cheat and compile inline
verilator_flags2 => ['--cc --coverage-toggle --coverage-line --coverage-user --trace --vpi $root/include/verilated_save.cpp'],
verilator_flags2 => ["--cc",
"--coverage-toggle --coverage-line --coverage-user",
"--trace --vpi ",
($Self->cfg_with_threaded
? "--threads 2 $root/include/verilated_threads.cpp" : ""),
"$root/include/verilated_save.cpp"],
);
execute(
@ -43,7 +48,8 @@ foreach my $dfile (glob("$Self->{obj_dir}/*.d")) {
foreach my $file (sort keys %hit) {
if (!$hit{$file}
&& $file !~ /_sc/) {
&& $file !~ /_sc/
&& ($file !~ /_thread/ || $Self->cfg_with_threaded)) {
error("Include file not covered by t_verilated_all test: ",$file);
}
}

View File

@ -7,8 +7,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
scenarios(simulator => 1);
$Self->cfg_with_threaded or skip("No thread support");
scenarios(vltmt => 1);
top_filename("t/t_verilated_all.v");