MAJOR: Add multithreaded model generation.

This commit is contained in:
Wilson Snyder 2018-07-22 20:54:28 -04:00
parent 0070520edb
commit ec8dbbffed
48 changed files with 5949 additions and 71 deletions

View File

@ -5,15 +5,18 @@ The contributors that suggested a given feature are shown in []. Thanks!
* Verilator 4.000 devel
** This is a major release. Any patches may require major rework to apply.
[Thanks everyone]
** Add multithreaded model generation.
** Add runtime arguments.
** Fix internals to be C++ null-pointer-check clean.
*** Better optimize large always block splitting, bug1244. [John Coiner]
*** Add new reloop optimization for repetitive assignment compression.
**** Fix internals to be C++ null-pointer-check clean.
**** Fix internals to avoid 'using namespace std'.
**** Fix Verilation performance issues, bug1316. [John Coiner]

View File

@ -120,6 +120,7 @@ DISTFILES_INC = $(INFOS) .gitignore Artistic COPYING COPYING.LESSER \
bin/verilator \
bin/verilator_coverage \
bin/verilator_difftree \
bin/verilator_gantt \
bin/verilator_includer \
bin/verilator_profcfunc \
doxygen-mainpage doxygen.config veripool-logo.png \
@ -154,6 +155,7 @@ DISTFILES_INC = $(INFOS) .gitignore Artistic COPYING COPYING.LESSER \
INST_PROJ_FILES = \
bin/verilator \
bin/verilator_coverage \
bin/verilator_gantt \
bin/verilator_includer \
bin/verilator_profcfunc \
include/verilated.mk \
@ -272,12 +274,12 @@ internals.pdf: internals.pod Makefile
# See uninstall also - don't put wildcards in this variable, it might uninstall other stuff
VL_INST_BIN_FILES = verilator verilator_bin verilator_bin_dbg verilator_coverage_bin_dbg \
verilator_coverage verilator_includer verilator_profcfunc
verilator_coverage verilator_gantt verilator_includer verilator_profcfunc
# Some scripts go into both the search path and pkgdatadir,
# so they can be found by the user, and under $VERILATOR_ROOT.
# See uninstall also - don't put wildcards in this variable, it might uninstall other stuff
VL_INST_MAN_FILES = verilator.1 verilator_coverage.1 verilator_profcfunc.1
VL_INST_MAN_FILES = verilator.1 verilator_coverage.1 verilator_gantt.1 verilator_profcfunc.1
VL_INST_INC_BLDDIR_FILES = \
include/verilated_config.h \
@ -295,6 +297,7 @@ installbin:
$(SHELL) ${srcdir}/mkinstalldirs $(DESTDIR)$(bindir)
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator $(DESTDIR)$(bindir)/verilator )
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_coverage $(DESTDIR)$(bindir)/verilator_coverage )
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_gantt $(DESTDIR)$(bindir)/verilator_gantt )
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_profcfunc $(DESTDIR)$(bindir)/verilator_profcfunc )
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_bin $(DESTDIR)$(bindir)/verilator_bin )
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_bin_dbg $(DESTDIR)$(bindir)/verilator_bin_dbg )

View File

@ -338,6 +338,7 @@ detailed descriptions in L</"VERILATION ARGUMENTS"> for more information.
--pipe-filter <command> Filter all input through a script
--prefix <topname> Name of top level class
--prof-cfuncs Name functions for profiling
--prof-threads Enable generating gantt chart data for threads
--private Debugging; see docs
--public Debugging; see docs
-pvalue+<name>=<value> Overwrite toplevel parameter
@ -350,6 +351,9 @@ detailed descriptions in L</"VERILATION ARGUMENTS"> for more information.
--stats-vars Provide statistics on variables
-sv Enable SystemVerilog parsing
+systemverilogext+<ext> Synonym for +1800-2017ext+<ext>
--threads <threads> Enable multithreading
--threads-dpi <mode> Enable multithreaded DPI
--threads-max-mtasks <mtasks> Tune maximum mtask partitioning
--top-module <topname> Name of top level input module
--trace Enable waveform creation
--trace-depth <levels> Depth of tracing
@ -386,6 +390,9 @@ detailed descriptions in L</"RUNTIME ARGUMENTS"> for more information.
+verilator+debug Enable debugging
+verilator+debugi+<value> Enable debugging at a level
+verilator+help Display help
+verilator+prof+threads+file+I<filename> Set profile filename
+verilator+prof+threads+start+I<value> Set profile starting point
+verilator+prof+threads+window+I<value> Set profile duration
+verilator+rand+reset+<value> Set random reset technique
+verilator+V Verbose version and config
+verilator+version Show version and exit
@ -1080,6 +1087,18 @@ Verilog module and line number the statement came from. This allows gprof
or oprofile reports to be correlated with the original Verilog source
statements. See also L<verilator_profcfunc>.
=item --prof-threads
Enable gantt chart data collection for threaded builds.
Verilator will record the start and end time of each macro-task across a
number of calls to eval. (What is a macro-task? See the Verilator internals
document.)
When profiling is enabled, the runtime will emit a blurb of profiling data
in non-human-friendly form. The C<verilator_gantt> script will transform
this into a nicer visual format and produce some related statistics.
=item --private
Opposite of --public. Is the default; this option exists for backwards
@ -1134,7 +1153,10 @@ Enable including save and restore functions in the generated model.
The user code must create a VerilatedSerialize or VerilatedDeserialze
object then calling the << or >> operators on the generated model and any
other data the process needs saved/restored. For example:
other data the process needs saved/restored. These functions are not
thread safe, and are typically called only by a main thread.
For example:
void save_model(const char* filenamep) {
VerilatedSave os;
@ -1173,6 +1195,42 @@ compatibility with other simulators.
A synonym for C<+1800-2017ext+>I<ext>.
=item --threads I<threads>
=item --no-threads
With --threads 0 or --no-threads, the default, the generated model is not
thread safe. With --threads 1, the generated model is single threaded but
may run in a multithreaded environment. With --threads N, where N >= 2, the
model is generated to run multithreaded on up to N threads. See
L</"MULTITHREADING">.
=item --threads-dpi all
=item --threads-dpi none
=item --threads-dpi pure
When using --dpi with --threads, control what DPI tasks are thread safe.
With --threads-dpi all, enable Verilator to assume all DPI imports are
threadsafe, and to use thread-local storage for communication with DPI,
potentially improving performance. Any DPI libraries need appropriate
mutexes to avoid undefined behavior.
With --threads-dpi none, Verilator assumes DPI imports are not thread safe,
and Verilator will serialize calls to DPI imports by default, potentially
harming performance.
With --threads-dpi pure, the default, Verilator assumes DPI pure imports
are threadsafe, but non-pure DPI imports are not.
=item --threads-max-mtasks I<value>
Rarely needed. When using --threads, specify the number of mtasks the
model is to be partitioned into. If unspecified, Verilator approximates a
good value.
=item --top-module I<topname>
When the input Verilog contains more than one top level module, specifies
@ -1464,6 +1522,28 @@ Enable debugging at the provided level.
Display help and exit.
=item +verilator+prof+threads+file+I<filename>
When using --prof-threads, the filename to dump to. Defaults to
"profile_threads.dat".
=item +verilator+prof+threads+start+I<value>
When using --prof-threads, Verilator will wait until $time is at this
value, then start the profiling warmup, then capturing. Generally this
should be set to some time that is well within the normal operation of the
simulation, i.e. outside of reset. If 0, the dump is disabled. Defaults to
1.
=item +verilator+prof+threads+window+I<value>
When using --prof-threads, after $time reaches
+verilator+prof+threads+start, Verilator will warm up the profiling for
this number of eval() calls, then will capture the profiling of this number
of eval() calls. Defaults to 2, which makes sense for a
single-clock-domain module where it's typical to want to capture one
posedge eval() and one negedge eval().
=item +verilator+rand+reset+I<value>
When a model was Verilated using "--x-initial unique", sets the
@ -1635,6 +1715,9 @@ compile times, and --x-assign=fast --x-initial=fast may increase the risk
of reset bugs in trade for performance; see the above documentation for
these flags.
If using Verilated multithreaded, use C<numactl> to ensure you are using
non-conflicting hardware resources. See L</"MULTITHREADING">.
Minor Verilog code changes can also give big wins. You should not have any
UNOPTFLAT warnings from Verilator. Fixing these warnings can result in
huge improvements; one user fixed their one UNOPTFLAT warning by making a
@ -2176,6 +2259,89 @@ the names of the .cpp files to compile in from the make variables generated
in obj_dir/Vour_classes.mk.
=head1 MULTITHREADING
Verilator experimentally supports multithreading.
With --no-threads, the default, the model is not thread safe, and any use
of more than one thread calling into one or even different Verilated models
may result in unpredictable behavior. This gives the highest single thread
performance.
With --threads 1, the generated model is single threaded, however the
support libraries are multithread safe. This allows different
instantiations of model(s) to potentially each be run under a different
thread. All threading is the responsibility of the user's C++ testbench.
With --threads N, where N is at least 2, the generated model will be
designed to run in parallel on N threads. The thread calling eval()
provides one of those threads, and the generated model will create and
manage the other N-1 threads. It's the client's responsibility not to
oversubscribe the available CPU cores. Under CPU oversubscription, the
Verilated model should not livelock nor deadlock, however, you can expect
performance to be far worse than it would be with proper stoichiometry of
threads and CPU cores.
The remainder of this section describes behavior with --threads 1 or
--threads N (not --no-threads).
VL_THREADED is defined when compiling a threaded Verilated module, causing
the Verilated support classes to become threadsafe.
The thread used for constructing a model must be the same thread that
calls eval() into the model, this is called the "eval thread". The thread
used to perform certain global operations such as saving and tracing must
be done by a "main thread". In most cases the eval thread and main thread
are the same thread (i.e. the user's top C++ testbench runs on a single
thread), but this is not required.
When running a multithreaded model, the default Linux task scheduler often
works against the model, by assuming threads are short lived, and thus
often schedules threads using multiple hyperthreads within the same
physical core. For best performance use the C<numactl> program to (when the
threading count fits) select unique physical cores on the same socket. For
example, if a model was Verilated with "--threads 4", we consult
egrep 'processor|physical id|core id' /proc/cpuinfo
To select cores 0, 1, 2, and 3 that are all located on the same socket (0)
but different physical cores. (Also useful is "numactl --hardware", or
C<lscpu>, but those don't show Hyperthreading cores.) Then we execute
numactl -m 0 -C 0,1,2,3 -- verilated_executable_name
This will limit memory to socket 0, and threads to cores 0, 1, 2, 3,
(presumably on socket 0) optimizing performance. Of course this must be
adjusted if you want another simulator using e.g. socket 1, or if you
Verilated with a different number of threads. To see what CPUs are
actually used, use --prof-threads.
=head2 Multithreaded Verilog and Library Support
$display/$stop/$finish are delayed until the end of an eval() call in order
to maintain ordering between threads. This may result in additional tasks
completing after the $stop or $finish.
If using --coverage, the coverage routines are fully thread safe.
If using --dpi, Verilator assumes pure DPI imports are thread safe,
balancing performance versus safety. See --threads-dpi.
If using --savable, the save/restore classes are not multithreaded and
must be called only by the eval thread.
If using --sc, the SystemC kernel is not thread safe, therefore the eval
thread and main thread must be the same.
If using --trace, the tracing classes must be constructed and called from
the main thread.
If using --vpi, since SystemVerilog VPI was not architected by IEEE to be
multithreaded, Verilator requires all VPI calls are only made from the main
thread.
=back
=head1 CONFIGURATION FILES
In addition to the command line, warnings and other features may be
@ -3636,6 +3802,21 @@ section for more details.
Ignoring this warning will only slow simulations, it will simulate
correctly.
=item UNOPTTHREADS
Warns that the thread scheduler was unable to partition the design to fill
the requested number of threads.
One workaround is to request fewer threads with C<--threads>.
Another possible workaround is to allow more MTasks in the runtime, by
increasing the value of --threads-max-mtasks. More MTasks will result in
more communication and synchronization overhead at runtime; the scheduler
attempts to minimize the number of MTasks for this reason.
Ignoring this warning will only slow simulations, it will simulate
correctly.
=item UNPACKED
Warns that unpacked structs and unions are not supported.
@ -4185,6 +4366,8 @@ performance gain.
In 2009, major SystemVerilog and DPI language support was added.
In 2018, Verilator 4.000 was released with multithreaded support.
Currently, various language features and performance enhancements are added
as the need arises. Verilator is now about 3x faster than in 2002, and is
faster than many popular commercial simulators.
@ -4282,7 +4465,7 @@ License Version 2.0.
=head1 SEE ALSO
L<verilator_coverage>, L<verilator_profcfunc>, L<make>,
L<verilator_coverage>, L<verilator_gantt>, L<verilator_profcfunc>, L<make>,
L<verilator --help> which is the source for this document,

559
bin/verilator_gantt Executable file
View File

@ -0,0 +1,559 @@
: # -*-Mode: perl;-*- use perl, wherever it is
eval 'exec perl -wS $0 ${1+"$@"}'
    if 0;
# See copyright, etc in below POD section.
######################################################################

use strict;
use warnings;
use Getopt::Long;
use IO::File;   # Bugfix: required for IO::File->new and ->autoflush below
use Pod::Usage;

our $Debug = 0;  # Replaces obsolete "use vars qw($Debug)"

my $Opt_File;                  # Input data filename; default set below
my $Opt_Time_Per_Char = 0;     # rdtsc ticks per char in gantt chart, 0=auto
my $opt_vcd = "profile_threads.vcd";  # VCD output filename; undef = no VCD

# Parsed profile data, shared with the report/VCD subs:
our %Threads;  # {thread}{start_time} = {mtask=>, end=>, cpu=>}
our %Mtasks;   # {mtask} = {elapsed=>, predict=>, end=>}
our %Global;   # args/stats/rdtsc_cycle_time from the data file

STDOUT->autoflush(1);
STDERR->autoflush(1);

Getopt::Long::config("no_auto_abbrev");
if (! GetOptions(
          "help"    => \&usage,
          "scale=i" => \$Opt_Time_Per_Char,
          "debug"   => sub { $Debug = 1; },
          "vcd=s"   => \$opt_vcd,
          "no-vcd!" => sub { $opt_vcd = undef; },
          "<>"      => \&parameter,   # First bare argument is the filename
    )) {
    die "%Error: Bad usage, try 'verilator_gantt --help'\n";
}

$Opt_File = "profile_threads.dat" if !defined $Opt_File;

process($Opt_File);
write_vcd($opt_vcd) if defined $opt_vcd;
exit(0);
#######################################################################
sub usage {
    # Display the full embedded POD documentation, then exit.
    pod2usage(-exitval => 2, -output => \*STDOUT, -verbose => 2);
    exit(1);  # Not reached; pod2usage exits with status 2
}
sub parameter {
    # Getopt::Long "<>" callback: the first bare argument names the data
    # file; any additional bare argument is a usage error.
    my ($param) = @_;
    if (defined $Opt_File) {
        die "%Error: Unknown parameter: $param\n";
    }
    $Opt_File = $param;
}
#######################################################################
sub process {
    # Top-level driver: parse the profile data file, then print the report.
    my ($filename) = @_;
    read_data($filename);
    report();
}
#######################################################################
sub read_data {
    # Parse the VLPROF lines of a profile_threads.dat file, populating the
    # %Threads, %Mtasks and %Global package structures that report() and
    # write_vcd() later read.  Unrecognized lines are ignored (printed
    # only under --debug).
    my $filename = shift;
    %Global = (rdtsc_cycle_time => 0);
    my $fh = IO::File->new($filename) or die "%Error: $! $filename,";
    while (my $line = $fh->getline) {
        if ($line =~ m/VLPROF mtask\s(\d+)\sstart\s(\d+)\send\s(\d+)\selapsed\s(\d+)\spredict_time\s(\d+)\scpu\s(\d+)\son thread (\d+)/) {
            my $mtask = $1;
            my $start = $2;
            my $end = $3;
            my $elapsed_time = $4;
            my $predict_time = $5;
            my $cpu = $6;
            my $thread = $7;
            $Threads{$thread}{$start}{mtask} = $mtask;
            $Threads{$thread}{$start}{end} = $end;
            $Threads{$thread}{$start}{cpu} = $cpu;
            if (!exists $Mtasks{$mtask}{elapsed}) {
                $Mtasks{$mtask}{elapsed} = 0;
            }
            # An mtask executes once per eval(); accumulate across evals
            $Mtasks{$mtask}{elapsed} += $elapsed_time;
            $Mtasks{$mtask}{predict} = $predict_time;
            $Mtasks{$mtask}{end} = max($Mtasks{$mtask}{end}, $end);
        }
        elsif ($line =~ /^VLPROFTHREAD/) {}
        # Bugfix: the value class was "([0-9.])" with no '+' quantifier,
        # so any multi-digit argument value (e.g. "--threads 12") failed
        # to match and was silently dropped from the report.
        elsif ($line =~ m/VLPROF arg\s+(\S+)\+([0-9.]+)\s*$/
               || $line =~ m/VLPROF arg\s+(\S+)\s+([0-9.]+)\s*$/) {
            $Global{args}{$1} = $2;
        }
        elsif ($line =~ m/VLPROF stat\s+(\S+)\s+([0-9.]+)/) {
            $Global{stats}{$1} = $2;
        }
        elsif ($line =~ /^#/) {}
        elsif ($Debug) {
            chomp $line;
            print "Unk: $line\n";
        }
        # TODO -- this is parsing text printed by a client.
        # Really, verilator proper should generate this
        # if it's useful...
        if ($line =~ m/rdtsc time = (\d+) ticks/) {
            $Global{rdtsc_cycle_time} = $1;
        }
    }
}
sub report {
    # Print the human-readable report to STDOUT: argument settings, gantt
    # graph, utilization analysis, and predict-vs-elapsed statistics.
    # Reads %Global, %Threads and %Mtasks as filled in by read_data().
print "Verilator Gantt report\n";
print "\nArgument settings:\n";
foreach my $arg (sort keys %{$Global{args}}) {
my $plus = ($arg =~ /^\+/) ? "+" : " ";
printf "  %s%s%d\n", $arg, $plus, $Global{args}{$arg};
}
my $nthreads = scalar keys %Threads;
$Global{cpus}{cpu_time} = {};
foreach my $thread (keys %Threads) {
# Accumulate how long each CPU spent executing mtasks
foreach my $start (keys %{$Threads{$thread}}) {
my $cpu = $Threads{$thread}{$start}{cpu};
my $elapsed = $Threads{$thread}{$start}{end} - $start;
$Global{cpus}{cpu_time}{$cpu} += $elapsed;
}
}
# Aggregate per-mtask totals: sum of all mtask times, last finishing
# time, and the single longest mtask (critical-path lower bound)
my $mt_mtask_time = 0;
my $long_mtask_time = 0;
my $last_end = 0;
foreach my $mtask (keys %Mtasks) {
$mt_mtask_time += $Mtasks{$mtask}{elapsed};
$last_end = max($last_end, $Mtasks{$mtask}{end});
$long_mtask_time = max($long_mtask_time, $Mtasks{$mtask}{elapsed});
}
$Global{last_end} = $last_end;
report_graph();
# If we know cycle time in the same (rdtsc) units,
# this will give us an actual utilization number,
# (how effectively we keep the cores busy.)
#
# It also gives us a number we can compare against
# serial mode, to estimate the overhead of data sharing,
# which will show up in the total elapsed time. (Overhead
# of synchronization and scheduling should not.)
print "\nAnalysis:\n";
printf "  Total threads             = %d\n", $nthreads;
printf "  Total mtasks              = %d\n", scalar (keys %Mtasks);
printf "  Total cpus used           = %d\n", scalar (keys %{$Global{cpus}});
printf "  Total yields              = %d\n", $Global{stats}{yields};
printf "  Total eval time           = %d rdtsc ticks\n", $Global{last_end};
printf "  Longest mtask time        = %d rdtsc ticks\n", $long_mtask_time;
printf "  All-thread mtask time     = %d rdtsc ticks\n", $mt_mtask_time;
# NOTE(review): divides by last_end; an empty data file (last_end == 0)
# would die here -- TODO confirm that is acceptable for this script
my $long_efficiency = $long_mtask_time/($Global{last_end});
printf "  Longest-thread efficiency = %0.1f%%\n", $long_efficiency*100;
my $mt_efficiency = $mt_mtask_time/($Global{last_end}*$nthreads);
printf "  All-thread efficiency     = %0.1f%%\n", $mt_efficiency*100;
printf "  All-thread speedup        = %0.1f\n", $mt_efficiency*$nthreads;
if ($Global{rdtsc_cycle_time} > 0) {
my $ut = $mt_mtask_time / $Global{rdtsc_cycle_time};
print "tot_mtask_cpu=$mt_mtask_time cyc=$Global{rdtsc_cycle_time} ut=$ut\n";
}
# Compare scheduler-predicted vs measured cost per mtask, in log space,
# tracking the extreme over- and under-predictions
my @p2e_ratios;
my $min_p2e = 1000000;
my $min_mtask;
my $max_p2e = -1000000;
my $max_mtask;
foreach my $mtask (sort keys %Mtasks) {
if ($Mtasks{$mtask}{elapsed} > 0) {
if ($Mtasks{$mtask}{predict} == 0) {
$Mtasks{$mtask}{predict} = 1;  # don't log(0) below
}
my $p2e_ratio = log( $Mtasks{$mtask}{predict} / $Mtasks{$mtask}{elapsed} );
#print "log(p2e $mtask) = $p2e_ratio (predict $Mtasks{$mtask}{predict}, elapsed $Mtasks{$mtask}{elapsed})\n";
push @p2e_ratios, $p2e_ratio;
if ($p2e_ratio > $max_p2e) {
$max_p2e = $p2e_ratio;
$max_mtask = $mtask;
}
if ($p2e_ratio < $min_p2e) {
$min_p2e = $p2e_ratio;
$min_mtask = $mtask;
}
}
}
print "\nStatistics:\n";
print "  min log(p2e) = $min_p2e  from mtask $min_mtask (predict $Mtasks{$min_mtask}{predict}, elapsed $Mtasks{$min_mtask}{elapsed})\n";
print "  max log(p2e) = $max_p2e  from mtask $max_mtask (predict $Mtasks{$max_mtask}{predict}, elapsed $Mtasks{$max_mtask}{elapsed})\n";
my $stddev = stddev(\@p2e_ratios);
my $mean = mean(\@p2e_ratios);
print "  mean = " . ($mean) . "\n";
print "  stddev = " . ($stddev) . "\n";
print "  e ^ stddev = " . exp($stddev). "\n";
print "\n";
}
sub report_graph {
    # Pick a time-per-character scale for the ASCII gantt graph (halving
    # until no two mtasks collide in a column, unless --scale was given),
    # then print the graph, one row per thread.
my $time_per = $Opt_Time_Per_Char;
if ($time_per == 0) {
$time_per = ($Global{last_end} / 40);  # Start with 40 columns
while ($time_per > 10) {
my ($graph, $conflicts) = _make_graph($time_per);
last if !$conflicts;
$time_per = int($time_per/2);
}
# One more step so we can fit more labels
$time_per = int($time_per/2);
}
# Rebuild the graph at the final chosen scale
my ($graph, $conflicts) = _make_graph($time_per);
print "\nThread gantt graph:\n";
print "  Legend: One character width = $time_per rdtsc ticks\n";
print "  Legend: '&' = multiple mtasks in this period (character width)\n";
my $scale = "   <-".$Global{last_end}." rdtsc total";
for (my $col = length($scale);  # -2 for '->' below
$col < ($Global{last_end}/$time_per); ++$col) {
$scale .= "-";
}
print "  $scale->\n";
foreach my $thread (sort keys %{$graph}) {
print "  t: ";
_print_graph_line($graph->{$thread}, '');
}
}
sub _make_graph {
    # Build the gantt graph at a given scale ($time_per rdtsc ticks per
    # character column).  Returns ($graph, $conflicts): $graph maps
    # thread -> array of columns, each with a {char}; $conflicts counts
    # columns where two or more mtasks started (used by report_graph()
    # to decide whether to try a finer scale).
my $time_per = shift;
my $graph = {};  # {thread}{column}{char=>'x' or chars=>#}
my $conflicts = 0;
foreach my $thread (keys %Threads) {
# Make potentially multiple characters per column
foreach my $start (sort {$a <=> $b} keys %{$Threads{$thread}}) {
my $end = $Threads{$thread}{$start}{end};
my $mtask = $Threads{$thread}{$start}{mtask};
my $cpu = $Threads{$thread}{$start}{cpu};
my $startcol = _time_col($time_per, $start);
my $endcol = _time_col($time_per, $end);
# Build a "[<cpu>----]" label sized to the mtask's scaled duration
my $label = "[";
$label .= "$cpu";  # Maybe make optional in future
my $width = $endcol - $startcol + 1;
while (length($label) < ($width-1)) {  # -1 for ']'
$label .= "-";
}
$label .= "]";
$graph->{$thread}[$startcol]{char} .= $label;
}
if ($Debug) {
print "# Multicol: "; _print_graph_line($graph->{$thread}, '|');
}
# Expand line to one char per column
for (my $col = 0; $col <= $#{$graph->{$thread}}; ++$col) {
if (my $chars = $graph->{$thread}[$col]{char}) {
# $ok: the label's extra characters can spill into the following
# columns without overwriting another mtask's start column
my $ok = 1;
for (my $coladd = 1; $coladd<length($chars); ++$coladd) {
if ($graph->{$thread}[$col + $coladd]{char}) {
$ok = 0; last;
}
}
if (!$ok) {
if ($chars =~ /\[.*\[/) {  # Two begins or more
$conflicts++;
$graph->{$thread}[$col]{char} = "&";
} else {
$graph->{$thread}[$col]{char} = "[";
}
# Fill following free columns with 'x' up to the next used one
for (my $coladd = 1; $coladd<length($chars); ++$coladd) {
if ($graph->{$thread}[$col + $coladd]{char}) {
last;
} else {
$graph->{$thread}[$col + $coladd]{char} = 'x';
}
}
} else {
# Room available: spread the label one character per column
my $coladd = 0;
foreach my $char (split //, $chars) {
$graph->{$thread}[$col+$coladd]{char} = $char;
++$coladd;
}
}
}
}
if ($Debug) {
print "# Singlcol: "; _print_graph_line($graph->{$thread}, '|');
}
}
print "# Conflicts $conflicts\n" if $Debug;
return ($graph, $conflicts);
}
sub _print_graph_line {
    # Emit one thread's row of the gantt graph, printing $sep after each
    # column's character (blank for columns with no data).
    my ($graph_thread, $sep) = @_;
    foreach my $col (0 .. $#{$graph_thread}) {
        my $cell = $graph_thread->[$col]{char};
        $cell = ' ' unless defined $cell;
        print $cell, $sep;
    }
    print "\n";
}
sub _time_col {
    # Map an absolute rdtsc time onto a zero-based graph column index,
    # given the per-column tick width.
    my ($time_per, $time) = @_;
    return int($time / $time_per);
}
#######################################################################
sub write_vcd {
    # Dump the collected schedule as a VCD (value change dump) waveform
    # file, viewable in e.g. GTKWave.  Emitted signals: per-thread mtask
    # id, per-cpu thread id, per-mtask cpu id, and overall parallelism.
    my $filename = shift;
    print "Writing $filename\n";
    my $fh = IO::File->new(">$filename") or die "%Error: $! $filename,";
    my $vcd = {values => {},  # {<time>}{<code>} = value
               sigs => {},    # {<module>}{<sig>} = code
               code => 0,     # Next free VCD identifier code
    };
    my %parallelism;  # {<time>} = delta in number of active mtasks
    foreach my $thread (keys %Threads) {
        # Signal per thread: which mtask it is executing
        my $mcode = ($vcd->{sigs}{threads}{"thread${thread}_mtask"} ||= $vcd->{code}++);
        foreach my $start (sort {$a <=> $b} keys %{$Threads{$thread}}) {
            my $end = $Threads{$thread}{$start}{end};
            my $mtask = $Threads{$thread}{$start}{mtask};
            my $cpu = $Threads{$thread}{$start}{cpu};
            $vcd->{values}{$start}{$mcode} = $mtask;
            $vcd->{values}{$end}{$mcode} = undef;
            $parallelism{$start}++;
            $parallelism{$end}--;
            # Signal per CPU: which thread is running on it
            my $ccode = $vcd->{sigs}{cpus}{"cpu${cpu}_thread"} ||= $vcd->{code}++;
            $vcd->{values}{$start}{$ccode} = $thread;
            $vcd->{values}{$end}{$ccode} = undef;
            # Signal per mtask: which CPU it ran on.  Bugfix: this was a
            # second "my $mcode", masking the thread-signal code declared
            # above ('"my" variable masks earlier declaration' warning).
            my $mtcode = $vcd->{sigs}{mtasks}{"mtask${mtask}_cpu"} ||= $vcd->{code}++;
            $vcd->{values}{$start}{$mtcode} = $cpu;
            $vcd->{values}{$end}{$mtcode} = undef;
        }
    }
    {
        # Parallelism is the running sum of the start(+1)/end(-1) deltas
        my $pcode = ($vcd->{sigs}{Stats}{"parallelism"} ||= $vcd->{code}++);
        my $value = 0;
        foreach my $time (sort {$a<=>$b} keys %parallelism) {
            $value += $parallelism{$time};
            $vcd->{values}{$time}{$pcode} = $value;
        }
    }
    # VCD header: one scope per signal group, all signals 32-bit wires
    $fh->print('$version Generated by verilator_gantt $end'."\n");
    $fh->print('$timescale 1ns $end'."\n");
    $fh->print("\n");
    my %all_codes;
    $fh->print(' $scope module gantt $end'."\n");
    foreach my $module (sort keys %{$vcd->{sigs}}) {
        $fh->printf(' $scope module %s $end'."\n", $module);
        foreach my $sig (sort keys %{$vcd->{sigs}{$module}}) {
            my $code = $vcd->{sigs}{$module}{$sig};
            $fh->printf('  $var wire 32 v%x %s [31:0] $end'."\n",
                        $code, $sig);
            $all_codes{$code} = 1;
        }
        $fh->print(' $upscope $end'."\n");
    }
    $fh->print(' $upscope $end'."\n");
    $fh->print('$enddefinitions $end'."\n");
    $fh->print("\n");
    # Value changes, in time order
    my $first = 1;
    foreach my $time (sort {$a <=> $b} keys %{$vcd->{values}}) {
        if ($first) {
            $first = 0;
            # Start with Z for any signals without time zero data; storing
            # undef creates the key so the loop below emits "bz" for it
            foreach my $code (keys %all_codes) {
                if (!defined $vcd->{values}{$time}{$code}) {
                    $vcd->{values}{$time}{$code} = undef;
                }
            }
        }
        $fh->printf("#%d\n", $time);
        foreach my $code (sort keys %{$vcd->{values}{$time}}) {
            my $value = $vcd->{values}{$time}{$code};
            if (defined $value) {
                $fh->printf("b%b v%x\n", $value, $code);
            } else {
                $fh->printf("bz v%x\n", $code);
            }
        }
    }
}
#######################################################################
# Similar to Statistics::Basic functions, but avoid a package dependency
sub max {
    # Numeric maximum of the argument list.  Mirrors the original exactly:
    # scanning stops at the first undef argument after the initial one,
    # and a leading undef is replaced by the first defined value.
    my $best = shift;
    foreach my $val (@_) {
        last if !defined $val;
        $best = $val if !defined($best) || $val > $best;
    }
    return $best;
}
sub mean {
    # Arithmetic mean of an array ref's values; undef for an empty list.
    my ($values) = @_;
    my $count = scalar @$values;
    return undef if !$count;
    my $total = 0;
    $total += $_ foreach @$values;
    return $total / $count;
}
sub stddev {
    # Population standard deviation of an array ref's values, computed as
    # sqrt(E[x^2] - E[x]^2); undef for an empty list.
    my ($values) = @_;
    my $count = scalar @$values;
    return undef if !$count;
    my ($total, $total_sq) = (0, 0);
    for my $val (@$values) {
        $total += $val;
        $total_sq += $val ** 2;
    }
    return sqrt(($total_sq / $count) - ($total / $count) ** 2);
}
#######################################################################
__END__
=pod
=head1 NAME
verilator_gantt - Create Gantt chart of multi-threaded execution
=head1 SYNOPSIS
Creates a visual representation to help analyze Verilator multithreaded
simulation performance, by showing when each macro-task starts and ends,
and showing when each thread is busy or idle.
The generated Gantt chart has time on the X-axis. Times shown are to the
scale printed, i.e. a certain amount of time for each character width. The
Y-axis shows threads, each thread's execution is shown on one line. That
line shows "[" at the position in time when it executes.
Following the "[" is the cpu number the task executed on, followed by zero
or more "-" to make the width of the characters match the scaled execution
time, followed by a "]". If the scale is too small, the cpu number and
mtask number will not be printed. If the scale is very small, a "&"
indicates multiple mtasks started at that time position.
Also creates a value change dump (VCD) format dump file which may be viewed
in a waveform viewer (e.g. C<GTKWave>). See below.
=head1 USAGE
Build with --prof-threads.
Run a sim with +verilator+prof+threads+window 2.
This will create profile_threads.dat.
Then run:
verilator_gantt profile_threads.dat
The report will be printed on standard output, this also generates
profile_threads.vcd
View profile_threads.vcd in a waveform viewer.
=head1 VCD SIGNALS
In waveforms there are the following signals. Most signals the "decimal"
format will remove the leading zeros and make the traces easier to read.
parallelism: The number of mtasks active at this time, for best performance
this will match the thread count. You may want to use an "analog step"
format to view this signal.
cpu#_thread: For the given CPU number, the thread number executing.
mtask#_cpu: For the given mtask id, the CPU it is executing on.
thread#_mtask: For the given thread number, the mtask id executing.
=head1 ARGUMENTS
=over 4
=item I<filename>
The filename to read data from, defaults to "profile_threads.dat".
=item --help
Displays this message and program version and exits.
=item --scale I<n>
On the X-axis of the generated Gantt chart, each character represents this
many time units. (On x86, time units are rdtsc ticks.) Defaults to 0,
which will automatically compute a reasonable scale where no two mtasks
need to fit into same character width's worth of scaled time.
=item --no-vcd
=item --vcd I<filename>
Set output filename for vcd dump, or disable. Default is
profile_threads.vcd.
=back
=head1 DISTRIBUTION
The latest version is available from L<http://www.veripool.org/>.
Copyright 2018-2018 by Wilson Snyder. Verilator is free software; you can
redistribute it and/or modify it under the terms of either the GNU Lesser
General Public License Version 3 or the Perl Artistic License Version 2.0.
=head1 AUTHORS
Wilson Snyder <wsnyder@wsnyder.org>
=head1 SEE ALSO
C<verilator>
=cut
######################################################################
### Local Variables:
### compile-command: "$V4/bin/verilator_gantt $V4/test_regress/obj_vltmt/t_gantt/vlt_sim.log"
### End:

View File

@ -38,6 +38,7 @@ VerilatedVoidCb Verilated::s_flushCb = NULL;
// Keep below together in one cache line
Verilated::Serialized Verilated::s_s;
Verilated::NonSerialized Verilated::s_ns;
VL_THREAD_LOCAL Verilated::ThreadLocal Verilated::t_s;
Verilated::CommandArgValues Verilated::s_args;
@ -196,6 +197,17 @@ Verilated::Serialized::Serialized() {
s_fatalOnVpiError = true; // retains old default behaviour
}
// Defaults match the documented +verilator+prof+threads+* runtime args
Verilated::NonSerialized::NonSerialized() {
s_profThreadsStart = 1;
s_profThreadsWindow = 2;
s_profThreadsFilenamep = strdup("profile_threads.dat");
}
Verilated::NonSerialized::~NonSerialized() {
if (s_profThreadsFilenamep) {
// Filename was strdup'ed (here or in profThreadsFilenamep()); free and
// null to avoid a dangling pointer
free(const_cast<char*>(s_profThreadsFilenamep)); s_profThreadsFilenamep=NULL;
}
}
//===========================================================================
// Random reset -- Only called at init time, so don't inline.
@ -1648,6 +1660,20 @@ void Verilated::fatalOnVpiError(bool flag) VL_MT_SAFE {
VerilatedLockGuard lock(m_mutex);
s_s.s_fatalOnVpiError = flag;
}
// Set profiling start time (see +verilator+prof+threads+start+); the API
// mutex serializes writers against other Verilated:: setters
void Verilated::profThreadsStart(vluint64_t flag) VL_MT_SAFE {
VerilatedLockGuard lock(m_mutex);
s_ns.s_profThreadsStart = flag;
}
// Set profiling window size in eval() calls (+verilator+prof+threads+window+)
void Verilated::profThreadsWindow(vluint64_t flag) VL_MT_SAFE {
VerilatedLockGuard lock(m_mutex);
s_ns.s_profThreadsWindow = flag;
}
// Set profiling output filename (+verilator+prof+threads+file+); takes a
// copy of the caller's string, freeing any previously-set name
void Verilated::profThreadsFilenamep(const char* flagp) VL_MT_SAFE {
VerilatedLockGuard lock(m_mutex);
if (s_ns.s_profThreadsFilenamep) free(const_cast<char*>(s_ns.s_profThreadsFilenamep));
s_ns.s_profThreadsFilenamep = strdup(flagp);
}
const char* Verilated::catName(const char* n1, const char* n2) VL_MT_SAFE {
// Returns new'ed data
@ -1800,6 +1826,15 @@ void VerilatedImp::commandArgVl(const std::string& arg) {
VL_PRINTF_MT("For help, please see 'verilator --help'\n");
VL_FATAL_MT("COMMAND_LINE", 0, "", "Exiting due to command line argument (not an error)");
}
else if (commandArgVlValue(arg, "+verilator+prof+threads+start+", value/*ref*/)) {
Verilated::profThreadsStart(atoll(value.c_str()));
}
else if (commandArgVlValue(arg, "+verilator+prof+threads+window+", value/*ref*/)) {
Verilated::profThreadsWindow(atol(value.c_str()));
}
else if (commandArgVlValue(arg, "+verilator+prof+threads+file+", value/*ref*/)) {
Verilated::profThreadsFilenamep(value.c_str());
}
else if (commandArgVlValue(arg, "+verilator+rand+reset+", value/*ref*/)) {
Verilated::randReset(atoi(value.c_str()));
}

View File

@ -344,6 +344,17 @@ class Verilated {
~Serialized() {}
} s_s;
static struct NonSerialized {  // Non-serialized information
// These are reloaded from command-line settings, so do not need to persist
// Fast path
vluint64_t s_profThreadsStart;  ///< +prof+threads starting time
vluint32_t s_profThreadsWindow;  ///< +prof+threads window size
// Slow path
const char* s_profThreadsFilenamep;  ///< +prof+threads filename (strdup'ed, freed in destructor)
NonSerialized();
~NonSerialized();
} s_ns;
// no need to be save-restored (serialized) the
// assumption is that the restore is allowed to pass different arguments
static struct CommandArgValues {
@ -409,6 +420,14 @@ public:
/// Enable/disable vpi fatal
static void fatalOnVpiError(bool flag) VL_MT_SAFE;
static bool fatalOnVpiError() VL_MT_SAFE { return s_s.s_fatalOnVpiError; }
/// --prof-threads related settings
/// Setters are defined in verilated.cpp (mutex-guarded);
/// getters read the non-serialized state directly.
static void profThreadsStart(vluint64_t flag) VL_MT_SAFE;
static vluint64_t profThreadsStart() VL_MT_SAFE { return s_ns.s_profThreadsStart; }
static void profThreadsWindow(vluint64_t flag) VL_MT_SAFE;
static vluint32_t profThreadsWindow() VL_MT_SAFE { return s_ns.s_profThreadsWindow; }
static void profThreadsFilenamep(const char* flagp) VL_MT_SAFE;
static const char* profThreadsFilenamep() VL_MT_SAFE { return s_ns.s_profThreadsFilenamep; }
/// Flush callback for VCD waves
static void flushCb(VerilatedVoidCb cb) VL_MT_SAFE;
static void flushCall() VL_MT_SAFE;

View File

@ -0,0 +1,229 @@
// -*- mode: C++; c-file-style: "cc-mode" -*-
//=============================================================================
//
// THIS MODULE IS PUBLICLY LICENSED
//
// Copyright 2012-2018 by Wilson Snyder. This program is free software;
// you can redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License Version 2.0.
//
// This is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
//=============================================================================
///
/// \file
/// \brief Thread pool for verilated modules
///
//=============================================================================
#include "verilatedos.h"
#include "verilated_threads.h"
#include <cstdio>
std::atomic<vluint64_t> VlNotification::s_yields;
VL_THREAD_LOCAL VlThreadPool::ProfileTrace* VlThreadPool::t_profilep = NULL;
//=============================================================================
// VlMTaskVertex
// Construct with the number of upstream dependencies that must
// complete before this mtask becomes ready (see signalUpstreamDone)
VlMTaskVertex::VlMTaskVertex(vluint32_t upstreamDepCount)
: m_upstreamDepsDone(0),
m_upstreamDepCount(upstreamDepCount) {
// The dependency counter is hammered on the fast path; require a
// lock-free atomic rather than silently falling back to a mutex
assert(atomic_is_lock_free(&m_upstreamDepsDone));
}
//=============================================================================
// VlWorkerThread
// Construct a worker bound to 'poolp'; spawns the underlying thread,
// which immediately enters workerLoop()
VlWorkerThread::VlWorkerThread(VlThreadPool* poolp, bool profiling)
: m_poolp(poolp)
, m_profiling(profiling)
, m_exiting(false)
// Must init this last -- after setting up fields that it might read:
, m_cthread(startWorker, this) {}
// Destroy the worker: signal exit, wake it if asleep, and join its thread
VlWorkerThread::~VlWorkerThread() {
m_exiting.store(true, std::memory_order_release);
{
// Wake under the lock so we cannot race workerLoop's
// sleep-decision critical section
VerilatedLockGuard lk(m_mutex);
if (sleeping()) {
wakeUp();
}
}
// The thread should exit; join it.
m_cthread.join();
}
// Worker thread main loop: repeatedly dequeue and execute work records
// until m_exiting is observed, sleeping on a VlNotification when idle
void VlWorkerThread::workerLoop() {
if (VL_UNLIKELY(m_profiling)) {
m_poolp->setupProfilingClientThread();
}
VlNotification alarm;
ExecRec work;
work.m_fnp = NULL;
while (1) {
bool sleep = false;
if (VL_UNLIKELY(!work.m_fnp)) {
// Look for work
VerilatedLockGuard lk(m_mutex);
if (VL_LIKELY(!m_ready.empty())) {
dequeWork(&work);
} else {
// No work available, prepare to sleep. Pass alarm/work
// into m_sleepAlarm so wakeUp will tell this function.
//
// Must modify m_sleepAlarm in the same critical section as
// the check for ready work, otherwise we could race with
// another thread enqueueing work and never be awoken.
m_sleepAlarm.first = &alarm;
m_sleepAlarm.second = &work;
sleep = true;
}
}
// Do this here, not above, to avoid a race with the destructor.
if (VL_UNLIKELY(m_exiting.load(std::memory_order_acquire)))
break;
if (VL_UNLIKELY(sleep)) {
// addTask() fills *m_sleepAlarm.second (our 'work') before
// notifying, so 'work' is valid when we wake
alarm.waitForNotification(); // ZZZzzzzz
alarm.reset();
}
if (VL_LIKELY(work.m_fnp)) {
// Execute the task outside any lock, then mark the slot empty
work.m_fnp(work.m_evenCycle, work.m_sym);
work.m_fnp = NULL;
}
}
if (VL_UNLIKELY(m_profiling)) {
m_poolp->tearDownProfilingClientThread();
}
}
// Thread entry point handed to std::thread in the constructor
void VlWorkerThread::startWorker(VlWorkerThread* workerp) {
workerp->workerLoop();
}
//=============================================================================
// VlThreadPool
// Construct a pool of 'nThreads' dedicated worker threads.
// Also sets up a profiling buffer for the calling ("main") thread,
// since it may be donated to run mtasks during eval().
VlThreadPool::VlThreadPool(int nThreads, bool profiling)
    : m_profiling(profiling) {
    // --threads N passes nThreads=N-1, as the "main" thread counts as 1
    unsigned cpus = std::thread::hardware_concurrency();
    // hardware_concurrency() returns 0 when the CPU count is not
    // computable; only warn when we have a real count to compare.
    // Cast avoids a signed/unsigned comparison warning.
    if (cpus != 0 && cpus < static_cast<unsigned>(nThreads+1)) {
        VL_PRINTF_MT("%%Warning: System has %u CPUs but model Verilated with"
                     " --threads %d; may run slow.\n", cpus, nThreads+1);
    }
    // Create'em
    for (int i=0; i<nThreads; ++i) {
        m_workers.push_back(new VlWorkerThread(this, profiling));
    }
    // Set up a profile buffer for the current thread too -- on the
    // assumption that it's the same thread that calls eval and may be
    // donated to run mtasks during the eval.
    if (VL_UNLIKELY(m_profiling)) {
        setupProfilingClientThread();
    }
}
// Destroy the pool: joins and deletes every worker, then frees the
// main thread's profiling buffer (set up in the constructor).
VlThreadPool::~VlThreadPool() {
    // Use the container's size type to avoid a signed/unsigned compare
    for (std::vector<VlWorkerThread*>::size_type i = 0; i < m_workers.size(); ++i) {
        // Each ~WorkerThread will wait for its thread to exit.
        delete m_workers[i];
    }
    if (VL_UNLIKELY(m_profiling)) {
        tearDownProfilingClientThread();
    }
}
// Free the calling thread's profile trace; inverse of
// setupProfilingClientThread().
// NOTE(review): the trace pointer is not removed from m_allProfiles,
// so it looks like profileDump must not run after a thread tears
// down -- confirm against callers.
void VlThreadPool::tearDownProfilingClientThread() {
assert(t_profilep);
delete t_profilep;
t_profilep = NULL;
}
// Allocate and register the calling thread's profile trace.
// Must be called once per executing thread before profileAppend().
void VlThreadPool::setupProfilingClientThread() {
assert(!t_profilep);
t_profilep = new ProfileTrace;
// Reserve some space in the thread-local profiling buffer;
// try not to malloc while collecting profiling.
t_profilep->reserve(4096);
{
// Registration in the shared set needs the lock; the reserve
// above deliberately happens outside it
VerilatedLockGuard lk(m_mutex);
m_allProfiles.insert(t_profilep);
}
}
// Append 'rec' to every registered thread's profile trace (e.g. a
// barrier record); serialized by m_mutex
void VlThreadPool::profileAppendAll(const VlProfileRec& rec) {
VerilatedLockGuard lk(m_mutex);
for (ProfileSet::iterator it = m_allProfiles.begin();
it != m_allProfiles.end(); ++it) {
// Every thread's profile trace gets a copy of rec.
(*it)->emplace_back(rec);
}
}
// Write all collected profile traces to 'filenamep' in VLPROFTHREAD
// text format (consumed by verilator_gantt); serialized by m_mutex
void VlThreadPool::profileDump(const char* filenamep, vluint64_t ticksElapsed) {
VerilatedLockGuard lk(m_mutex);
VL_DEBUG_IF(VL_DBG_MSGF("+prof+threads writing to '%s'\n", filenamep););
FILE* fp = fopen(filenamep, "w");
if (VL_UNLIKELY(!fp)) {
VL_FATAL_MT(filenamep, 0, "", "+prof+threads+file file not writable");
return;
}
// TODO Perhaps merge with verilated_coverage output format, so can
// have a common merging and reporting tool, etc.
// Header: format version, then the arguments/stats the run used
fprintf(fp, "VLPROFTHREAD 1.0 # Verilator thread profile dump version 1.0\n");
fprintf(fp, "VLPROF arg --threads %" VL_PRI64 "u\n",
vluint64_t(m_workers.size()+1));
fprintf(fp, "VLPROF arg +verilator+prof+threads+start+%" VL_PRI64 "u\n",
Verilated::profThreadsStart());
fprintf(fp, "VLPROF arg +verilator+prof+threads+window+%u\n",
Verilated::profThreadsWindow());
fprintf(fp, "VLPROF stat yields %" VL_PRI64 "u\n",
VlNotification::yields());
vluint32_t thread_id = 0;
for (ProfileSet::iterator pit = m_allProfiles.begin();
pit != m_allProfiles.end(); ++pit) {
// Thread ids are 1-based: incremented before the first record
++thread_id;
bool printing = false; // False while in warmup phase
for (ProfileTrace::iterator eit = (*pit)->begin();
eit != (*pit)->end(); ++eit) {
switch (eit->m_type) {
case VlProfileRec::TYPE_BARRIER:
// Barrier marks the end of warmup; records after it are emitted
printing = true;
break;
case VlProfileRec::TYPE_MTASK_RUN:
if (!printing) break;
fprintf(fp, "VLPROF mtask %d"
" start %" VL_PRI64"u end %" VL_PRI64"u elapsed %" VL_PRI64 "u"
" predict_time %u cpu %u on thread %u\n",
eit->m_mtaskId,
eit->m_startTime,
eit->m_endTime,
(eit->m_endTime - eit->m_startTime),
eit->m_predictTime,
eit->m_cpu,
thread_id);
break;
default: assert(false);
break;
}
}
}
fprintf(fp, "VLPROF stat ticks %" VL_PRI64 "u\n",
ticksElapsed);
fclose(fp);
}

313
include/verilated_threads.h Normal file
View File

@ -0,0 +1,313 @@
// -*- mode: C++; c-file-style: "cc-mode" -*-
//=============================================================================
//
// THIS MODULE IS PUBLICLY LICENSED
//
// Copyright 2012-2018 by Wilson Snyder. This program is free software;
// you can redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License Version 2.0.
//
// This is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
//=============================================================================
///
/// \file
/// \brief Thread pool and profiling for Verilated modules
///
//=============================================================================
#ifndef _VERILATED_THREADS_H_
#define _VERILATED_THREADS_H_
#include "verilatedos.h"
#include <atomic>
#include <thread>
#include <vector>
#include <set>
#include <sched.h> // For sched_getcpu()
#include "verilated.h" // for VerilatedMutex and clang annotations
// VlMTaskVertex and VlThreadpool will work with multiple symbol table types.
// Since the type is opaque to VlMTaskVertex and VlThreadPool, represent it
// as a void* here.
typedef void* VlThrSymTab;
/// One-shot notification flag used to put worker threads to sleep and
/// wake them; spin-waits with yield fallback rather than blocking in
/// the kernel.
class VlNotification {
// MEMBERS
std::atomic<bool> m_notified; // Notification pending
static std::atomic<vluint64_t> s_yields; // Statistics
public:
// CONSTRUCTORS
VlNotification()
: m_notified(false) {
assert(atomic_is_lock_free(&m_notified));
}
~VlNotification() {}
// METHODS
// Total times any waiter fell back to std::this_thread::yield()
static vluint64_t yields() { return s_yields; }
// Block until notify() has occurred, then return.
// If notify() has already occurred, return immediately.
//
// This is logically const: the object will remain in notified state
// after waitForNotification() returns, so you could notify more than
// one thread of the same event.
inline void waitForNotification() {
// Spin VL_LOCK_SPINS times, then yield the CPU and repeat
unsigned ct = 0;
while (VL_UNLIKELY(!notified())) {
VL_CPU_RELAX();
ct++;
if (VL_UNLIKELY(ct > VL_LOCK_SPINS)) {
ct = 0;
++s_yields; // Statistics
std::this_thread::yield();
}
}
}
// The 'inline' keyword here means nothing to the compiler, it's
// implicit on methods defined within the class body anyway.
//
// 'inline' is attached to this method, and others in this file,
// to remind humans that some routines in this file are called many
// times per cycle in threaded mode. Such routines should be
// inlinable; that's why they're declared in the .h and not the .cpp.
inline bool notified() {
return m_notified.load(std::memory_order_acquire);
}
// Set notified state. If state is already notified,
// it remains so.
inline void notify() {
m_notified.store(true, std::memory_order_release);
}
// Reset the state to un-notified state, which is also the
// state of a new Notification object.
inline void reset() {
m_notified.store(false, std::memory_order_relaxed);
}
};
typedef void (*VlExecFnp)(bool, VlThrSymTab);
/// Track dependencies for a single MTask.
/// Track dependencies for a single MTask.
class VlMTaskVertex {
// MEMBERS
// On even cycles, _upstreamDepsDone increases as upstream
// dependencies complete. When it reaches _upstreamDepCount,
// this MTaskVertex is ready.
//
// On odd cycles, _upstreamDepsDone decreases as upstream
// dependencies complete, and when it reaches zero this MTaskVertex
// is ready.
//
// An atomic is smaller than a mutex, and lock-free.
//
// (Why does the size of this class matter? If an mtask has many
// downstream mtasks to notify, we hope these will pack into a
// small number of cache lines to reduce the cost of pointer chasing
// during done-notification. Nobody's quantified that cost though.
// If we were really serious about shrinking this class, we could
// use 16-bit types here...)
std::atomic<vluint32_t> m_upstreamDepsDone;
const vluint32_t m_upstreamDepCount;
public:
// CONSTRUCTORS
// 'upstreamDepCount' is the number of upstream MTaskVertex's
// that must notify this MTaskVertex before it will become ready
// to run.
explicit VlMTaskVertex(vluint32_t upstreamDepCount);
~VlMTaskVertex() {}
// Upstream mtasks must call this when they complete.
// Returns true when the current MTaskVertex becomes ready to execute,
// false while it's still waiting on more dependencies.
// Counts up on even cycles, down on odd cycles, so the counter
// never needs resetting between cycles.
inline bool signalUpstreamDone(bool evenCycle) {
if (evenCycle) {
vluint32_t upstreamDepsDone
= 1 + m_upstreamDepsDone.fetch_add(1, std::memory_order_release);
assert(upstreamDepsDone <= m_upstreamDepCount);
return (upstreamDepsDone == m_upstreamDepCount);
} else {
vluint32_t upstreamDepsDone_prev
= m_upstreamDepsDone.fetch_sub(1, std::memory_order_release);
assert(upstreamDepsDone_prev > 0);
return (upstreamDepsDone_prev == 1);
}
}
// True when all upstream dependencies for this cycle have completed
inline bool areUpstreamDepsDone(bool evenCycle) const {
vluint32_t target = evenCycle ? m_upstreamDepCount : 0;
return m_upstreamDepsDone.load(std::memory_order_acquire) == target;
}
// Spin until areUpstreamDepsDone() holds
inline void waitUntilUpstreamDone(bool evenCycle) const {
while (VL_UNLIKELY(!areUpstreamDepsDone(evenCycle))) {
VL_CPU_RELAX();
}
}
};
// Profiling support
/// One profiling event: either an mtask execution (start/end/cpu) or a
/// barrier marking the end of the warmup phase. Appended to per-thread
/// traces with minimal overhead; see VlThreadPool::profileDump.
class VlProfileRec {
protected:
    friend class VlThreadPool;
    enum VlProfileE {
        TYPE_MTASK_RUN,
        TYPE_BARRIER
    };
    VlProfileE m_type;         // Record type
    vluint32_t m_mtaskId;      // Mtask we're logging
    vluint32_t m_predictTime;  // How long scheduler predicted would take
    vluint64_t m_startTime;    // Tick at start of execution
    vluint64_t m_endTime;      // Tick at end of execution
    unsigned m_cpu;            // Execution CPU number (at start anyways)
public:
    class Barrier {};
    // Default-construct an uninitialized record; caller must fill it
    // via startRecord()/endRecord() (see VlThreadPool::profileAppend)
    VlProfileRec() {}
    // Construct a barrier record
    explicit VlProfileRec(Barrier) {
        m_type = TYPE_BARRIER;
        m_mtaskId = 0;
        m_predictTime = 0;
        m_startTime = 0;
        m_endTime = 0;  // Was previously left uninitialized; zero for determinism
        m_cpu = sched_getcpu();
    }
    // Begin logging an mtask execution; 'time' in profiler ticks
    void startRecord(vluint64_t time, uint32_t mtask, uint32_t predict) {
        m_type = VlProfileRec::TYPE_MTASK_RUN;
        m_mtaskId = mtask;
        m_predictTime = predict;
        m_startTime = time;
        m_cpu = sched_getcpu();
    }
    // Finish logging an mtask execution
    void endRecord(vluint64_t time) {
        m_endTime = time;
    }
};
class VlThreadPool;
/// A single pooled worker thread: owns a ready-queue of ExecRecs and a
/// sleep/wake handshake (m_sleepAlarm) with enqueuers.
class VlWorkerThread {
private:
// TYPES
struct ExecRec {
VlExecFnp m_fnp; // Function to execute
VlThrSymTab m_sym; // Symbol table to execute
bool m_evenCycle; // Even/odd for flag alternation
ExecRec() : m_fnp(NULL), m_sym(NULL), m_evenCycle(false) {}
ExecRec(VlExecFnp fnp, bool evenCycle, VlThrSymTab sym)
: m_fnp(fnp), m_sym(sym), m_evenCycle(evenCycle) {}
};
// MEMBERS
VerilatedMutex m_mutex;
// Why a vector? We expect the pending list to be very short, typically
// 0 or 1 or 2, so popping from the front shouldn't be
// expensive. Revisit if we ever have longer queues...
std::vector<ExecRec> m_ready VL_GUARDED_BY(m_mutex);
VlThreadPool* m_poolp; // Our associated thread pool
// If values stored are non-NULL, the thread is asleep pending new
// work. If the thread is not asleep, both parts of m_sleepAlarm must
// be NULL.
std::pair<VlNotification*, ExecRec*> m_sleepAlarm VL_GUARDED_BY(m_mutex);
bool m_profiling; // Is profiling enabled?
std::atomic<bool> m_exiting; // Worker thread should exit
std::thread m_cthread; // Underlying C++ thread record
VL_UNCOPYABLE(VlWorkerThread);
public:
// CONSTRUCTORS
explicit VlWorkerThread(VlThreadPool* poolp, bool profiling);
~VlWorkerThread();
// METHODS
// Pop the front of the ready list into *workp; caller holds m_mutex
inline void dequeWork(ExecRec* workp) VL_REQUIRES(m_mutex) {
// As noted above this is inefficient if our ready list is ever
// long (but it shouldn't be)
*workp = m_ready.front();
m_ready.erase(m_ready.begin());
}
// Clear the sleep handshake and notify the sleeping worker;
// caller holds m_mutex and must have filled *m_sleepAlarm.second first
inline void wakeUp() VL_REQUIRES(m_mutex) {
VlNotification* notifyp = m_sleepAlarm.first;
m_sleepAlarm.first = NULL; // NULL+NULL means wake
m_sleepAlarm.second = NULL;
notifyp->notify();
}
// True if the worker is parked in workerLoop waiting for work
inline bool sleeping() VL_REQUIRES(m_mutex) {
return (m_sleepAlarm.first != NULL);
}
// Enqueue a task; if the worker is asleep, hand the task to it
// directly and wake it
inline void addTask(VlExecFnp fnp, bool evenCycle, VlThrSymTab sym) {
VerilatedLockGuard lk(m_mutex);
m_ready.emplace_back(fnp, evenCycle, sym);
if (VL_LIKELY(sleeping())) { // Generally queue is waiting for work
// Awaken thread
dequeWork(m_sleepAlarm.second);
wakeUp();
}
}
void workerLoop();
static void startWorker(VlWorkerThread* workerp);
};
/// Pool of VlWorkerThreads plus per-thread profiling trace management.
class VlThreadPool {
    // TYPES
    typedef std::vector<VlProfileRec> ProfileTrace;
    typedef std::set<ProfileTrace*> ProfileSet;
    // MEMBERS
    std::vector<VlWorkerThread*> m_workers;  // our workers
    bool m_profiling;  // is profiling enabled?
    // Support profiling -- we can append records of profiling events
    // to this vector with very low overhead, and then dump them out
    // later. This prevents the overhead of printf/malloc/IO from
    // corrupting the profiling data. It's super cheap to append
    // a VlProfileRec struct on the end of a pre-allocated vector;
    // this is the only cost we pay in real-time during a profiling cycle.
    static VL_THREAD_LOCAL ProfileTrace* t_profilep;
    ProfileSet m_allProfiles VL_GUARDED_BY(m_mutex);
    VerilatedMutex m_mutex;
public:
    // CONSTRUCTORS
    // Construct a thread pool with 'nThreads' dedicated threads. The thread
    // pool will create these threads and make them available to execute tasks
    // via this->workerp(index)->addTask(...)
    VlThreadPool(int nThreads, bool profiling);
    ~VlThreadPool();
    // METHODS
    /// Number of pooled workers (excludes the "main" eval thread)
    inline int numThreads() const {
        return static_cast<int>(m_workers.size());
    }
    /// Return worker at 'index'; requires 0 <= index < numThreads()
    inline VlWorkerThread* workerp(int index) {
        assert(index >= 0);
        // Cast avoids a signed/unsigned comparison warning
        assert(index < static_cast<int>(m_workers.size()));
        return m_workers[index];
    }
    /// Append an empty record to this thread's trace and return it for
    /// the caller to fill; requires setupProfilingClientThread() first
    inline VlProfileRec* profileAppend() {
        t_profilep->emplace_back();
        return &(t_profilep->back());
    }
    void profileAppendAll(const VlProfileRec& rec);
    void profileDump(const char* filenamep, vluint64_t ticksElapsed);
    // In profiling mode, each executing thread must call
    // this once to setup profiling state:
    void setupProfilingClientThread();
    void tearDownProfilingClientThread();
private:
    VL_UNCOPYABLE(VlThreadPool);
};
#endif

View File

@ -155,6 +155,221 @@ provided and documented in C<V3GraphAlg.cpp>.
=back
=head2 Multithreaded Mode
In --threads mode, the frontend of the Verilator pipeline is the same as
serial mode, up until V3Order.
V3Order builds a fine-grained, statement-level dependency graph that governs
the ordering of code within a single eval() call. In serial mode, that
dependency graph is used to order all statements into a total serial order.
In parallel mode, the same dependency graph is the starting point for a
partitioner (V3Partition).
The partitioner's goal is to coarsen the fine-grained DAG into a coarser
DAG, while maintaining as much available parallelism as possible. Often the
partitioner can transform an input graph with millions of nodes into a
coarsened execution graph with a few dozen nodes, while maintaining enough
parallelism to take advantage of a modern multicore CPU. Runtime
synchronization cost is not prohibitive with so few nodes.
=head3 Partitioning
Our partitioner is similar to the one Vivek Sarkar described in his 1989
paper "Partitioning and Scheduling Parallel Programs for Multiprocessors".
Let's define some terms:
=over 4
=item C<Par Factor>
The available parallelism or "par-factor" of a DAG is the total cost to
execute all nodes, divided by the cost to execute the longest critical path
through the graph. This is the speedup you would get from running the graph
in parallel, if given infinite CPU cores available and communication and
synchronization are zero.
=item C<Macro Task>
When the partitioner coarsens the graph, it combines nodes together. Each
fine-grained node represents an atomic "task"; combined nodes in the
coarsened graph are "macro-tasks". This term comes from Sarkar. Each
macro-task executes from start to end on one processor, without any
synchronization to any other macro-task during its
execution. (Synchronization only happens before the macro-task begins or
after it ends.)
=item C<Edge Contraction>
Our partitioner, like Sarkar's, primarily relies on "edge contraction" to
coarsen the graph. It starts with one macro-task per atomic task and
iteratively combines pairs of edge-connected macro-tasks.
=item C<Local Critical Path>
Each node in the graph has a "local" critical path. That's the critical
path from the start of the graph to the start of the node, plus the node's
cost, plus the critical path from the end of the node to the end of the
graph.
=back
Sarkar calls out an important trade-off: coarsening the graph reduces
runtime synchronization overhead among the macro-tasks, but it tends to
increase the critical path through the graph and thus reduces par-factor.
Sarkar's partitioner, and ours, chooses pairs of macro-tasks to merge such
that the growth in critical path is minimized. Each candidate merge would
result in a new node, which would have some local critical path. We choose
the candidate that would produce the shortest local critical path. Repeat
until par-factor falls to a target threshold. It's a greedy algorithm, and
it's not guaranteed to produce the best partition (which Sarkar proves is
NP-hard).
=head3 Estimating Logic Costs
To compute the cost of any given path through the graph, Verilator
estimates an execution cost for each task. Each macro-task has an execution
cost which is simply the sum of its tasks' costs. We assume that
communication overhead and synchronization overhead are zero, so the cost
of any given path through the graph is simply the sum of macro-task
execution costs. Sarkar does almost the same thing, except that he has
nonzero estimates for synchronization costs.
Verilator's cost estimates are assigned by the InstrCountCostVisitor. This
class is perhaps the most fragile piece of the multithread implementation.
It's easy to have a bug where you count something cheap (eg. accessing one
element of a huge array) as if it were expensive (eg. by counting it as if
it were an access to the entire array.) Even without such gross bugs, the
estimates this produces are only loosely predictive of actual runtime cost.
Multithreaded performance would be better with better runtime cost
estimates. This is an area to improve.
=head3 Scheduling Macro-Tasks at Runtime
After coarsening the graph, we must schedule the macro-tasks for runtime.
Sarkar describes two options: you can dynamically schedule tasks at
runtime, with a runtime graph follower. Sarkar calls this the
"macro-dataflow model." Verilator does not support this; early experiments
with this approach had poor performance.
The other option is to statically assign macro-tasks to threads, with each
thread running its macro-tasks in a static order. Sarkar describes this in
Chapter 5. Verilator takes this static approach. The only dynamic aspect is
that each macro task may block before starting, to wait until its
prerequisites on other threads have finished.
The synchronization cost is cheap if the prereqs are done. If they're not,
fragmentation (idle CPU cores waiting) is possible. This is the major
source of overhead in this approach. The --prof-threads switch and the
C<verilator_gantt> script can visualize the time lost to such
fragmentation.
=head3 Locating Variables for Best Spatial Locality
After scheduling all code, we attempt to locate variables in memory such
that variables accessed by a single macro-task are close together in
memory. This provides "spatial locality" -- when we pull in a 64-byte
cache line to access a 2-byte variable, we want the other 62 bytes to be
ones we'll also likely access soon, for best cache performance.
This turns out to be critical for performance. It should allow Verilator
to scale to very large models. We don't rely on our working set fitting
in any CPU cache; instead we essentially "stream" data into caches from
memory. It's not literally streaming, where the address increases
monotonically, but it should have similar performance characteristics,
so long as each macro-task's dataset fits in one core's local caches.
To achieve spatial locality, we tag each variable with the set of
macro-tasks that access it. Let's call this set the "footprint" of that
variable. The variables in a given module have a set of footprints. We can
order those footprints to minimize the distance between them (distance is
the number of macro-tasks that are different across any two footprints) and
then emit all variables into the struct in ordered-footprint order.
The footprint ordering is literally the traveling salesman problem, and we
use a TSP-approximation algorithm to get close to an optimal sort.
This is an old idea. Simulators designed at DEC in the early 1990s used
similar techniques to optimize both single-thread and multi-thread modes.
(Verilator does not optimize variable placement for spatial locality in
serial mode; that is a possible area for improvement.)
=head3 Improving Multithreaded Performance Further (a TODO list)
=over 4
=item C<Wave Scheduling>
To allow the verilated model to run in parallel with the testbench, it
might be nice to support "wave" scheduling, in which work on a cycle begins
before eval() is called or continues after eval() returns. For now all
work on a cycle happens during the eval() call, leaving Verilator's threads
idle while the testbench (everything outside eval()) is working. This would
involve fundamental changes within the partitioner, however, it's probably
the best bet for hiding testbench latency.
=item C<Efficient Dynamic Scheduling>
To scale to more than a few threads, we may revisit a fully dynamic
scheduler. For large (>16 core) systems it might make sense to dedicate an
entire core to scheduling, so that scheduler data structures would fit in
its L1 cache and thus the cost of traversing priority-ordered ready lists
would not be prohibitive.
=item C<Static Scheduling with Runtime Repack>
We could modify the static scheduling approach by gathering actual
macro-task execution times at run time, and dynamically re-packing the
macro-tasks into the threads also at run time. Say, re-pack once every
10,000 cycles or something. This has the potential to do better than our
static estimates about macro-task run times. It could potentially react to
CPU cores that aren't performing equally, due to NUMA or thermal throttling
or nonuniform competing memory traffic or whatever.
=item C<Clock Domain Balancing>
Right now Verilator makes no attempt to balance clock domains across
macro-tasks. For a multi-domain model, that could lead to bad gantt chart
fragmentation. This could be improved if it's a real problem in practice.
=item C<Other Forms of MTask Balancing>
The largest source of runtime overhead is idle CPUs, which happens due to
variance between our predicted runtime for each MTask and its actual
runtime. That variance is magnified if MTasks are homogeneous, containing
similar repeating logic which was generally close together in source code
and which is still packed together even after going through Verilator's
digestive tract.
If Verilator could avoid doing that, and instead would take source logic
that was close together and distribute it across MTasks, that would
increase the diversity of any given MTask, and this should reduce variance
in the cost estimates.
One way to do that might be to make various "tie breaker" comparison
routines in the sources to rely more heavily on randomness, and generally
try harder not to keep input nodes together when we have the option to
scramble things.
=item C<Performance Regression>
It would be nice if we had a regression of large designs, with some
diversity of design styles, to test on both single- and multi-threaded
modes. This would help to avoid performance regressions, and also to
evaluate the optimizations while minimizing the impact of parasitic noise.
=item C<Per-Instance Classes>
If we have multiple instances of the same module, and they partition
differently (likely; we make no attempt to partition them the same) then
the variable sort will be suboptimal for either instance. A possible
improvement would be to emit a unique class for each instance of a module,
and sort its variables optimally for that instance's code stream.
=back
=head2 Verilated Flow
The evaluation loop outputted by Verilator is designed to allow a single

View File

@ -64,6 +64,7 @@ sub test {
run("test -e $prefix/bin/verilator");
run("test -e $prefix/bin/verilator_bin");
run("test -e $prefix/bin/verilator_bin_dbg");
run("test -e $prefix/bin/verilator_gantt");
run("test -e $prefix/bin/verilator_profcfunc");
}

View File

@ -217,6 +217,7 @@ RAW_OBJS = \
V3Order.o \
V3Os.o \
V3Param.o \
V3Partition.o \
V3PreShell.o \
V3Premit.o \
V3Reloop.o \

View File

@ -29,16 +29,24 @@
#include <vector>
#include <cmath>
#include <map>
#include VL_INCLUDE_UNORDERED_SET
#include "V3Ast__gen_classes.h" // From ./astgen
// Things like:
// class V3AstNode;
// Forward declarations
class V3Graph;
class ExecMTask;
// Hint class so we can choose constructors
class VFlagLogicPacked {};
class VFlagBitPacked {};
class VFlagChildDType {}; // Used by parser.y to select constructor that sets childDType
// Used as key for another map, needs operator<, hence not an unordered_set
typedef std::set<int> MTaskIdSet; // Set of mtaskIds for Var sorting
//######################################################################
// For broken() function, return error string if have a match

View File

@ -31,6 +31,8 @@
#include "V3Ast.h"
#include "V3File.h"
#include "V3Global.h"
#include "V3Graph.h"
#include "V3PartitionGraph.h" // Just for mtask dumping
//======================================================================
// Special methods
@ -151,22 +153,26 @@ AstNodeBiop* AstEqWild::newTyped(FileLine* fl, AstNode* lhsp, AstNode* rhsp) {
}
}
// Construct; allocates the owned dependency graph (freed in destructor)
AstExecGraph::AstExecGraph(FileLine* fileline)
: AstNode(fileline) {
m_depGraphp = new V3Graph;
}
// Destruct; frees the owned dependency graph allocated in the constructor
AstExecGraph::~AstExecGraph() {
delete m_depGraphp; VL_DANGLING(m_depGraphp);
}
// True if signal is public: explicitly marked, or --public-all set
// (temporaries and genvars are never forced public)
bool AstVar::isSigPublic() const {
return (m_sigPublic || (v3Global.opt.allPublic() && !isTemp() && !isGenVar()));
}
// SystemC quad-word signal (not represented as sc_bv/sc_biguint)
bool AstVar::isScQuad() const {
return (isSc() && isQuad() && !isScBv() && !isScBigUint());
}
// SystemC signal wide enough (or attributed) to use sc_bv
bool AstVar::isScBv() const {
return ((isSc() && width() >= v3Global.opt.pinsBv()) || m_attrScBv);
}
// SystemC signal emitted as sc_uint (2..64 bits, when enabled, not sc_bv)
bool AstVar::isScUint() const {
return ((isSc() && v3Global.opt.pinsScUint() && width() >= 2 && width() <= 64) && !isScBv());
}
// SystemC signal emitted as sc_biguint (65..512 bits, when enabled, not sc_bv)
bool AstVar::isScBigUint() const {
return ((isSc() && v3Global.opt.pinsScBigUint() && width() >= 65 && width() <= 512) && !isScBv());
}
@ -441,6 +447,16 @@ AstVar* AstVar::scVarRecurse(AstNode* nodep) {
return NULL;
}
// Return a debug string listing all mtask ids that access this var
string AstVar::mtasksString() const {
std::ostringstream os;
os<<" all: ";
for (MTaskIdSet::const_iterator it = m_mtaskIds.begin();
it != m_mtaskIds.end(); ++it) {
os<<*it<<" ";
}
return os.str();
}
AstNodeDType* AstNodeDType::dtypeDimensionp(int dimension) {
// dimension passed from AstArraySel::dimension
// Dimension 0 means the VAR itself, 1 is the closest SEL to the AstVar,
@ -970,6 +986,11 @@ void AstSliceSel::dump(std::ostream& str) {
str<<" decl"<<declRange();
}
}
// Dump node plus its associated ExecMTask's info for debug
void AstMTaskBody::dump(std::ostream& str) {
this->AstNode::dump(str);
str<<" ";
m_execMTaskp->dump(str);
}
void AstTypeTable::dump(std::ostream& str) {
this->AstNode::dump(str);
for (int i=0; i<(int)(AstBasicDTypeKwd::_ENUM_MAX); ++i) {

View File

@ -1124,6 +1124,7 @@ private:
bool m_noSubst:1; // Do not substitute out references
bool m_trace:1; // Trace this variable
AstVarAttrClocker m_attrClocker;
MTaskIdSet m_mtaskIds; // MTaskID's that read or write this var
void init() {
m_input=false; m_output=false; m_tristate=false; m_declOutput=false;
@ -1323,6 +1324,10 @@ public:
if (varType()==AstVarType::INPUT || varType()==AstVarType::OUTPUT) m_varType = AstVarType::WIRE;
}
static AstVar* scVarRecurse(AstNode* nodep);
void addProducingMTaskId(int id) { m_mtaskIds.insert(id); }
void addConsumingMTaskId(int id) { m_mtaskIds.insert(id); }
const MTaskIdSet& mtaskIds() const { return m_mtaskIds; }
string mtasksString() const;
};
class AstDefParam : public AstNode {
@ -5698,6 +5703,44 @@ public:
AstNode* bodysp() const { return op1p(); } // op1= expressions to print
};
class AstMTaskBody : public AstNode {
// Hold statements for each MTask
private:
ExecMTask* m_execMTaskp; // Back-pointer to the ExecMTask this body implements
public:
explicit AstMTaskBody(FileLine* flp)
: AstNode(flp)
, m_execMTaskp(NULL) {}
ASTNODE_NODE_FUNCS(MTaskBody);
// Broken until execMTaskp() has been set (it starts NULL)
virtual const char* broken() const { BROKEN_RTN(!m_execMTaskp); return NULL; }
AstNode* stmtsp() const { return op1p(); } // op1 = statement list
void addStmtsp(AstNode* nodep) { addOp1p(nodep); }
ExecMTask* execMTaskp() const { return m_execMTaskp; }
void execMTaskp(ExecMTask* execMTaskp) { m_execMTaskp = execMTaskp; }
virtual void dump(std::ostream& str=std::cout);
};
class AstExecGraph : public AstNode {
// For parallel execution, this node contains a dependency graph. Each
// node in the graph is an ExecMTask, which contains a body for the
// mtask, which contains a set of AstActive's, each of which calls a
// leaf AstCFunc. whew!
//
// The mtask bodies are also children of this node, so we can visit
// them without traversing the graph (it's not always needed to
// traverse the graph.)
private:
V3Graph *m_depGraphp; // contains ExecMTask's; owned (new'ed in ctor, deleted in dtor)
public:
explicit AstExecGraph(FileLine* fileline);
ASTNODE_NODE_FUNCS_NO_DTOR(ExecGraph)
virtual ~AstExecGraph();
// Broken if the owned graph pointer was lost
virtual const char* broken() const { BROKEN_RTN(!m_depGraphp); return NULL; }
const V3Graph* depGraphp() const { return m_depGraphp; }
V3Graph* mutableDepGraphp() { return m_depGraphp; }
void addMTaskBody(AstMTaskBody* bodyp) { addOp1p(bodyp); } // op1 = mtask bodies
};
class AstSplitPlaceholder : public AstNode {
public:
// Dummy node used within V3Split; never exists outside of V3Split.
@ -5749,12 +5792,14 @@ private:
AstTypeTable* m_typeTablep; // Reference to top type table, for faster lookup
AstPackage* m_dollarUnitPkgp;
AstCFunc* m_evalp; // The '_eval' function
AstExecGraph* m_execGraphp; // Execution MTask graph for threads>1 mode
public:
AstNetlist()
: AstNode(new FileLine("AstRoot",0))
, m_typeTablep(NULL)
, m_dollarUnitPkgp(NULL)
, m_evalp(NULL) { }
, m_evalp(NULL)
, m_execGraphp(NULL) { }
ASTNODE_NODE_FUNCS(Netlist)
virtual const char* broken() const {
BROKEN_RTN(m_dollarUnitPkgp && !m_dollarUnitPkgp->brokeExists());
@ -5784,6 +5829,8 @@ public:
return m_dollarUnitPkgp; }
AstCFunc* evalp() const { return m_evalp; }
void evalp(AstCFunc* evalp) { m_evalp = evalp; }
AstExecGraph* execGraphp() const { return m_execGraphp; }
void execGraphp(AstExecGraph* graphp) { m_execGraphp = graphp; }
};
//######################################################################

View File

@ -68,6 +68,7 @@ private:
AstCFunc* m_settleFuncp; // Top settlement function we are creating
AstSenTree* m_lastSenp; // Last sensitivity match, so we can detect duplicates.
AstIf* m_lastIfp; // Last sensitivity if active to add more under
AstMTaskBody* m_mtaskBodyp; // Current mtask body
// METHODS
VL_DEBUG_FUNC; // Declare debug()
@ -338,6 +339,30 @@ private:
// Only empty blocks should be leftover on the non-top. Killem.
if (nodep->stmtsp()) nodep->v3fatalSrc("Non-empty lower active");
nodep->unlinkFrBack()->deleteTree(); VL_DANGLING(nodep);
} else if (m_mtaskBodyp) {
UINFO(4," TR ACTIVE "<<nodep<<endl);
AstNode* stmtsp = nodep->stmtsp()->unlinkFrBackWithNext();
if (nodep->hasClocked()) {
if (nodep->hasInitial()) nodep->v3fatalSrc("Initial block should not have clock sensitivity");
if (m_lastSenp && nodep->sensesp()->sameTree(m_lastSenp)) {
UINFO(4," sameSenseTree\n");
} else {
clearLastSen();
m_lastSenp = nodep->sensesp();
// Make a new if statement
m_lastIfp = makeActiveIf(m_lastSenp);
m_mtaskBodyp->addStmtsp(m_lastIfp);
}
// Move statements to if
m_lastIfp->addIfsp(stmtsp);
} else if (nodep->hasInitial() || nodep->hasSettle()) {
nodep->v3fatalSrc("MTask should not include initial/settle logic.");
} else {
// Combo logic. Move statements to mtask func.
clearLastSen();
m_mtaskBodyp->addStmtsp(stmtsp);
}
nodep->unlinkFrBack()->deleteTree(); VL_DANGLING(nodep);
} else {
UINFO(4," ACTIVE "<<nodep<<endl);
AstNode* stmtsp = nodep->stmtsp()->unlinkFrBackWithNext();
@ -372,6 +397,20 @@ private:
nodep->unlinkFrBack()->deleteTree(); VL_DANGLING(nodep);
}
}
// Process each MTask body under the ExecGraph, then relocate the graph
// into the _eval loop. While m_mtaskBodyp is set, the AstActive visitor
// routes statements into the current mtask body instead of the serial
// eval/initial/settle functions.
virtual void visit(AstExecGraph* nodep) {
for (m_mtaskBodyp = VN_CAST(nodep->op1p(), MTaskBody);
m_mtaskBodyp;
m_mtaskBodyp = VN_CAST(m_mtaskBodyp->nextp(), MTaskBody)) {
// Reset duplicate-sensitivity tracking so an AstIf from one mtask
// body is never reused in another
clearLastSen();
iterate(m_mtaskBodyp);
}
clearLastSen();
// Move the ExecGraph into _eval. Its location marks the
// spot where the graph will execute, relative to other
// (serial) logic in the cycle.
nodep->unlinkFrBack();
addToEvalLoop(nodep);
}
//--------------------
// Default: Just iterate
@ -391,6 +430,7 @@ public:
m_lastSenp = NULL;
m_lastIfp = NULL;
m_scopep = NULL;
m_mtaskBodyp = NULL;
//
iterate(nodep);
// Allow downstream modules to find _eval()

View File

@ -34,6 +34,8 @@
#include "V3EmitC.h"
#include "V3EmitCBase.h"
#include "V3Number.h"
#include "V3PartitionGraph.h"
#include "V3TSP.h"
#define VL_VALUE_STRING_MAX_WIDTH 8192 // We use a static char array in VL_VALUE_STRING
@ -103,7 +105,13 @@ public:
puts("["+cvtToStr(arrayp->elementsConst())+"]");
}
}
void emitVarCmtChg(const AstVar* varp, string* curVarCmtp) {
string newVarCmt = varp->mtasksString();
if (*curVarCmtp != newVarCmt) {
*curVarCmtp = newVarCmt;
puts("// Begin mtask footprint "+*curVarCmtp+"\n");
}
}
void emitTypedefs(AstNode* firstp) {
bool first = true;
for (AstNode* loopp=firstp; loopp; loopp = loopp->nextp()) {
@ -783,6 +791,50 @@ public:
virtual ~EmitCStmts() {}
};
//######################################################################
// Establish mtask variable sort order in mtasks mode
// TSP sort state wrapping one MTaskIdSet footprint. The TSP solver orders
// these states so that variables touched by similar mtask sets end up
// adjacent in memory; cost() is the symmetric set difference between two
// footprints.
class EmitVarTspSorter : public V3TSP::TspStateBase {
private:
// MEMBERS
const MTaskIdSet& m_mtaskIds; // Mtask we're ordering
static unsigned m_serialNext; // Unique ID to establish serial order
unsigned m_serial; // Serial ordering
public:
// CONSTRUCTORS
explicit EmitVarTspSorter(const MTaskIdSet& mtaskIds)
: m_mtaskIds(mtaskIds),
m_serial(++m_serialNext) {}
virtual ~EmitVarTspSorter() {}
// METHODS
// Deterministic tie-break ordering by creation serial number
bool operator<(const TspStateBase& other) const {
return operator<(dynamic_cast<const EmitVarTspSorter&>(other));
}
bool operator<(const EmitVarTspSorter& other) const {
return m_serial < other.m_serial;
}
const MTaskIdSet& mtaskIds() const { return m_mtaskIds; }
virtual int cost(const TspStateBase* otherp) const {
return cost(dynamic_cast<const EmitVarTspSorter*>(otherp));
}
// Symmetric difference of the two footprints: elements only in ours
// plus elements only in theirs
virtual int cost(const EmitVarTspSorter* otherp) const {
int cost = diffs(m_mtaskIds, otherp->m_mtaskIds);
cost += diffs(otherp->m_mtaskIds, m_mtaskIds);
return cost;
}
// Returns the number of elements in set_a that don't appear in set_b
static int diffs(const MTaskIdSet& set_a, const MTaskIdSet& set_b) {
int diffs = 0;
for (MTaskIdSet::iterator it = set_a.begin();
it != set_a.end(); ++it) {
if (set_b.find(*it) == set_b.end()) ++diffs;
}
return diffs;
}
};

unsigned EmitVarTspSorter::m_serialNext = 0;
//######################################################################
// Internal EmitC implementation
@ -873,6 +925,91 @@ class EmitCImp : EmitCStmts {
return ofp;
}
// Returns the number of cross-thread dependencies into mtaskp.
// If >0, mtaskp must test whether its prereqs are done before starting,
// and may need to block.
static uint32_t packedMTaskMayBlock(const ExecMTask* mtaskp) {
uint32_t result = 0;
for (V3GraphEdge* edgep = mtaskp->inBeginp(); edgep; edgep = edgep->inNextp()) {
const ExecMTask* prevp = dynamic_cast<ExecMTask*>(edgep->fromp());
if (prevp->thread() != mtaskp->thread()) {
++result;
}
}
return result;
}
// Emit the C++ body for one mtask: an optional blocking wait on upstream
// cross-thread mtasks, optional per-thread profiling bookkeeping, the
// mtask's statements, downstream notifications, then a tail-recursive emit
// of the next mtask packed on the same thread (or a signal to the fake
// "final" mtask when this is the last one).
void emitMTaskBody(AstMTaskBody* nodep) {
ExecMTask* curExecMTaskp = nodep->execMTaskp();
// Only block if some upstream mtask runs on a different thread
if (packedMTaskMayBlock(curExecMTaskp)) {
puts("vlTOPp->__Vm_mt_" + cvtToStr(curExecMTaskp->id())
+ ".waitUntilUpstreamDone(even_cycle);\n");
}
string recName;
if (v3Global.opt.profThreads()) {
recName = "__Vprfthr_" + cvtToStr(curExecMTaskp->id());
puts("VlProfileRec* " + recName + " = NULL;\n");
// Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
puts("if (VL_UNLIKELY(vlTOPp->__Vm_profile_cycle_start)) {\n");
puts( recName + " = vlTOPp->__Vm_threadPoolp->profileAppend();\n");
puts( recName + "->startRecord(VL_RDTSC_Q() - vlTOPp->__Vm_profile_cycle_start,");
puts( " "+cvtToStr(curExecMTaskp->id())+ ",");
puts( " "+cvtToStr(curExecMTaskp->cost())+");\n");
puts("}\n");
}
puts("Verilated::mtaskId(" + cvtToStr(curExecMTaskp->id()) + ");\n");

// The actual body of calls to leaf functions
iterateAndNextNull(nodep->stmtsp());

if (v3Global.opt.profThreads()) {
// Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
puts("if (VL_UNLIKELY("+recName+")) {\n");
puts( recName + "->endRecord(VL_RDTSC_Q() - vlTOPp->__Vm_profile_cycle_start);\n");
puts("}\n");
}

// Flush message queue
puts("Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);\n");

// For any downstream mtask that's on another thread, bump its
// counter and maybe notify it.
for (V3GraphEdge* edgep = curExecMTaskp->outBeginp();
edgep; edgep = edgep->outNextp()) {
const ExecMTask* nextp = dynamic_cast<ExecMTask*>(edgep->top());
if (nextp->thread() != curExecMTaskp->thread()) {
puts("vlTOPp->__Vm_mt_"+cvtToStr(nextp->id())
+ ".signalUpstreamDone(even_cycle);\n");
}
}

// Run the next mtask inline
const ExecMTask* nextp = curExecMTaskp->packNextp();
if (nextp) {
emitMTaskBody(nextp->bodyp());
} else {
// Unblock the fake "final" mtask
puts("vlTOPp->__Vm_mt_final.signalUpstreamDone(even_cycle);\n");
}
}
// Emit the function definition for a thread-root mtask. The function is
// named after the root mtask; emitMTaskBody() then inlines all the mtasks
// packed onto the same thread into this single function.
virtual void visit(AstMTaskBody* nodep) {
ExecMTask* mtp = nodep->execMTaskp();
puts("\n");
puts("void ");
puts(modClassName(m_modp)+"::"+mtp->cFuncName());
puts("(bool even_cycle, void* symtab) {\n");

// Declare and set vlSymsp
puts(EmitCBaseVisitor::symClassVar() + " = ("
+ EmitCBaseVisitor::symClassName() + "*)symtab;\n");
puts(EmitCBaseVisitor::symTopAssign()+"\n");

emitMTaskBody(nodep);
puts("}\n");
}
//---------------------------------------
// VISITORS
using EmitCStmts::visit; // Suppress hidden overloaded virtual function warning
@ -973,6 +1110,54 @@ class EmitCImp : EmitCStmts {
emitVarReset(varp);
}
// Emit the code, inside _eval(), that launches the mtask graph: flip the
// even/odd cycle flag, dispatch each thread-root mtask (all but the last
// go to the thread pool; the last runs inline on the eval thread), then
// wait on the synthetic "final" mtask until the whole graph is done.
virtual void visit(AstExecGraph* nodep) {
if (nodep != v3Global.rootp()->execGraphp()) {
nodep->v3fatalSrc("ExecGraph should be a singleton!");
}
// The location of the AstExecGraph within the containing _eval()
// function is where we want to invoke the graph and wait for it to
// complete. Do that now.
//
// Don't recurse to children -- this isn't the place to emit
// function definitions for the nested CFuncs. We'll do that at the
// end.
puts("vlTOPp->__Vm_even_cycle = !vlTOPp->__Vm_even_cycle;\n");

// Build the list of initial mtasks to start
std::vector<const ExecMTask*> execMTasks;

// Start each root mtask
for (const V3GraphVertex* vxp = nodep->depGraphp()->verticesBeginp();
vxp; vxp = vxp->verticesNextp()) {
// NOTE(review): dynamic_cast result is used unchecked; presumably
// every vertex in depGraphp is an ExecMTask -- confirm upstream.
const ExecMTask* etp = dynamic_cast<const ExecMTask*>(vxp);
if (etp->threadRoot()) execMTasks.push_back(etp);
}
if (execMTasks.size() >
static_cast<unsigned>(v3Global.opt.threads())) {
nodep->v3fatalSrc("More root mtasks than available threads");
}

if (!execMTasks.empty()) {
for (uint32_t i = 0; i < execMTasks.size(); ++i) {
bool runInline = (i == execMTasks.size() - 1);
if (runInline) {
// The thread calling eval() will run this mtask inline,
// along with its packed successors.
puts(execMTasks[i]->cFuncName()
+ "(vlTOPp->__Vm_even_cycle, vlSymsp);\n");
puts("Verilated::mtaskId(0);\n");
} else {
// The other N-1 go to the thread pool.
puts("vlTOPp->__Vm_threadPoolp->workerp("
+ cvtToStr(i)+")->addTask("
+ execMTasks[i]->cFuncName()
+ ", vlTOPp->__Vm_even_cycle, vlSymsp);\n");
}
}
puts("vlTOPp->__Vm_mt_final.waitUntilUpstreamDone(vlTOPp->__Vm_even_cycle);\n");
}
}
//---------------------------------------
// ACCESSORS
@ -995,6 +1180,8 @@ class EmitCImp : EmitCStmts {
void emitStaticDecl(AstNodeModule* modp);
void emitSettleLoop(const std::string& eval_call, bool initial);
void emitWrapEval(AstNodeModule* modp);
void emitMTaskState();
void emitMTaskVertexCtors(bool* firstp);
void emitInt(AstNodeModule* modp);
void maybeSplit(AstNodeModule* modp);
@ -1534,6 +1721,36 @@ void EmitCImp::emitCoverageDecl(AstNodeModule* modp) {
}
}
// Emit constructor-initializer entries for the per-mtask VlMTaskVertex
// members of the top module: one "__Vm_mt_<id>(<inCt>)" entry for each
// mtask that may block on cross-thread upstream dependencies, plus the
// synthetic "__Vm_mt_final" vertex (which depends on every mtask that has
// no packed successor), the thread pool pointer, optional profiling state,
// and the even/odd cycle flag.
//   firstp - in/out: whether the next emitCtorSep() is the first separator
void EmitCImp::emitMTaskVertexCtors(bool* firstp) {
    AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
    if (!execGraphp) v3Global.rootp()->v3fatalSrc("Should have an execGraphp");
    const V3Graph* depGraphp = execGraphp->depGraphp();

    unsigned finalEdgesInCt = 0;
    for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
         vxp; vxp = vxp->verticesNextp()) {
        const ExecMTask* mtp = dynamic_cast<const ExecMTask*>(vxp);
        // Compute once; the original recomputed packedMTaskMayBlock() in
        // the condition, walking the in-edge list a second time.
        const unsigned edgesInCt = packedMTaskMayBlock(mtp);
        if (edgesInCt > 0) {
            emitCtorSep(firstp);
            puts("__Vm_mt_"+cvtToStr(mtp->id())+"("+cvtToStr(edgesInCt)+")");
        }
        // Each mtask with no packed successor will become a dependency
        // for the final node:
        if (!mtp->packNextp()) ++finalEdgesInCt;
    }

    emitCtorSep(firstp);
    puts("__Vm_mt_final(" + cvtToStr(finalEdgesInCt) + ")");

    // This will flip to 'true' before the start of the 0th cycle.
    emitCtorSep(firstp); puts("__Vm_threadPoolp(NULL)");
    if (v3Global.opt.profThreads()) {
        emitCtorSep(firstp); puts("__Vm_profile_cycle_start(0)");
    }
    emitCtorSep(firstp); puts("__Vm_even_cycle(false)");
}
void EmitCImp::emitCtorImp(AstNodeModule* modp) {
puts("\n");
bool first = true;
@ -1544,6 +1761,9 @@ void EmitCImp::emitCtorImp(AstNodeModule* modp) {
first = false; // VL_CTOR_IMP includes the first ':'
}
emitVarCtors(&first);
if (modp->isTop() && v3Global.opt.mtasks()) {
emitMTaskVertexCtors(&first);
}
puts(" {\n");
emitCellCtors(modp);
emitSensitives();
@ -1556,6 +1776,39 @@ void EmitCImp::emitCtorImp(AstNodeModule* modp) {
putsDecoration("// Reset structure values\n");
puts("_ctor_var_reset();\n");
emitTextSection(AstType::atScCtor);
if (modp->isTop() && v3Global.opt.mtasks()) {
// TODO-- For now each top module creates its own ThreadPool here,
// and deletes it in the destructor. If A and B are each top level
// modules, each creates a separate thread pool. This allows
// A.eval() and B.eval() to run concurrently without any
// interference -- so long as the physical machine has enough cores
// to support both pools and all testbench threads.
//
// In the future, we might want to let the client provide a
// threadpool to the constructor. This would allow two or more
// models to share a single threadpool.
//
// For example: suppose models A and B are each compiled to run on
// 4 threads. The client might create a single thread pool with 3
// threads and pass it to both models. If the client can ensure that
// A.eval() and B.eval() do NOT run concurrently, there will be no
// contention for the threads. This mode is missing for now. (Is
// there demand for such a setup?)
puts("__Vm_threadPoolp = new VlThreadPool("
// Note we create N-1 threads in the thread pool. The thread
// that calls eval() becomes the final Nth thread for the
// duration of the eval call.
+ cvtToStr(v3Global.opt.threads() - 1)
+ ", " + cvtToStr(v3Global.opt.profThreads())
+ ");\n");
if (v3Global.opt.profThreads()) {
puts("__Vm_profile_cycle_start = 0;\n");
puts("__Vm_profile_time_finished = 0;\n");
puts("__Vm_profile_window_ct = 0;");
}
}
puts("}\n");
}
@ -1597,6 +1850,9 @@ void EmitCImp::emitCoverageImp(AstNodeModule* modp) {
void EmitCImp::emitDestructorImp(AstNodeModule* modp) {
puts("\n");
puts(modClassName(modp)+"::~"+modClassName(modp)+"() {\n");
if (modp->isTop() && v3Global.opt.mtasks()) {
puts("delete __Vm_threadPoolp; __Vm_threadPoolp = NULL;\n");
}
emitTextSection(AstType::atScDtor);
if (modp->isTop()) puts("delete __VlSymsp; __VlSymsp=NULL;\n");
puts("}\n");
@ -1796,9 +2052,47 @@ void EmitCImp::emitWrapEval(AstNodeModule* modp) {
if (v3Global.opt.threads() == 1) {
uint32_t mtaskId = 0;
putsDecoration("// MTask "+cvtToStr(mtaskId)+" start\n");
puts("VL_DEBUG_IF(VL_DBG_MSGF(\"MTask starting, mtaskId="+cvtToStr(mtaskId)+"\\n\"););\n");
puts("VL_DEBUG_IF(VL_DBG_MSGF(\"MTask"+cvtToStr(mtaskId)+" starting\\n\"););\n");
puts("Verilated::mtaskId("+cvtToStr(mtaskId)+");\n");
}
if (v3Global.opt.mtasks()
&& v3Global.opt.profThreads()) {
puts("if (VL_UNLIKELY((Verilated::profThreadsStart() != __Vm_profile_time_finished)\n");
puts( " && (VL_TIME_Q() > Verilated::profThreadsStart())\n");
puts( " && (Verilated::profThreadsWindow() >= 1))) {\n");
// Within a profile (either starting, middle, or end)
puts( "if (vlTOPp->__Vm_profile_window_ct == 0) {\n"); // Opening file?
// Start profile on this cycle. We'll capture a window worth, then
// only analyze the next window worth. The idea is that the first window
// capture will hit some cache-cold stuff (eg printf) but it'll be warm
// by the time we hit the second window, we hope.
puts( "vlTOPp->__Vm_profile_cycle_start = VL_RDTSC_Q();\n");
// "* 2" as first half is warmup, second half is collection
puts( "vlTOPp->__Vm_profile_window_ct = Verilated::profThreadsWindow() * 2 + 1;\n");
puts( "}\n");
puts( "--vlTOPp->__Vm_profile_window_ct;\n");
puts( "if (vlTOPp->__Vm_profile_window_ct == (Verilated::profThreadsWindow())) {\n");
// This barrier record in every threads' profile demarcates the
// cache-warm-up cycles before the barrier from the actual profile
// cycles afterward.
puts( "vlTOPp->__Vm_threadPoolp->profileAppendAll(");
puts( "VlProfileRec(VlProfileRec::Barrier()));\n");
puts( "vlTOPp->__Vm_profile_cycle_start = VL_RDTSC_Q();\n");
puts( "}\n");
puts( "else if (vlTOPp->__Vm_profile_window_ct == 0) {\n");
// Ending file.
puts( "vluint64_t elapsed = VL_RDTSC_Q() - vlTOPp->__Vm_profile_cycle_start;\n");
puts( "vlTOPp->__Vm_threadPoolp->profileDump(Verilated::profThreadsFilenamep(), elapsed);\n");
// This turns off the test to enter the profiling code, but still
// allows the user to collect another profile by changing
// profThreadsStart
puts( "__Vm_profile_time_finished = Verilated::profThreadsStart();\n");
puts( "vlTOPp->__Vm_profile_cycle_start = 0;\n");
puts( "}\n");
puts("}\n");
}
emitSettleLoop(
(string("VL_DEBUG_IF(VL_DBG_MSGF(\"+ Clock loop\\n\"););\n")
+ (v3Global.opt.trace() ? "vlSymsp->__Vm_activity = true;\n" : "")
@ -1832,10 +2126,13 @@ void EmitCStmts::emitVarList(AstNode* firstp, EisWhich which, const string& pref
// Put out a list of signal declarations
// in order of 0:clocks, 1:vluint8, 2:vluint16, 4:vluint32, 5:vluint64, 6:wide, 7:arrays
// This aids cache packing and locality
// Largest->smallest reduces the number of pad variables.
// But for now, Smallest->largest makes it more likely a small offset will allow access to the signal.
// TODO: Move this sort to an earlier visitor stage.
//
// Largest->smallest reduces the number of pad variables. Also
// experimented with alternating between large->small and small->large
// on successive Mtask groups, but then when a new mtask gets added may
// cause a huge delta.
//
// TODO: Move this sort to an earlier visitor stage.
VarSortMap varAnonMap;
VarSortMap varNonanonMap;
@ -1891,8 +2188,9 @@ void EmitCStmts::emitVarList(AstNode* firstp, EisWhich which, const string& pref
void EmitCStmts::emitVarSort(const VarSortMap& vmap, VarVec* sortedp) {
UASSERT(sortedp->empty(), "Sorted should be initially empty");
{
// Plain old serial mode. Sort by size, from small to large.
if (!v3Global.opt.mtasks()) {
// Plain old serial mode. Sort by size, from small to large,
// to optimize for both packing and small offsets in code.
for (VarSortMap::const_iterator it = vmap.begin();
it != vmap.end(); ++it) {
for (VarVec::const_iterator jt = it->second.begin();
@ -1900,12 +2198,52 @@ void EmitCStmts::emitVarSort(const VarSortMap& vmap, VarVec* sortedp) {
sortedp->push_back(*jt);
}
}
return;
}
// MacroTask mode. Sort by MTask-affinity group first, size second.
typedef std::map<MTaskIdSet, VarSortMap> MTaskVarSortMap;
MTaskVarSortMap m2v;
for (VarSortMap::const_iterator it = vmap.begin(); it != vmap.end(); ++it) {
int size_class = it->first;
const VarVec& vec = it->second;
for (VarVec::const_iterator jt = vec.begin(); jt != vec.end(); ++jt) {
const AstVar* varp = *jt;
m2v[varp->mtaskIds()][size_class].push_back(varp);
}
}
// Create a TSP sort state for each MTaskIdSet footprint
V3TSP::StateVec states;
for (MTaskVarSortMap::iterator it = m2v.begin(); it != m2v.end(); ++it) {
states.push_back(new EmitVarTspSorter(it->first));
}
// Do the TSP sort
V3TSP::StateVec sorted_states;
V3TSP::tspSort(states, &sorted_states);
for (V3TSP::StateVec::iterator it = sorted_states.begin();
it != sorted_states.end(); ++it) {
const EmitVarTspSorter* statep = dynamic_cast<const EmitVarTspSorter*>(*it);
const VarSortMap& localVmap = m2v[statep->mtaskIds()];
// use rbegin/rend to sort size large->small
for (VarSortMap::const_reverse_iterator jt = localVmap.rbegin();
jt != localVmap.rend(); ++jt) {
const VarVec& vec = jt->second;
for (VarVec::const_iterator kt = vec.begin();
kt != vec.end(); ++kt) {
sortedp->push_back(*kt);
}
}
delete statep; VL_DANGLING(statep);
}
}
void EmitCStmts::emitSortedVarList(const VarVec& anons,
const VarVec& nonanons,
const string& prefixIfImp) {
string curVarCmt = "";
// Output anons
{
int anonMembers = anons.size();
@ -1933,6 +2271,7 @@ void EmitCStmts::emitSortedVarList(const VarVec& anons,
if (anonL1s != 1) puts("struct {\n");
for (int l0=0; l0<lim && it != anons.end(); ++l0) {
const AstVar* varp = *it;
emitVarCmtChg(varp, &curVarCmt);
emitVarDecl(varp, prefixIfImp);
++it;
}
@ -1945,12 +2284,14 @@ void EmitCStmts::emitSortedVarList(const VarVec& anons,
// Leftovers, just in case off by one error somewhere above
for (; it != anons.end(); ++it) {
const AstVar* varp = *it;
emitVarCmtChg(varp, &curVarCmt);
emitVarDecl(varp, prefixIfImp);
}
}
// Output nonanons
for (VarVec::const_iterator it = nonanons.begin(); it != nonanons.end(); ++it) {
const AstVar* varp = *it;
emitVarCmtChg(varp, &curVarCmt);
emitVarDecl(varp, prefixIfImp);
}
}
@ -1986,6 +2327,59 @@ void EmitCImp::emitIntFuncDecls(AstNodeModule* modp) {
if (funcp->ifdef()!="") puts("#endif // "+funcp->ifdef()+"\n");
}
}
if (modp->isTop() && v3Global.opt.mtasks()) {
// Emit the mtask func prototypes.
AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
if (!execGraphp) v3Global.rootp()->v3fatalSrc("Root should have an execGraphp");
const V3Graph* depGraphp = execGraphp->depGraphp();
for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
vxp; vxp = vxp->verticesNextp()) {
const ExecMTask* mtp = dynamic_cast<const ExecMTask*>(vxp);
if (mtp->threadRoot()) {
// Emit function declaration for this mtask
ofp()->putsPrivate(true);
puts("static void "); puts(mtp->cFuncName());
puts("(bool even_cycle, void* symtab);\n");
}
}
// No AstCFunc for this one, as it's synthetic. Just write it:
puts("static void __Vmtask__final(bool even_cycle, void* symtab);\n");
}
}
// Emit the private member declarations the top module needs for
// multithreaded execution: a VlMTaskVertex per mtask that may block, the
// synthetic final vertex, the thread pool pointer, optional thread-profiling
// counters, and the even/odd cycle flag.
void EmitCImp::emitMTaskState() {
ofp()->putsPrivate(true);
AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
if (!execGraphp) v3Global.rootp()->v3fatalSrc("Root should have an execGraphp");

const V3Graph* depGraphp = execGraphp->depGraphp();
for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
vxp; vxp = vxp->verticesNextp()) {
const ExecMTask* mtp = dynamic_cast<const ExecMTask*>(vxp);
// Only mtasks that can block need a vertex to wait on
if (packedMTaskMayBlock(mtp) > 0) {
puts("VlMTaskVertex __Vm_mt_" + cvtToStr(mtp->id()) + ";\n");
}
}
// This fake mtask depends on all the real ones. We use it to block
// eval() until all mtasks are done.
//
// In the future we might allow _eval() to return before the graph is
// fully done executing, for "half wave" scheduling. For now we wait
// for all mtasks though.
puts("VlMTaskVertex __Vm_mt_final;\n");
puts("VlThreadPool* __Vm_threadPoolp;\n");

if (v3Global.opt.profThreads()) {
// rdtsc() at current cycle start
puts("vluint64_t __Vm_profile_cycle_start;\n");
// Time we finished analysis
puts("vluint64_t __Vm_profile_time_finished;\n");
// Track our position in the cache warmup and actual profile window
puts("vluint32_t __Vm_profile_window_ct;\n");
}

puts("bool __Vm_even_cycle;\n");
}
void EmitCImp::emitInt(AstNodeModule* modp) {
@ -2000,6 +2394,9 @@ void EmitCImp::emitInt(AstNodeModule* modp) {
} else {
puts("#include \"verilated.h\"\n");
}
if (v3Global.opt.mtasks()) {
puts("#include \"verilated_threads.h\"\n");
}
if (v3Global.opt.savable()) {
puts("#include \"verilated_save.h\"\n");
}
@ -2084,6 +2481,9 @@ void EmitCImp::emitInt(AstNodeModule* modp) {
puts("bool __Vm_inhibitSim; ///< Set true to disable evaluation of module\n");
}
}
if (modp->isTop() && v3Global.opt.mtasks()) {
emitMTaskState();
}
emitCoverageDecl(modp); // may flip public/private
puts("\n// PARAMETERS\n");
@ -2291,6 +2691,24 @@ void EmitCImp::main(AstNodeModule* modp, bool slow, bool fast) {
}
}
if (fast && modp->isTop() && v3Global.opt.mtasks()) {
// Make a final pass and emit function definitions for the mtasks
// in the ExecGraph
AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
const V3Graph* depGraphp = execGraphp->depGraphp();
for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
vxp; vxp = vxp->verticesNextp()) {
const ExecMTask* mtaskp = dynamic_cast<const ExecMTask*>(vxp);
if (mtaskp->threadRoot()) {
maybeSplit(modp);
// Only define one function for all the mtasks packed on
// a given thread. We'll name this function after the
// root mtask though it contains multiple mtasks' worth
// of logic.
iterate(mtaskp->bodyp());
}
}
}
delete m_ofp; m_ofp=NULL;
}

View File

@ -94,6 +94,9 @@ public:
putMakeClassEntry(of, "verilated_vcd_sc.cpp");
}
}
if (v3Global.opt.mtasks()) {
putMakeClassEntry(of, "verilated_threads.cpp");
}
}
else if (support==2 && slow) {
}

View File

@ -131,7 +131,7 @@ public:
"ALWCOMBORDER", "ASSIGNDLY", "ASSIGNIN",
"BLKANDNBLK", "BLKLOOPINIT", "BLKSEQ", "BSSPACE",
"CASEINCOMPLETE", "CASEOVERLAP", "CASEWITHX", "CASEX", "CDCRSTLOGIC", "CLKDATA",
"CMPCONST", "COLONPLUS", "COMBDLY", "DEFPARAM", "DECLFILENAME",
"CMPCONST", "COLONPLUS", "COMBDLY", "DEFPARAM", "DECLFILENAME",
"ENDLABEL", "GENCLK",
"IFDEPTH", "IMPERFECTSCH", "IMPLICIT", "IMPURE",
"INCABSPATH", "INFINITELOOP", "INITIALDLY",

View File

@ -37,6 +37,8 @@
#include VL_INCLUDE_UNORDERED_MAP
#include "V3Global.h"
#include "V3PartitionGraph.h"
#include "V3GraphPathChecker.h"
#include "V3LifePost.h"
#include "V3Stats.h"
#include "V3Ast.h"
@ -78,6 +80,11 @@ private:
iterate(nodep->funcp());
}
}
// Visitor override: the mtask bodies are children of the ExecGraph node.
virtual void visit(AstExecGraph* nodep) {
// Can just iterate across the MTask bodies in any order.  Order
// isn't important for LifePostElimVisitor's simple substitution.
iterateChildren(nodep);
}
virtual void visit(AstCFunc* nodep) {
if (!m_tracingCall && !nodep->entryPoint()) return;
m_tracingCall = false;
@ -101,11 +108,17 @@ public:
// and a sequence number within the mtask:
// Location of a statement: the mtask that executes it (NULL for serial
// code) and a sequence number within the mtask. The span previously
// carried leftover pre-change constructors (a default ctor that left
// mtaskp uninitialized and a one-arg uint32_t ctor) alongside the new
// ones -- duplicate/conflicting declarations; only the coherent new
// version is kept.
struct LifeLocation {
    const ExecMTask* mtaskp;  // Executing mtask, or NULL for serial code
    uint32_t sequence;        // Order within the mtask (or serial stream)
public:
    LifeLocation() : mtaskp(NULL), sequence(0) {}
    LifeLocation(const ExecMTask* mtaskp_, uint32_t sequence_)
        : mtaskp(mtaskp_), sequence(sequence_) {}
    // Strict weak ordering: primarily by mtask id (NULL ranks as id 0),
    // secondarily by sequence number.
    bool operator< (const LifeLocation& b) const {
        const unsigned a_id = mtaskp ? mtaskp->id() : 0;
        const unsigned b_id = b.mtaskp ? b.mtaskp->id() : 0;
        if (a_id != b_id) return a_id < b_id;
        return sequence < b.sequence;
    }
};
@ -130,6 +143,9 @@ private:
// STATE
uint32_t m_sequence; // Sequence number of assigns/varrefs,
// // local to the current MTask.
const ExecMTask* m_execMTaskp; // Current ExecMTask being processed,
// // or NULL for serial code.
V3Double0 m_statAssnDel; // Statistic tracking
bool m_tracingCall; // Currently tracing a CCall to a CFunc
@ -143,11 +159,15 @@ private:
typedef vl_unordered_map<const AstVarScope*, LifePostLocation> PostLocMap;
PostLocMap m_assignposts; // AssignPost dly var locations
const V3Graph* m_mtasksGraphp; // Mtask tracking graph
vl_unique_ptr<GraphPathChecker> m_checker;
// METHODS
VL_DEBUG_FUNC; // Declare debug()
static bool before(const LifeLocation& a, const LifeLocation& b) {
return a.sequence < b.sequence;
bool before(const LifeLocation& a, const LifeLocation& b) {
if (a.mtaskp == b.mtaskp) return a.sequence < b.sequence;
return m_checker->pathExistsFrom(a.mtaskp, b.mtaskp);
}
bool outsideCriticalArea(LifeLocation loc,
const std::set<LifeLocation>& dlyVarAssigns,
@ -159,6 +179,13 @@ private:
// Otherwise, loc could fall in the "critical" area where the
// substitution affects the result of the operation at loc, so
// return false.
if (!loc.mtaskp && assignPostLoc.mtaskp) {
// This is threaded mode; 'loc' is something that happens at
// initial/settle time, or perhaps in _eval() but outside of
// the mtask graph.
// In either case, it's not in the critical area.
return true;
}
if (before(assignPostLoc, loc)) return true;
for (std::set<LifeLocation>::iterator it = dlyVarAssigns.begin();
it != dlyVarAssigns.end(); ++it) {
@ -239,6 +266,17 @@ private:
// within the mtask) where each varscope is read, and written.
iterateChildren(nodep);
if (v3Global.opt.mtasks()) {
if (!m_mtasksGraphp) {
nodep->v3fatalSrc("Should have initted m_mtasksGraphp by now");
}
m_checker.reset(new GraphPathChecker(m_mtasksGraphp));
} else {
if (m_mtasksGraphp) {
nodep->v3fatalSrc("Did not expect any m_mtasksGraphp in serial mode");
}
}
// Find all assignposts. Determine which ones can be
// eliminated. Remove those, and mark their dly vars' user4 field
// to indicate we should replace these dly vars with their original
@ -252,7 +290,8 @@ private:
// Consumption/generation of a variable,
AstVarScope* vscp = nodep->varScopep();
if (!vscp) nodep->v3fatalSrc("Scope not assigned");
LifeLocation loc(++m_sequence);
LifeLocation loc(m_execMTaskp, ++m_sequence);
if (nodep->lvalue()) {
m_writes[vscp].insert(loc);
} else {
@ -275,7 +314,7 @@ private:
if (m_assignposts.find(dlyVarp) != m_assignposts.end()) {
nodep->v3fatalSrc("LifePostLocation attempted duplicate dlyvar map addition");
}
LifeLocation loc(++m_sequence);
LifeLocation loc(m_execMTaskp, ++m_sequence);
m_assignposts[dlyVarp] = LifePostLocation(loc, nodep);
}
}
@ -291,6 +330,18 @@ private:
iterate(nodep->funcp());
}
}
// Record read/write locations inside each mtask body. The sequence counter
// restarts at zero per mtask, and m_execMTaskp tags every LifeLocation
// created while iterating that body.
virtual void visit(AstExecGraph* nodep) {
// Treat the ExecGraph like a call to each mtask body
m_mtasksGraphp = nodep->depGraphp();
for (V3GraphVertex* mtaskVxp = m_mtasksGraphp->verticesBeginp();
mtaskVxp; mtaskVxp = mtaskVxp->verticesNextp()) {
ExecMTask* mtaskp = dynamic_cast<ExecMTask*>(mtaskVxp);
m_execMTaskp = mtaskp;
m_sequence = 0;
iterate(mtaskp->bodyp());
}
// Back to serial code (NULL mtask) after the graph
m_execMTaskp = NULL;
}
virtual void visit(AstCFunc* nodep) {
if (!m_tracingCall && !nodep->entryPoint()) return;
m_tracingCall = false;
@ -305,7 +356,9 @@ public:
// CONSTRUCTORS
explicit LifePostDlyVisitor(AstNetlist* nodep)
: m_sequence(0)
, m_tracingCall(false) {
, m_execMTaskp(NULL)
, m_tracingCall(false)
, m_mtasksGraphp(NULL) {
iterate(nodep);
}
virtual ~LifePostDlyVisitor() {

View File

@ -661,6 +661,9 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
else if ( !strcmp (sw, "-debug-abort") ) { abort(); } // Undocumented, see also --debug-sigsegv
else if ( onoff (sw, "-debug-check", flag/*ref*/) ){ m_debugCheck = flag; }
else if ( onoff (sw, "-debug-leak", flag/*ref*/) ){ m_debugLeak = flag; }
else if ( onoff (sw, "-debug-nondeterminism", flag/*ref*/) ){ m_debugNondeterminism = flag; }
else if ( onoff (sw, "-debug-partition", flag/*ref*/) ){ m_debugPartition = flag; } // Undocumented
else if ( onoff (sw, "-debug-self-test", flag/*ref*/) ){ m_debugSelfTest = flag; } // Undocumented
else if ( !strcmp (sw, "-debug-sigsegv") ) { throwSigsegv(); } // Undocumented, see also --debug-abort
else if ( !strcmp (sw, "-debug-fatalsrc") ) { v3fatalSrc("--debug-fatal-src"); } // Undocumented, see also --debug-abort
else if ( onoff (sw, "-decoration", flag/*ref*/) ) { m_decoration = flag; }
@ -678,6 +681,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
else if ( !strcmp (sw, "-private") ) { m_public = false; }
else if ( onoff (sw, "-prof-cfuncs", flag/*ref*/) ) { m_profCFuncs = flag; }
else if ( onoff (sw, "-profile-cfuncs", flag/*ref*/) ) { m_profCFuncs = flag; } // Undocumented, for backward compat
else if ( onoff (sw, "-prof-threads", flag/*ref*/) ) { m_profThreads = flag; }
else if ( onoff (sw, "-public", flag/*ref*/) ) { m_public = flag; }
else if ( !strncmp(sw, "-pvalue+", strlen("-pvalue+"))) { addParameter(string(sw+strlen("-pvalue+")), false); }
else if ( onoff (sw, "-relative-cfuncs", flag/*ref*/) ) { m_relativeCFuncs = flag; }
@ -689,6 +693,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
else if ( onoff (sw, "-stats", flag/*ref*/) ) { m_stats = flag; }
else if ( onoff (sw, "-stats-vars", flag/*ref*/) ) { m_statsVars = flag; m_stats |= flag; }
else if ( !strcmp (sw, "-sv") ) { m_defaultLanguage = V3LangCode::L1800_2005; }
else if ( onoff (sw, "-threads-coarsen", flag/*ref*/)) { m_threadsCoarsen = flag; } // Undocumented, debug
else if ( onoff (sw, "-trace", flag/*ref*/) ) { m_trace = flag; }
else if ( onoff (sw, "-trace-dups", flag/*ref*/) ) { m_traceDups = flag; }
else if ( onoff (sw, "-trace-params", flag/*ref*/) ) { m_traceParams = flag; }
@ -1013,6 +1018,20 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
shift; m_threads = atoi(argv[i]);
if (m_threads < 0) fl->v3fatal("--threads must be >= 0: "<<argv[i]);
}
else if ( !strcmp (sw, "-threads-dpi") && (i+1)<argc) {
shift;
if (!strcmp(argv[i], "all")) { m_threadsDpiPure=true; m_threadsDpiUnpure=true; }
else if (!strcmp(argv[i], "none")) { m_threadsDpiPure=false; m_threadsDpiUnpure=false; }
else if (!strcmp(argv[i], "pure")) { m_threadsDpiPure=true; m_threadsDpiUnpure=false; }
else {
fl->v3fatal("Unknown setting for --threads-dpi: "<<argv[i]);
}
}
else if ( !strcmp (sw, "-threads-max-mtasks") ) {
shift; m_threadsMaxMTasks = atoi(argv[i]);
if (m_threadsMaxMTasks < 1)
fl->v3fatal("--threads-max-mtasks must be >= 1: "<<argv[i]);
}
else if ( !strcmp (sw, "-top-module") && (i+1)<argc ) {
shift; m_topModule = argv[i];
}
@ -1223,6 +1242,9 @@ V3Options::V3Options() {
m_coverageUser = false;
m_debugCheck = false;
m_debugLeak = true;
m_debugNondeterminism = false;
m_debugPartition = false;
m_debugSelfTest = false;
m_decoration = true;
m_exe = false;
m_ignc = false;
@ -1237,6 +1259,7 @@ V3Options::V3Options() {
m_pinsScBigUint = false;
m_pinsUint8 = false;
m_profCFuncs = false;
m_profThreads = false;
m_preprocOnly = false;
m_preprocNoLine = false;
m_public = false;
@ -1249,6 +1272,10 @@ V3Options::V3Options() {
m_statsVars = false;
m_systemC = false;
m_threads = 0;
m_threadsDpiPure = true;
m_threadsDpiUnpure = false;
m_threadsCoarsen = true;
m_threadsMaxMTasks = 0;
m_trace = false;
m_traceDups = false;
m_traceParams = true;

View File

@ -75,7 +75,10 @@ class V3Options {
bool m_coverageUnderscore;// main switch: --coverage-underscore
bool m_coverageUser; // main switch: --coverage-func
bool m_debugCheck; // main switch: --debug-check
bool m_debugLeak; // main switch: --debug-leak
bool m_debugLeak; // main switch: --debug-leak
bool m_debugNondeterminism; // main switch: --debug-nondeterminism
bool m_debugPartition; // main switch: --debug-partition
bool m_debugSelfTest; // main switch: --debug-self-test
bool m_decoration; // main switch: --decoration
bool m_exe; // main switch: --exe
bool m_ignc; // main switch: --ignc
@ -87,6 +90,7 @@ class V3Options {
bool m_pinsScBigUint;// main switch: --pins-sc-biguint
bool m_pinsUint8; // main switch: --pins-uint8
bool m_profCFuncs; // main switch: --prof-cfuncs
bool m_profThreads; // main switch: --prof-threads
bool m_public; // main switch: --public
bool m_relativeCFuncs; // main switch: --relative-cfuncs
bool m_relativeIncludes; // main switch: --relative-includes
@ -96,6 +100,9 @@ class V3Options {
bool m_skipIdentical;// main switch: --skip-identical
bool m_stats; // main switch: --stats
bool m_statsVars; // main switch: --stats-vars
bool m_threadsCoarsen; // main switch: --threads-coarsen
bool m_threadsDpiPure; // main switch: --threads-dpi all/pure
bool m_threadsDpiUnpure; // main switch: --threads-dpi all
bool m_trace; // main switch: --trace
bool m_traceDups; // main switch: --trace-dups
bool m_traceParams; // main switch: --trace-params
@ -117,6 +124,7 @@ class V3Options {
int m_outputSplitCTrace;// main switch: --output-split-ctrace
int m_pinsBv; // main switch: --pins-bv
int m_threads; // main switch: --threads (0 == --no-threads)
int m_threadsMaxMTasks; // main switch: --threads-max-mtasks
int m_traceDepth; // main switch: --trace-depth
int m_traceMaxArray;// main switch: --trace-max-array
int m_traceMaxWidth;// main switch: --trace-max-width
@ -232,8 +240,14 @@ class V3Options {
bool coverageUser() const { return m_coverageUser; }
bool debugCheck() const { return m_debugCheck; }
bool debugLeak() const { return m_debugLeak; }
bool debugNondeterminism() const { return m_debugNondeterminism; }
bool debugPartition() const { return m_debugPartition; }
bool debugSelfTest() const { return m_debugSelfTest; }
bool decoration() const { return m_decoration; }
bool exe() const { return m_exe; }
bool threadsDpiPure() const { return m_threadsDpiPure; }
bool threadsDpiUnpure() const { return m_threadsDpiUnpure; }
bool threadsCoarsen() const { return m_threadsCoarsen; }
bool trace() const { return m_trace; }
bool traceDups() const { return m_traceDups; }
bool traceParams() const { return m_traceParams; }
@ -246,6 +260,7 @@ class V3Options {
bool pinsScBigUint() const { return m_pinsScBigUint; }
bool pinsUint8() const { return m_pinsUint8; }
bool profCFuncs() const { return m_profCFuncs; }
bool profThreads() const { return m_profThreads; }
bool allPublic() const { return m_public; }
bool lintOnly() const { return m_lintOnly; }
bool ignc() const { return m_ignc; }
@ -267,6 +282,7 @@ class V3Options {
int outputSplitCTrace() const { return m_outputSplitCTrace; }
int pinsBv() const { return m_pinsBv; }
int threads() const { return m_threads; }
int threadsMaxMTasks() const { return m_threadsMaxMTasks; }
bool mtasks() const { return (m_threads > 1); }
int traceDepth() const { return m_traceDepth; }
int traceMaxArray() const { return m_traceMaxArray; }

View File

@ -89,19 +89,22 @@
#include <sstream>
#include <memory>
#include "V3Global.h"
#include "V3File.h"
#include "V3Ast.h"
#include "V3Const.h"
#include "V3EmitCBase.h"
#include "V3EmitV.h"
#include "V3File.h"
#include "V3Global.h"
#include "V3Graph.h"
#include "V3GraphStream.h"
#include "V3List.h"
#include "V3Partition.h"
#include "V3PartitionGraph.h"
#include "V3SenTree.h"
#include "V3Stats.h"
#include "V3EmitCBase.h"
#include "V3Const.h"
#include "V3Order.h"
#include "V3OrderGraph.h"
#include "V3EmitV.h"
#include VL_INCLUDE_UNORDERED_MAP
#include VL_INCLUDE_UNORDERED_SET
@ -423,10 +426,15 @@ class ProcessMoveBuildGraph {
// OrderVisitor. It produces a slightly coarsened graph to drive the
// code scheduling.
//
// * The new graph contains nodes of type OrderMoveVertex.
// * For the serial code scheduler, the new graph contains
// nodes of type OrderMoveVertex.
//
// * For the threaded code scheduler, the new graph contains
// nodes of type MTaskMoveVertex.
//
// * The difference in output type is abstracted away by the
// 'T_MoveVertex' template parameter.
// 'T_MoveVertex' template parameter; ProcessMoveBuildGraph otherwise
// works the same way for both cases.
// TYPES
typedef std::pair<const V3GraphVertex*, const AstSenTree*> VxDomPair;
@ -563,7 +571,7 @@ private:
};
//######################################################################
// OrderMoveVertexMaker
// OrderMoveVertexMaker and related
class OrderMoveVertexMaker
: public ProcessMoveBuildGraph<OrderMoveVertex>::MoveVertexMaker {
@ -595,6 +603,64 @@ private:
VL_UNCOPYABLE(OrderMoveVertexMaker);
};
// Vertex factory for the threaded scheduler's move graph: produces
// MTaskMoveVertex's, skipping initial/settle logic (which is emitted
// serially at time zero rather than partitioned into mtasks).
class OrderMTaskMoveVertexMaker
    : public ProcessMoveBuildGraph<MTaskMoveVertex>::MoveVertexMaker {
    V3Graph* m_pomGraphp;  // Graph that owns the created vertices
public:
    explicit OrderMTaskMoveVertexMaker(V3Graph* pomGraphp)
        : m_pomGraphp(pomGraphp) {}
    // Create a vertex for the given logic/var node, or return NULL
    // to exclude it from the mtasks graph.
    MTaskMoveVertex* makeVertexp(OrderLogicVertex* lvertexp,
                                 const OrderEitherVertex* varVertexp,
                                 const AstScope* scopep,
                                 const AstSenTree* domainp) {
        // Exclude initial/settle logic from the mtasks graph.
        // We'll output time-zero logic separately.
        if (domainp->hasInitial()) return NULL;
        if (domainp->hasSettle()) return NULL;
        return new MTaskMoveVertex(m_pomGraphp, lvertexp, varVertexp,
                                   scopep, domainp);
    }
    void freeVertexp(MTaskMoveVertex* freeMep) {
        freeMep->unlinkDelete(m_pomGraphp);
    }
private:
    VL_UNCOPYABLE(OrderMTaskMoveVertexMaker);
};
// Sort predicate for GraphStream: orders vertices first by domain, then by
// scope. Pointer values are mapped through a PartPtrIdMap to stable serial
// IDs (assigned in order of first appearance), so the resulting order is
// deterministic across runs rather than dependent on allocator addresses.
class OrderVerticesByDomainThenScope {
    PartPtrIdMap m_ids;  // Pointer -> stable serial ID, built lazily during sorting
public:
    // Returns true if lhsp sorts before rhsp.
    // NOTE(review): both vertices are assumed to be MTaskMoveVertex's; the
    // dynamic_cast results are dereferenced without a NULL check — confirm
    // callers only use this on MTaskMoveVertex graphs.
    virtual bool operator()(const V3GraphVertex* lhsp,
                            const V3GraphVertex* rhsp) const {
        const MTaskMoveVertex* l_vxp = dynamic_cast<const MTaskMoveVertex*>(lhsp);
        const MTaskMoveVertex* r_vxp = dynamic_cast<const MTaskMoveVertex*>(rhsp);
        // Primary key: domain ID
        vluint64_t l_id = m_ids.findId(l_vxp->domainp());
        vluint64_t r_id = m_ids.findId(r_vxp->domainp());
        if (l_id < r_id) return true;
        if (l_id > r_id) return false;
        // Tiebreaker: scope ID
        l_id = m_ids.findId(l_vxp->scopep());
        r_id = m_ids.findId(r_vxp->scopep());
        return l_id < r_id;
    }
};
// Sort predicate for GraphStream: orders mtask vertices by their unique
// serial ID, giving a deterministic emission order.
class MTaskVxIdLessThan {
public:
    MTaskVxIdLessThan() {}
    virtual ~MTaskVxIdLessThan() {}
    // Sort vertex's, which must be AbstractMTask's, into a deterministic
    // order by comparing their serial IDs.
    virtual bool operator()(const V3GraphVertex* lhsp,
                            const V3GraphVertex* rhsp) const {
        // Cast to the documented base type, AbstractMTask, rather than the
        // narrower AbstractLogicMTask: any mtask vertex has an id(), so the
        // comparator works on all AbstractMTask graphs as the contract says.
        const AbstractMTask* lmtaskp =
            dynamic_cast<const AbstractMTask*>(lhsp);
        const AbstractMTask* rmtaskp =
            dynamic_cast<const AbstractMTask*>(rhsp);
        return lmtaskp->id() < rmtaskp->id();
    }
};
//######################################################################
// Order class functions
@ -701,6 +767,7 @@ private:
void processDomainsIterate(OrderEitherVertex* vertexp);
void processEdgeReport();
// processMove* routines schedule serial execution
void processMove();
void processMoveClear();
void processMoveBuildGraph();
@ -711,6 +778,18 @@ private:
AstActive* processMoveOneLogic(const OrderLogicVertex* lvertexp,
AstCFunc*& newFuncpr, int& newStmtsr);
// processMTask* routines schedule threaded execution
struct MTaskState {
typedef std::list<const OrderLogicVertex*> Logics;
AstMTaskBody* m_mtaskBodyp;
Logics m_logics;
ExecMTask* m_execMTaskp;
MTaskState() : m_mtaskBodyp(NULL), m_execMTaskp(NULL) {}
};
void processMTasks();
typedef enum {LOGIC_INITIAL, LOGIC_SETTLE} InitialLogicE;
void processMTasksInitial(InitialLogicE logic_type);
string cfuncName(AstNodeModule* modp, AstSenTree* domainp, AstScope* scopep, AstNode* forWhatp) {
modp->user3Inc();
int funcnum = modp->user3();
@ -1726,6 +1805,173 @@ AstActive* OrderVisitor::processMoveOneLogic(const OrderLogicVertex* lvertexp,
return activep;
}
// Emit the serial time-zero logic of the given kind (initial or settle),
// packaging it into CFuncs that never cross scope boundaries.
void OrderVisitor::processMTasksInitial(InitialLogicE logic_type) {
    // Emit initial/settle logic. Initial blocks won't be part of the
    // mtask partition, so they aren't eligible for parallelism.
    //
    int initStmts = 0;           // Statement count accumulated into current cfunc
    AstCFunc* initCFunc = NULL;  // CFunc being filled; NULL forces a new one
    AstScope* lastScopep = NULL; // Scope of the previously emitted logic vertex
    for (V3GraphVertex* initVxp = m_graph.verticesBeginp();
         initVxp; initVxp = initVxp->verticesNextp()) {
        OrderLogicVertex* initp = dynamic_cast<OrderLogicVertex*>(initVxp);
        if (!initp) continue;  // Skip var/input vertices, keep only logic
        // Filter to just the requested logic kind
        if ((logic_type == LOGIC_INITIAL)
            && !initp->domainp()->hasInitial()) continue;
        if ((logic_type == LOGIC_SETTLE)
            && !initp->domainp()->hasSettle()) continue;
        if (initp->scopep() != lastScopep) {
            // Start new cfunc, don't let the cfunc cross scopes
            initCFunc = NULL;
            lastScopep = initp->scopep();
        }
        AstActive* newActivep = processMoveOneLogic(initp, initCFunc/*ref*/, initStmts/*ref*/);
        if (newActivep) m_scopetopp->addActivep(newActivep);
    }
}
// Schedule threaded execution: emit time-zero logic serially, partition
// the remaining logic into mtasks, and build the persistent ExecMTask
// graph plus AstMTaskBody's consumed later by code generation.
void OrderVisitor::processMTasks() {
    // For nondeterminism debug:
    V3Partition::hashGraphDebug(&m_graph, "V3Order's m_graph");

    // Initial/settle logic runs once at time zero; it is emitted serially
    // here and excluded from the mtask partition below.
    processMTasksInitial(LOGIC_INITIAL);
    processMTasksInitial(LOGIC_SETTLE);

    // We already produced a graph of every var, input, logic, and settle
    // block and all dependencies; this is 'm_graph'.
    //
    // Now, starting from m_graph, make a slightly-coarsened graph representing
    // only logic, and discarding edges we know we can ignore.
    // This is quite similar to the 'm_pomGraph' of the serial code gen:
    V3Graph logicGraph;
    OrderMTaskMoveVertexMaker create_mtask_vertex(&logicGraph);
    ProcessMoveBuildGraph<MTaskMoveVertex> mtask_pmbg(
        &m_graph, &logicGraph, &create_mtask_vertex);
    mtask_pmbg.build();

    // Needed? We do this for m_pomGraph in serial mode, so do it here too:
    logicGraph.removeRedundantEdges(&V3GraphEdge::followAlwaysTrue);

    // Partition logicGraph into LogicMTask's. The partitioner will annotate
    // each vertex in logicGraph with a 'color' which is really an mtask ID
    // in this context.
    V3Partition partitioner(&logicGraph);
    V3Graph mtasks;
    partitioner.go(&mtasks);

    // Per-mtask bookkeeping, keyed by mtask ID (== vertex color)
    vl_unordered_map<unsigned /*mtask id*/, MTaskState> mtaskStates;

    // Iterate through the entire logicGraph. For each logic node,
    // attach it to a per-MTask ordered list of logic nodes.
    // This is the order we'll execute logic nodes within the MTask.
    //
    // MTasks may span scopes and domains, so sort by both here:
    GraphStream<OrderVerticesByDomainThenScope> emit_logic(&logicGraph);
    const V3GraphVertex* moveVxp;
    while ((moveVxp = emit_logic.nextp())) {
        const MTaskMoveVertex* movep =
            dynamic_cast<const MTaskMoveVertex*>(moveVxp);
        // color() carries the mtask ID assigned by the partitioner
        unsigned mtaskId = movep->color();
        UASSERT(mtaskId > 0,
                "Every MTaskMoveVertex should have an mtask assignment >0");
        if (movep->logicp()) {
            // Add this logic to the per-mtask order
            mtaskStates[mtaskId].m_logics.push_back(movep->logicp());

            // Since we happen to be iterating over every logic node,
            // take this opportunity to annotate each AstVar with the id's
            // of mtasks that consume it and produce it. We'll use this
            // information in V3EmitC when we lay out var's in memory.
            const OrderLogicVertex* logicp = movep->logicp();
            for (const V3GraphEdge* edgep = logicp->inBeginp();
                 edgep; edgep = edgep->inNextp()) {
                const OrderVarVertex* pre_varp =
                    dynamic_cast<const OrderVarVertex*>(edgep->fromp());
                if (!pre_varp) continue;
                AstVar* varp = pre_varp->varScp()->varp();
                // varp depends on logicp, so logicp produces varp,
                // and vice-versa below
                varp->addProducingMTaskId(mtaskId);
            }
            for (const V3GraphEdge* edgep = logicp->outBeginp();
                 edgep; edgep = edgep->outNextp()) {
                const OrderVarVertex* post_varp
                    = dynamic_cast<const OrderVarVertex*>(edgep->top());
                if (!post_varp) continue;
                AstVar* varp = post_varp->varScp()->varp();
                varp->addConsumingMTaskId(mtaskId);
            }
            // TODO? We ignore IO vars here, so those will have empty mtask
            // signatures. But we could also give those mtask signatures.
        }
    }

    // Create the AstExecGraph node which represents the execution
    // of the MTask graph.
    FileLine* rootFlp = new FileLine("AstRoot", 0);
    AstExecGraph* execGraphp = new AstExecGraph(rootFlp);
    m_scopetopp->addActivep(execGraphp);
    v3Global.rootp()->execGraphp(execGraphp);

    // Create CFuncs and bodies for each MTask, in deterministic ID order.
    GraphStream<MTaskVxIdLessThan> emit_mtasks(&mtasks);
    const V3GraphVertex* mtaskVxp;
    while ((mtaskVxp = emit_mtasks.nextp())) {
        const AbstractLogicMTask* mtaskp =
            dynamic_cast<const AbstractLogicMTask*>(mtaskVxp);

        // Create a body for this mtask
        AstMTaskBody* bodyp = new AstMTaskBody(rootFlp);
        MTaskState& state = mtaskStates[mtaskp->id()];
        state.m_mtaskBodyp = bodyp;

        // Create leaf CFunc's to run this mtask's logic,
        // and create a set of AstActive's to call those CFuncs.
        // Add the AstActive's into the AstMTaskBody.
        const AstSenTree* last_domainp = NULL;
        AstCFunc* leafCFuncp = NULL;
        int leafStmts = 0;
        for (MTaskState::Logics::iterator it = state.m_logics.begin();
             it != state.m_logics.end(); ++it) {
            const OrderLogicVertex* logicp = *it;
            if (logicp->domainp() != last_domainp) {
                // Start a new leaf function; don't mix domains in one cfunc.
                leafCFuncp = NULL;
            }
            last_domainp = logicp->domainp();
            AstActive* newActivep = processMoveOneLogic(logicp, leafCFuncp/*ref*/, leafStmts/*ref*/);
            if (newActivep) bodyp->addStmtsp(newActivep);
        }

        // Translate the LogicMTask graph into the corresponding ExecMTask
        // graph, which will outlive V3Order and persist for the remainder
        // of verilator's processing.
        // - The LogicMTask graph points to MTaskMoveVertex's
        //   and OrderLogicVertex's which are ephemeral to V3Order.
        // - The ExecMTask graph and the AstMTaskBody's produced here
        //   persist until code generation time.
        state.m_execMTaskp =
            new ExecMTask(execGraphp->mutableDepGraphp(),
                          bodyp, mtaskp->id());
        // Cross-link each ExecMTask and MTaskBody
        //  Q: Why even have two objects?
        //  A: One is an AstNode, the other is a GraphVertex,
        //     to combine them would involve multiple inheritance...
        state.m_mtaskBodyp->execMTaskp(state.m_execMTaskp);
        // Mirror each dependency edge of the logic mtask graph into the
        // persistent ExecMTask graph.
        for (V3GraphEdge* inp = mtaskp->inBeginp();
             inp; inp = inp->inNextp()) {
            const V3GraphVertex* fromVxp = inp->fromp();
            const AbstractLogicMTask* fromp =
                dynamic_cast<const AbstractLogicMTask*>(fromVxp);
            MTaskState& fromState = mtaskStates[fromp->id()];
            new V3GraphEdge(execGraphp->mutableDepGraphp(),
                            fromState.m_execMTaskp, state.m_execMTaskp, 1);
        }
        execGraphp->addMTaskBody(bodyp);
    }
}
//######################################################################
// OrderVisitor - Top processing
@ -1762,7 +2008,7 @@ void OrderVisitor::process() {
if (debug() && v3Global.opt.dumpTree()) processEdgeReport();
{
if (!v3Global.opt.mtasks()) {
UINFO(2," Construct Move Graph...\n");
processMoveBuildGraph();
if (debug()>=4) m_pomGraph.dumpDotFilePrefixed("ordermv_start"); // Different prefix (ordermv) as it's not the same graph
@ -1771,6 +2017,9 @@ void OrderVisitor::process() {
UINFO(2," Move...\n");
processMove();
} else {
UINFO(2," Set up mtasks...\n");
processMTasks();
}
// Any SC inputs feeding a combo domain must be marked, so we can make them sc_sensitive

View File

@ -21,6 +21,7 @@
//
// V3GraphVertex
// OrderMoveVertex
// MTaskMoveVertex
// OrderEitherVertex
// OrderInputsVertex
// OrderSettleVertex
@ -47,6 +48,7 @@
#include "verilatedos.h"
#include "V3Ast.h"
#include "V3Graph.h"
#include VL_INCLUDE_UNORDERED_MAP
class OrderVisitor;
class OrderMoveVertex;
@ -363,6 +365,57 @@ public:
void domScopep(OrderMoveDomScope* ds) { m_domScopep=ds; }
};
// Similar to OrderMoveVertex, but modified for threaded code generation.
// Each vertex represents either a logic node or a var node from V3Order's
// fine-grained graph; after partitioning, color() carries its mtask ID.
class MTaskMoveVertex : public V3GraphVertex {
    // This could be more compact, since we know m_varp and m_logicp
    // cannot both be set. Each MTaskMoveVertex represents a logic node
    // or a var node, it can't be both.
    OrderLogicVertex* m_logicp;  // Logic represented by this vertex (or NULL)
    const OrderEitherVertex* m_varp;  // Var represented by this vertex (or NULL)
    const AstScope* m_scopep;    // Scope of the represented node
    const AstSenTree* m_domainp; // Sensitivity domain of the represented node
protected:
    friend class OrderVisitor;
    // NOTE(review): no class named MTaskMoveVertexMaker is visible here
    // (only OrderMTaskMoveVertexMaker) — confirm this friend is needed.
    friend class MTaskMoveVertexMaker;
public:
    MTaskMoveVertex(V3Graph* graphp, OrderLogicVertex* logicp,
                    const OrderEitherVertex* varp,
                    const AstScope* scopep, const AstSenTree* domainp)
        : V3GraphVertex(graphp), m_logicp(logicp),
          m_varp(varp), m_scopep(scopep), m_domainp(domainp) {
        UASSERT(!(logicp && varp),
                "MTaskMoveVertex: logicp and varp may not both be set!\n");
    }
    virtual ~MTaskMoveVertex() {}
    // Cloning is unsupported; vertices are created only via their maker.
    virtual MTaskMoveVertex* clone(V3Graph* graphp) const {
        v3fatalSrc("Unsupported"); return NULL; }
    virtual OrderVEdgeType type() const { return OrderVEdgeType::VERTEX_MOVE; }
    // Dot-graph color for debug dumps: logic keeps its own color, vars yellow
    virtual string dotColor() const {
        if (logicp()) return logicp()->dotColor();
        else return "yellow";
    }
    // Debug label: logic name plus domain/scope pointers and mtask ID
    virtual string name() const {
        string nm;
        if (logicp()) {
            nm = logicp()->name();
            nm += (string("\\nMV:")
                   +" d="+cvtToStr((void*)logicp()->domainp())
                   +" s="+cvtToStr((void*)logicp()->scopep())
                   // "color()" represents the mtask ID.
                   +"\\nt="+cvtToStr(color()));
        } else {
            nm = "nolog\\nt="+cvtToStr(color());
        }
        return nm;
    }
    // ACCESSORS
    OrderLogicVertex* logicp() const { return m_logicp; }
    const OrderEitherVertex* varp() const { return m_varp; }
    const AstScope* scopep() const { return m_scopep; }
    const AstSenTree* domainp() const { return m_domainp; }
};
//######################################################################
// Edge types

2759
src/V3Partition.cpp Normal file

File diff suppressed because it is too large Load Diff

99
src/V3Partition.h Normal file
View File

@ -0,0 +1,99 @@
// -*- mode: C++; c-file-style: "cc-mode" -*-
//*************************************************************************
// DESCRIPTION: Verilator: Threading's logic to mtask partitioner
//
// Code available from: http://www.veripool.org/verilator
//
//*************************************************************************
//
// Copyright 2003-2018 by Wilson Snyder. This program is free software; you can
// redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License
// Version 2.0.
//
// Verilator is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
//*************************************************************************
#ifndef _V3PARTITION_H_
#define _V3PARTITION_H_
#include "config_build.h"
#include "verilatedos.h"
#include <list>
#include "V3Graph.h"
#include "V3OrderGraph.h"
class LogicMTask;
typedef vl_unordered_map<const MTaskMoveVertex*, LogicMTask*> Vx2MTaskMap;
//*************************************************************************
/// V3Partition takes the fine-grained logic graph from V3Order and
/// collapses it into a coarse-grained graph of AbstractLogicMTask's, each
/// of which contains a set of the logic nodes from the fine-grained
/// graph.
class V3Partition {
    // MEMBERS
    V3Graph* m_fineDepsGraphp;  // Fine-grained dependency graph (not owned)
public:
    // CONSTRUCTORS
    explicit V3Partition(V3Graph* fineDepsGraphp)
        : m_fineDepsGraphp(fineDepsGraphp) {}
    ~V3Partition() {}

    // METHODS
    // Fill in the provided empty graph with AbstractLogicMTask's and their
    // interdependencies.
    void go(V3Graph* mtasksp);

    // Internal consistency checks, run only under --debug-self-test.
    static void selfTest();

    // Print out a hash of the shape of graphp. Only needed to debug the
    // origin of some nondeterminism; otherwise this is pretty useless.
    static void hashGraphDebug(const V3Graph* graphp, const char* debugName);

    // Print debug stats about graphp whose nodes must be AbstractMTask's.
    static void debugMTaskGraphStats(const V3Graph* graphp, const string& name);

    // Operate on the final ExecMTask graph, immediately prior to code
    // generation time.
    static void finalize();

private:
    static void finalizeCosts(V3Graph* execMTaskGraphp);
    static void setupMTaskDeps(V3Graph* mtasksp, const Vx2MTaskMap* vx2mtaskp);

    VL_DEBUG_FUNC;  // Declare debug()
    VL_UNCOPYABLE(V3Partition);
};
//*************************************************************************
// Map a pointer into an ID, for e.g. nodep to mtask mappings
// Assigns a stable small integer ID to each distinct pointer, in order of
// first appearance. Gives deterministic orderings that do not depend on
// allocator-assigned pointer values.
class PartPtrIdMap {
private:
    // TYPES
    typedef vl_unordered_map <const void*, vluint64_t> PtrMap;
    // MEMBERS
    // Both mutable so findId() can assign IDs while remaining const
    mutable vluint64_t m_nextId;  // Next unassigned ID
    mutable PtrMap m_id;          // Pointer -> assigned ID
public:
    // CONSTRUCTORS
    PartPtrIdMap() : m_nextId(0) {}
    // METHODS
    // Return the ID for ptrp, assigning the next free ID on first sight.
    vluint64_t findId(const void* ptrp) const {
        // Single hash lookup (the original find-then-insert did two):
        // speculatively insert m_nextId; if the key already existed the
        // map is unchanged and the previously assigned ID is returned.
        std::pair<PtrMap::iterator, bool> inserted
            = m_id.insert(PtrMap::value_type(ptrp, m_nextId));
        if (inserted.second) ++m_nextId;  // New pointer consumed an ID
        return inserted.first->second;
    }
};
#endif // Guard

108
src/V3PartitionGraph.h Normal file
View File

@ -0,0 +1,108 @@
// -*- mode: C++; c-file-style: "cc-mode" -*-
//*************************************************************************
// DESCRIPTION: Verilator: Threading's graph structures
//
// Code available from: http://www.veripool.org/verilator
//
//*************************************************************************
//
// Copyright 2003-2018 by Wilson Snyder. This program is free software; you can
// redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License
// Version 2.0.
//
// Verilator is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
//*************************************************************************
#ifndef _V3PARTITIONGRAPH_H_
#define _V3PARTITIONGRAPH_H_
#include "config_build.h"
#include "verilatedos.h"
#include <list>
#include "V3Graph.h"
#include "V3OrderGraph.h"
//*************************************************************************
// MTasks and graph structures
// Abstract base for one unit of work (an "mtask") in the coarse-grained
// execution graph. Concrete mtasks provide a unique serial ID and an
// estimated cost.
class AbstractMTask : public V3GraphVertex {
public:
    // CONSTRUCTORS
    // 'explicit' to match the file's convention for single-argument
    // constructors and avoid accidental V3Graph*-to-vertex conversions.
    explicit AbstractMTask(V3Graph* graphp) : V3GraphVertex(graphp) {}
    virtual ~AbstractMTask() {}
    // METHODS
    virtual uint32_t id() const = 0;    // Unique id of this mtask
    virtual uint32_t cost() const = 0;  // Estimated cost, abstract time units
};
// An mtask that owns a list of logic vertices from the fine-grained
// move graph; produced by the V3Partition partitioner.
class AbstractLogicMTask : public AbstractMTask {
public:
    // TYPES
    typedef std::list<MTaskMoveVertex*> VxList;
    // CONSTRUCTORS
    // 'explicit' to match the file's convention for single-argument
    // constructors and avoid accidental V3Graph*-to-vertex conversions.
    explicit AbstractLogicMTask(V3Graph* graphp) : AbstractMTask(graphp) {}
    virtual ~AbstractLogicMTask() {}
    // METHODS
    // Set of logic vertices in this mtask. Order is not significant.
    virtual const VxList* vertexListp() const = 0;
    virtual uint32_t id() const = 0;  // Unique id of this mtask.
    virtual uint32_t cost() const = 0;
};
// An mtask in the final, persistent execution graph that survives V3Order
// and drives code generation. Holds the AstMTaskBody plus static-scheduling
// metadata (priority, cost, thread assignment, pack order).
class ExecMTask : public AbstractMTask {
private:
    AstMTaskBody* m_bodyp;  // Task body
    uint32_t m_id;  // Unique id of this mtask.
    uint32_t m_priority;  // Predicted critical path from the start of
    // this mtask to the ends of the graph that are reachable from this
    // mtask. In abstract time units.
    uint32_t m_cost;  // Predicted runtime of this mtask, in the same
    // abstract time units as priority().
    uint32_t m_thread;  // Thread for static (pack_mtasks) scheduling,
    // or 0xffffffff if not yet assigned.
    const ExecMTask* m_packNextp;  // Next for static (pack_mtasks) scheduling
    bool m_threadRoot;  // Is root thread
    VL_UNCOPYABLE(ExecMTask);
public:
    ExecMTask(V3Graph* graphp, AstMTaskBody* bodyp, uint32_t id)
        : AbstractMTask(graphp),
          m_bodyp(bodyp),
          m_id(id),
          m_priority(0),
          m_cost(0),
          m_thread(0xffffffff),  // 0xffffffff == not yet assigned to a thread
          m_packNextp(NULL),
          m_threadRoot(false) {}
    // ACCESSORS
    AstMTaskBody* bodyp() const { return m_bodyp; }
    virtual uint32_t id() const { return m_id; }
    uint32_t priority() const { return m_priority; }
    void priority(uint32_t pri) { m_priority = pri; }
    virtual uint32_t cost() const { return m_cost; }
    void cost(uint32_t cost) { m_cost = cost; }
    void thread(uint32_t thread) { m_thread = thread; }
    uint32_t thread() const { return m_thread; }
    void packNextp(const ExecMTask* nextp) { m_packNextp = nextp; }
    const ExecMTask* packNextp() const { return m_packNextp; }
    bool threadRoot() const { return m_threadRoot; }
    void threadRoot(bool threadRoot) { m_threadRoot = threadRoot; }
    string cFuncName() const {
        // If this MTask maps to a C function, this should be the name
        return string("__Vmtask")+"__"+cvtToStr(m_id);
    }
    string name() const { return string("mt")+cvtToStr(id()); }
    // Dump debug info: name, address, then only the fields that are set
    void dump(std::ostream& str) const {
        str <<name()<<"."<<((void*)this);
        if (priority() || cost()) str <<" [pr="<<priority()<<" c="<<cvtToStr(cost())<<"]";
        if (thread() != 0xffffffff) str <<" th="<<thread();
        if (threadRoot()) str <<" [ROOT]";
        if (packNextp()) str <<" nx="<<packNextp()->name();
    }
};
// Stream insertion for debug output; delegates to ExecMTask::dump().
inline std::ostream& operator<<(std::ostream& os, const ExecMTask& rhs) {
    rhs.dump(os);
    return os;
}
#endif // Guard

View File

@ -182,6 +182,7 @@ private:
AstNode* m_chgSubParentp;// Which node has call to m_chgSubFuncp
int m_chgSubStmts; // Statements under function being built
AstVarScope* m_activityVscp; // Activity variable
uint32_t m_activityNumber; // Count of fields in activity variable
uint32_t m_code; // Trace ident code# being assigned
V3Graph m_graph; // Var/CFunc tracking
TraceActivityVertex* m_alwaysVtxp; // "Always trace" vertex
@ -297,7 +298,7 @@ private:
void assignActivity() {
// Select activity numbers and put into each CFunc vertex
uint32_t activityNumber = 1; // Note 0 indicates "slow"
m_activityNumber = 1; // Note 0 indicates "slow"
for (V3GraphVertex* itp = m_graph.verticesBeginp(); itp; itp=itp->verticesNextp()) {
if (TraceActivityVertex* vvertexp = dynamic_cast<TraceActivityVertex*>(itp)) {
if (!vvertexp->activityCodeValid()) {
@ -306,17 +307,39 @@ private:
// This makes us need less activityNumbers and so speeds up the fast path.
vvertexp->activityCode(TraceActivityVertex::ACTIVITY_SLOW);
} else {
vvertexp->activityCode(activityNumber++);
vvertexp->activityCode(m_activityNumber++);
}
}
}
}
// Insert global variable
if (!activityNumber) activityNumber++; // For simplicity, always create it
int activityBits = VL_WORDS_I(activityNumber)*VL_WORDSIZE; // For tighter code; round to next 32 bit point.
AstVar* newvarp = new AstVar (m_chgFuncp->fileline(), AstVarType::MODULETEMP,
"__Vm_traceActivity", VFlagBitPacked(), activityBits);
AstVar* newvarp;
if (v3Global.opt.mtasks()) {
// Create a vector of bytes, not bits, for the tracing vector,
// so that we can set them atomically without locking.
//
// TODO: It would be slightly faster to have a bit vector per
// chain of packed MTasks, but we haven't packed the MTasks yet.
// If we support fully threaded tracing in the future, it would
// make sense to improve this at that time.
AstNodeDType* newScalarDtp
= new AstBasicDType(m_chgFuncp->fileline(), VFlagLogicPacked(), 1);
v3Global.rootp()->typeTablep()->addTypesp(newScalarDtp);
AstNodeDType* newArrDtp = new AstUnpackArrayDType(
m_chgFuncp->fileline(),
newScalarDtp,
new AstRange(m_chgFuncp->fileline(),
VNumRange(m_activityNumber-1, 0, false)));
v3Global.rootp()->typeTablep()->addTypesp(newArrDtp);
newvarp = new AstVar(m_chgFuncp->fileline(),
AstVarType::MODULETEMP,
"__Vm_traceActivity", newArrDtp);
} else {
// For tighter code; round to next 32 bit point.
int activityBits = VL_WORDS_I(m_activityNumber)*VL_WORDSIZE;
newvarp = new AstVar(m_chgFuncp->fileline(), AstVarType::MODULETEMP,
"__Vm_traceActivity", VFlagBitPacked(), activityBits);
}
m_topModp->addStmtp(newvarp);
AstVarScope* newvscp = new AstVarScope(newvarp->fileline(), m_highScopep, newvarp);
m_highScopep->addVarp(newvscp);
@ -329,15 +352,23 @@ private:
FileLine* fl = vvertexp->insertp()->fileline();
uint32_t acode = vvertexp->activityCode();
vvertexp->insertp()->addNextHere
(new AstAssign (fl,
new AstSel (fl, new AstVarRef(fl, m_activityVscp, true),
acode, 1),
new AstConst (fl, AstConst::LogicTrue())));
(new AstAssign(fl, selectActivity(fl, acode, true),
new AstConst(fl, AstConst::LogicTrue())));
}
}
}
}
AstNode* selectActivity(FileLine* flp, uint32_t acode, bool lvalue) {
if (v3Global.opt.mtasks()) {
return new AstArraySel(
flp, new AstVarRef(flp, m_activityVscp, lvalue), acode);
} else {
return new AstSel(
flp, new AstVarRef(flp, m_activityVscp, lvalue), acode, 1);
}
}
AstCFunc* newCFunc(AstCFuncType type, const string& name, AstCFunc* basep) {
AstCFunc* funcp = new AstCFunc(basep->fileline(), name, basep->scopep());
funcp->slow(basep->slow());
@ -453,8 +484,7 @@ private:
AstNode* condp = NULL;
for (ActCodeSet::const_iterator csit = actset.begin(); csit!=actset.end(); ++csit) {
uint32_t acode = *csit;
AstNode* selp = new AstSel (fl, new AstVarRef(fl, m_activityVscp, false),
acode, 1);
AstNode* selp = selectActivity(fl, acode, false);
if (condp) condp = new AstOr (fl, condp, selp);
else condp = selp;
}
@ -473,11 +503,19 @@ private:
// Clear activity after tracing completes
FileLine* fl = m_chgFuncp->fileline();
AstNode* clrp = new AstAssign (fl,
new AstVarRef(fl, m_activityVscp, true),
new AstConst(fl, V3Number(fl, m_activityVscp->width())));
m_fullFuncp->addFinalsp(clrp->cloneTree(true));
m_chgFuncp->addFinalsp(clrp);
if (v3Global.opt.mtasks()) {
for (uint32_t i = 0; i < m_activityNumber; ++i) {
AstNode* clrp = new AstAssign(fl, selectActivity(fl, i, true),
new AstConst(fl, AstConst::LogicFalse()));
m_fullFuncp->addFinalsp(clrp->cloneTree(true));
m_chgFuncp->addFinalsp(clrp);
}
} else {
AstNode* clrp = new AstAssign(fl, new AstVarRef(fl, m_activityVscp, true),
new AstConst(fl, V3Number(fl, m_activityVscp->width())));
m_fullFuncp->addFinalsp(clrp->cloneTree(true));
m_chgFuncp->addFinalsp(clrp);
}
}
uint32_t assignDeclCode(AstTraceDecl* nodep) {
@ -699,6 +737,7 @@ public:
m_chgSubFuncp = NULL;
m_chgSubParentp = NULL;
m_chgSubStmts = 0;
m_activityNumber = 0;
m_code = 0;
m_finding = false;
m_funcNum = 0;

View File

@ -73,6 +73,7 @@
#include "V3Param.h"
#include "V3Parse.h"
#include "V3ParseSym.h"
#include "V3Partition.h"
#include "V3PreShell.h"
#include "V3Premit.h"
#include "V3Reloop.h"
@ -524,6 +525,14 @@ void process () {
V3EmitC::emitcSyms();
V3EmitC::emitcTrace();
}
if (!v3Global.opt.xmlOnly()
&& v3Global.opt.mtasks()) {
// Finalize our MTask cost estimates and pack the mtasks into
// threads. Must happen pre-EmitC which relies on the packing
// order. Must happen post-V3LifePost which changes the relative
// costs of mtasks.
V3Partition::finalize();
}
if (!v3Global.opt.xmlOnly()) { // Unfortunately we have some lint checks in emitc.
V3EmitC::emitc();
}
@ -607,8 +616,11 @@ int main(int argc, char** argv, char** env) {
VHashSha1::selfTest();
AstBasicDTypeKwd::selfTest();
V3Graph::selfTest();
V3TSP::selfTest();
V3ScoreboardBase::selfTest();
if (v3Global.opt.debugSelfTest()) {
V3TSP::selfTest();
V3ScoreboardBase::selfTest();
V3Partition::selfTest();
}
// Read first filename
v3Global.readFiles();

View File

@ -44,7 +44,7 @@ endif
.PHONY: test
test:
$(PERL) driver.pl $(DRIVER_FLAGS) --vlt --dist
$(PERL) driver.pl $(DRIVER_FLAGS) --vlt --vltmt --dist
######################################################################
@ -61,6 +61,9 @@ nc:
vlt:
$(PERL) driver.pl $(DRIVER_FLAGS) --vlt --stop
vltmt:
$(PERL) driver.pl $(DRIVER_FLAGS) --vltmt --stop
######################################################################
random:

View File

@ -45,6 +45,7 @@ our %All_Scenarios
nc => ["simulator", "nc"],
vcs => ["simulator", "vcs"],
vlt => ["simulator", "vlt_all", "vlt"],
vltmt => ["simulator", "vlt_all", "vltmt"],
);
#======================================================================
@ -104,6 +105,7 @@ if (! GetOptions (
"ms!" => sub { $opt_scenarios{ms} = $_[1]; },
"nc!" => sub { $opt_scenarios{nc} = $_[1]; },
"vlt!" => sub { $opt_scenarios{vlt} = $_[1]; },
"vltmt!" => sub { $opt_scenarios{vltmt} = $_[1]; },
"vcs!" => sub { $opt_scenarios{vcs} = $_[1]; },
"<>" => \&parameter,
)) {
@ -322,6 +324,7 @@ sub new {
$self->{scenario} ||= "ghdl" if $self->{ghdl};
$self->{scenario} ||= "vcs" if $self->{vcs};
$self->{scenario} ||= "vlt" if $self->{vlt};
$self->{scenario} ||= "vltmt" if $self->{vltmt};
$self->{scenario} ||= "nc" if $self->{nc};
$self->{scenario} ||= "ms" if $self->{ms};
$self->{scenario} ||= "iv" if $self->{iv};
@ -407,6 +410,7 @@ sub new {
ms_run_flags => [split(/\s+/,"-lib $self->{obj_dir}/work -c -do 'run -all;quit' ")],
# Verilator
vlt => 0,
vltmt => 0,
verilator_flags => ["-cc",
"-Mdir $self->{obj_dir}",
"-OD", # As currently disabled unless -O3
@ -420,7 +424,7 @@ sub new {
%$self};
bless $self, $class;
$self->{vlt_all} = $self->{vlt}; # Any Verilator scenario
$self->{vlt_all} = $self->{vlt} || $self->{vltmt}; # Any Verilator scenario
$self->{VM_PREFIX} ||= "V".$self->{name};
$self->{stats} ||= "$self->{obj_dir}/V".$self->{name}."__stats.txt";
@ -593,6 +597,8 @@ sub compile_vlt_flags {
unshift @verilator_flags, "--gdbbt" if $opt_gdbbt;
unshift @verilator_flags, "--x-assign unique"; # More likely to be buggy
unshift @verilator_flags, "--trace" if $opt_trace;
unshift @verilator_flags, "--threads 3" if $param{vltmt};
unshift @verilator_flags, "--debug-partition" if $param{vltmt};
if (defined $opt_optimize) {
my $letters = "";
if ($opt_optimize =~ /[a-zA-Z]/) {
@ -746,6 +752,11 @@ sub compile {
return 1;
}
if ($self->{vltmt} && !$self->cfg_with_threaded) {
$self->skip("Test requires Verilator configured with threads\n");
return 1;
}
if (!$param{fails} && $param{verilator_make_gcc}
&& $param{make_main}) {
$self->_make_main();
@ -2045,7 +2056,11 @@ Run Synopsys VCS simulator tests.
=item --vlt
Run Verilator tests. Default unless another scenario flag is provided.
Run Verilator tests in single-threaded mode. Default unless another scenario flag is provided.
=item --vltmt
Run Verilator tests in multithreaded mode.
=back

22
test_regress/t/t_a_selftest.pl Executable file
View File

@ -0,0 +1,22 @@
#!/usr/bin/perl
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2003 by Wilson Snyder. This program is free software; you can
# redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
# Runs Verilator's built-in C++ self-tests via --debug-self-test; no
# generated model is compiled or executed.  Runs under both single- and
# multithreaded scenarios (vlt_all).
scenarios(vlt_all => 1);
top_filename("t/t_EXAMPLE.v");
compile(
# Invoke Verilator only; skip building a shell/main and skip gcc entirely,
# since the self-tests run inside the verilator binary itself.
verilator_flags2 => ['--debug-self-test'],
verilator_make_gcc => 0,
make_top_shell => 0,
make_main => 0,
);
ok(1);
1;

View File

@ -15,7 +15,8 @@ compile(
if ($Self->{vlt_all}) {
file_grep ($Self->{stats}, qr/Optimizations, Tables created\s+(\d+)/i, 10);
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i, 8);
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i,
($Self->{vltmt} ? 0 : 8));
}
execute(

21
test_regress/t/t_dpi_threads.pl Executable file
View File

@ -0,0 +1,21 @@
#!/usr/bin/perl
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2018 by Wilson Snyder. This program is free software; you can
# redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
# Multithreaded-only: confirms Verilator's DPI-call serialization keeps the
# two DPI-calling always blocks (see comments in t/t_dpi_threads.v) from
# running concurrently.  --no-threads-coarsen disables MTask coarsening so
# the hazard-resolved MTask graph carries through to the generated code.
scenarios(vltmt => 1);
compile(
v_flags2 => ["t/t_dpi_threads_c.cpp --no-threads-coarsen"],
);
execute(
# The C side counts collisions; the design $finishes cleanly when none.
check_finished => 1,
);
ok(1);
1;

View File

@ -0,0 +1,62 @@
// DESCRIPTION: Verilator: Verilog Test module
//
// Copyright 2018 by Wilson Snyder. This program is free software; you can
// redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License
// Version 2.0.
// DPI imports implemented in t_dpi_threads_c.cpp: $dpii_sys marks a
// (deliberately slow) critical section; $dpii_failure reports whether two
// calls ever overlapped.
import "DPI-C" dpii_sys_task = function void \$dpii_sys ();
import "DPI-C" dpii_failure = function int \$dpii_failure ();
module t (clk);
input clk;
integer cyc;
integer failure;
initial cyc = 0;
`ifndef verilator
`error "Only Verilator supports PLI-ish DPI calls."
`endif
// Checker: on cycle 2, ask the DPI side whether a collision was seen;
// $stop on failure, otherwise finish normally.
always @ (posedge clk) begin
if (cyc == 2) begin
failure = $dpii_failure();
$write("* failure = %0d\n", failure);
if (failure > 0) begin
$stop;
end
$write("*-* All Finished *-*\n");
$finish;
end
cyc <= cyc + 1;
end
// The purpose of this test is to confirm that the DPI-call serialization
// code in V3Partition does ensure that these DPI calls do not run
// concurrently.
//
// Alternatively, the test may be run with "--threads-dpi all" in which case
// it should confirm that the calls do run concurrently and do detect a
// collision (they should, if the test is set up right.) This is
// t_dpi_threads_collide.pl.
//
// Q) Is it a risk that the partitioner will merge or serialize these always
// blocks, just by luck, even if the DPI-call serialization code fails?
//
// A) Yes, that's why t_dpi_threads_collide.pl also passes
// --no-threads-coarsen to disable MTask coarsening. This ensures that
// the MTask graph at the end of FixDataHazards (where we resolve DPI
// hazards) is basically the final MTasks graph, and that data hazards
// which persist beyond FixDataHazards should persist in the final
// generated C code.
always @ (posedge clk) begin
$dpii_sys();
end
always @ (posedge clk) begin
$dpii_sys();
end
endmodule

View File

@ -0,0 +1,78 @@
// -*- mode: C++; c-file-style: "cc-mode" -*-
//*************************************************************************
//
// Copyright 2018-2018 by Wilson Snyder. This program is free software; you can
// redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License.
// Version 2.0.
//
// Verilator is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
//*************************************************************************
#include <atomic>
#include <cstdio>
#include <iostream>
#include <unistd.h>
#include "svdpi.h"
//======================================================================
#if defined(VERILATOR)
# ifdef T_DPI_THREADS_COLLIDE
# include "Vt_dpi_threads_collide__Dpi.h"
# else
# include "Vt_dpi_threads__Dpi.h"
# endif
#elif defined(VCS)
# include "../vc_hdrs.h"
#elif defined(CADENCE)
# define NEED_EXTERNS
#else
# error "Unknown simulator for DPI test"
#endif
#ifdef NEED_EXTERNS
extern "C" {
extern void dpii_sys_task();
extern int dpii_failure();
}
#endif
//======================================================================
// Shared collision-detection flags for the DPI test.
struct state {
    std::atomic<bool> task_is_running;  // true while a dpii_sys_task() call is active
    std::atomic<int> failure;           // nonzero once an overlap has been observed
    state()
        : task_is_running(false)
        , failure(0) {}
};

static state st;

// DPI entry point: mark ourselves running, linger, then clear the flag.
// If the flag was already set on entry, another thread is inside this
// routine at the same time -- record that as a failure.
void dpii_sys_task() {
    const bool was_running = st.task_is_running.exchange(true);
    if (was_running) {
        // Another task is running. This is a collision.
        st.failure.store(1);
        std::cerr << "t_dpi_threads_c.cpp dpii_sys_task() saw threads collide.\n";
    } else {
        std::cerr << "t_dpi_threads_c.cpp dpii_sys_task() no collision. @" << &st.task_is_running << "\n";
    }
    // Spend some time in the DPI call so that a possible collision almost
    // certainly happens.  Not a hard guarantee of catching every race, but
    // one second dwarfs the expected runtime of everything else in the test.
    sleep(1);
    st.task_is_running.exchange(false);
}

// DPI entry point: report whether any collision has been seen so far.
int dpii_failure() { return st.failure; }

View File

@ -0,0 +1,28 @@
#!/usr/bin/perl
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2018 by Wilson Snyder. This program is free software; you can
# redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
# Multithreaded-only companion to t_dpi_threads: with --threads-dpi all
# the DPI imports may run concurrently, so the C side should observe a
# collision and the design should $stop (hence fails => 1 below).
scenarios(vltmt => 1);
top_filename("t/t_dpi_threads.v");
compile(
v_flags2 => ["t/t_dpi_threads_c.cpp --threads-dpi all --no-threads-coarsen"],
);
# Similar to t_dpi_threads, which confirms that Verilator can prevent a
# race between DPI import calls, this test confirms that the race exists
# and that the DPI C code can detect it under --threads-dpi all
# mode.
#
execute(
fails => 1,
);
ok(1);
1;

View File

@ -43,7 +43,10 @@ gen($Self->{top_filename}, 6000);
compile(
verilator_flags2=>["-x-assign fast --x-initial fast",
"-Wno-UNOPTTHREADS",
],
# The slow V3Partition asserts are just too slow
# in this test. They're disabled just for performance
# reasons:
"--no-debug-partition"],
);
execute(

74
test_regress/t/t_gantt.pl Executable file
View File

@ -0,0 +1,74 @@
#!/usr/bin/perl
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2003 by Wilson Snyder. This program is free software; you can
# redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
use IO::File;

# Exercises bin/verilator_gantt; only meaningful for the multithreaded
# regression, so restrict to the vltmt scenario.
scenarios(vltmt => 1);

# Any design works here, as long as it simulates for enough cycles that
# thread profiling actually collects data.
top_filename("t/t_gen_alw.v");

compile(
    v_flags2 => ["--prof-threads"]
    );

execute(
    all_run_flags => ["+verilator+prof+threads+start+2",
                      " +verilator+prof+threads+window+2",
                      " +verilator+prof+threads+file+$Self->{obj_dir}/profile_threads.dat",
    ],
    check_finished => 1,
    );

# For now, verilator_gantt still reads from STDIN
# (probably it should take a file, gantt.dat like verilator_profcfunc)
# The profiling data still goes direct to the runtime's STDOUT
# (maybe that should go to a separate file - gantt.dat?)
run(cmd => ["$ENV{VERILATOR_ROOT}/bin/verilator_gantt",
            "$Self->{obj_dir}/profile_threads.dat",
            "--vcd $Self->{obj_dir}/profile_threads.vcd",
            "> $Self->{obj_dir}/gantt.log"]);

# Expect three rows of gantt chart (one per thread), each containing an
# even number of mtask bars such as "[123--]".
my $gantt_line_ct = 0;
my $global_mtask_ct = 0;
{
    my $log_fh = IO::File->new("<$Self->{obj_dir}/gantt.log")
        or error("$! $Self->{obj_dir}/gantt.log");
    while (my $row = ($log_fh && $log_fh->getline)) {
        next if $row !~ m/^ t:/;  # keep only the per-thread timeline rows
        $gantt_line_ct++;
        # Each '[' opens one mtask bar; the digits, hyphens, or closing ']'
        # may be absent depending on exact timing.
        my @bars = split(/\[/, $row);
        shift @bars;  # discard the text preceding the first '['
        my $row_mtask_ct = scalar @bars;
        $global_mtask_ct += $row_mtask_ct;
        error("odd number of mtasks found") if $row_mtask_ct % 2 != 0;
    }
}
error("wrong number of gantt lines") if $gantt_line_ct != 3;
error("wrong number of mtasks, should be > 0") if $global_mtask_ct == 0;
print "Found $gantt_line_ct lines of gantt data with $global_mtask_ct mtasks\n"
    if $Self->{verbose};

# Diff the VCD against itself, purely to check that it parses.
vcd_identical("$Self->{obj_dir}/profile_threads.vcd", "$Self->{obj_dir}/profile_threads.vcd");

ok(1);
1;

View File

@ -117,6 +117,10 @@ compile(
);
execute(
all_run_flags => ["+verilator+prof+threads+start+100",
" +verilator+prof+threads+window+2",
" +verilator+prof+threads+file+$Self->{obj_dir}/profile_threads.dat",
],
check_finished => 1,
);

View File

@ -13,6 +13,7 @@ foreach my $prog (
"../bin/verilator",
"../bin/verilator_coverage",
"../bin/verilator_difftree",
"../bin/verilator_gantt",
"../bin/verilator_profcfunc",
) {
run(fails => 1,

View File

@ -38,7 +38,8 @@ sub checkRelativeRefs {
if ($Self->{vlt_all}) {
# We expect to combine sequent functions across multiple instances of
# l2, l3, l4, l5. If this number drops, please confirm this has not broken.
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i, 52);
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i,
($Self->{vltmt} ? 84 : 52));
# Expect absolute refs in CFuncs for t (top module) and l1 (because it
# has only one instance)

View File

@ -18,7 +18,8 @@ compile(
if ($Self->{vlt_all}) {
# Fewer optimizations than t_inst_tree_inl0_pub1 which allows
# relative CFuncs:
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i, 31);
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i,
($Self->{vltmt} ? 0 : 31));
# Should not find any 'this->' except some 'this->__VlSymsp'
my @files = `ls $Self->{obj_dir}/*.cpp`;

View File

@ -7,8 +7,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
scenarios(simulator => 1);
$Self->cfg_with_threaded or skip("No thread support");
scenarios(vltmt => 1);
top_filename("t/t_threads_counter.v");

View File

@ -7,8 +7,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
scenarios(simulator => 1);
$Self->cfg_with_threaded or skip("No thread support");
scenarios(vltmt => 1);
top_filename("t/t_threads_counter.v");

View File

@ -0,0 +1,23 @@
#!/usr/bin/perl
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2003-2009 by Wilson Snyder. This program is free software; you can
# redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
# Multithreaded-only: build the shared counter test with 4 threads and
# check the model runs to completion.
# NOTE(review): assumes this --threads 4 overrides the driver's default
# "--threads 3" for vltmt -- confirm flag precedence in the driver.
scenarios(vltmt => 1);
top_filename("t/t_threads_counter.v");
compile(
verilator_flags2 => ['--cc --threads 4'],
);
execute(
check_finished => 1,
);
ok(1);
1;

View File

@ -0,0 +1,25 @@
#!/usr/bin/perl
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2003-2009 by Wilson Snyder. This program is free software; you can
# redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
# Multithreaded-only: build with --debug-nondeterminism, which presumably
# logs "hash of shape" lines during verilation (verified by the grep on
# the compile log below) -- confirm against the option's implementation.
scenarios(vltmt => 1);
top_filename("t/t_threads_counter.v");
compile(
verilator_flags2 => ['--cc --threads 2 --debug-nondeterminism'],
);
execute(
check_finished => 1,
);
# The instrumentation must actually have run during the compile.
file_grep("$Self->{obj_dir}/vlt_compile.log", qr/hash of shape/i);
ok(1);
1;

View File

@ -13,7 +13,12 @@ my $root = "..";
compile(
# Can't use --coverage and --savable together, so cheat and compile inline
verilator_flags2 => ['--cc --coverage-toggle --coverage-line --coverage-user --trace --vpi $root/include/verilated_save.cpp'],
verilator_flags2 => ["--cc",
"--coverage-toggle --coverage-line --coverage-user",
"--trace --vpi ",
($Self->cfg_with_threaded
? "--threads 2 $root/include/verilated_threads.cpp" : ""),
"$root/include/verilated_save.cpp"],
);
execute(
@ -43,7 +48,8 @@ foreach my $dfile (glob("$Self->{obj_dir}/*.d")) {
foreach my $file (sort keys %hit) {
if (!$hit{$file}
&& $file !~ /_sc/) {
&& $file !~ /_sc/
&& ($file !~ /_thread/ || $Self->cfg_with_threaded)) {
error("Include file not covered by t_verilated_all test: ",$file);
}
}

View File

@ -7,8 +7,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
scenarios(simulator => 1);
$Self->cfg_with_threaded or skip("No thread support");
scenarios(vltmt => 1);
top_filename("t/t_verilated_all.v");