mirror of https://github.com/verilator/verilator.git
MAJOR: Add multithreaded model generation.
This commit is contained in:
parent 0070520edb
commit ec8dbbffed
7 Changes
@ -5,15 +5,18 @@ The contributors that suggested a given feature are shown in []. Thanks!

* Verilator 4.000 devel

** This is a major release. Any patches may require major rework to apply.
[Thanks everyone]

** Add multithreaded model generation.

** Add runtime arguments.

** Fix internals to be C++ null-pointer-check clean.

*** Better optimize large always block splitting, bug1244. [John Coiner]

*** Add new reloop optimization for repetitive assignment compression.

**** Fix internals to be C++ null-pointer-check clean.

**** Fix internals to avoid 'using namespace std'.

**** Fix Verilation performance issues, bug1316. [John Coiner]

@ -120,6 +120,7 @@ DISTFILES_INC = $(INFOS) .gitignore Artistic COPYING COPYING.LESSER \
|
||||
bin/verilator \
|
||||
bin/verilator_coverage \
|
||||
bin/verilator_difftree \
|
||||
bin/verilator_gantt \
|
||||
bin/verilator_includer \
|
||||
bin/verilator_profcfunc \
|
||||
doxygen-mainpage doxygen.config veripool-logo.png \
|
||||
@ -154,6 +155,7 @@ DISTFILES_INC = $(INFOS) .gitignore Artistic COPYING COPYING.LESSER \
|
||||
INST_PROJ_FILES = \
|
||||
bin/verilator \
|
||||
bin/verilator_coverage \
|
||||
bin/verilator_gantt \
|
||||
bin/verilator_includer \
|
||||
bin/verilator_profcfunc \
|
||||
include/verilated.mk \
|
||||
@ -272,12 +274,12 @@ internals.pdf: internals.pod Makefile
|
||||
|
||||
# See uninstall also - don't put wildcards in this variable, it might uninstall other stuff
|
||||
VL_INST_BIN_FILES = verilator verilator_bin verilator_bin_dbg verilator_coverage_bin_dbg \
|
||||
verilator_coverage verilator_includer verilator_profcfunc
|
||||
verilator_coverage verilator_gantt verilator_includer verilator_profcfunc
|
||||
# Some scripts go into both the search path and pkgdatadir,
|
||||
# so they can be found by the user, and under $VERILATOR_ROOT.
|
||||
|
||||
# See uninstall also - don't put wildcards in this variable, it might uninstall other stuff
|
||||
VL_INST_MAN_FILES = verilator.1 verilator_coverage.1 verilator_profcfunc.1
|
||||
VL_INST_MAN_FILES = verilator.1 verilator_coverage.1 verilator_gantt.1 verilator_profcfunc.1
|
||||
|
||||
VL_INST_INC_BLDDIR_FILES = \
|
||||
include/verilated_config.h \
|
||||
@ -295,6 +297,7 @@ installbin:
|
||||
$(SHELL) ${srcdir}/mkinstalldirs $(DESTDIR)$(bindir)
|
||||
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator $(DESTDIR)$(bindir)/verilator )
|
||||
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_coverage $(DESTDIR)$(bindir)/verilator_coverage )
|
||||
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_gantt $(DESTDIR)$(bindir)/verilator_gantt )
|
||||
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_profcfunc $(DESTDIR)$(bindir)/verilator_profcfunc )
|
||||
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_bin $(DESTDIR)$(bindir)/verilator_bin )
|
||||
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_bin_dbg $(DESTDIR)$(bindir)/verilator_bin_dbg )
|
||||
|
187
bin/verilator
@ -338,6 +338,7 @@ detailed descriptions in L</"VERILATION ARGUMENTS"> for more information.
|
||||
--pipe-filter <command> Filter all input through a script
|
||||
--prefix <topname> Name of top level class
|
||||
--prof-cfuncs Name functions for profiling
|
||||
--prof-threads Enable generating gantt chart data for threads
|
||||
--private Debugging; see docs
|
||||
--public Debugging; see docs
|
||||
-pvalue+<name>=<value> Overwrite toplevel parameter
|
||||
@ -350,6 +351,9 @@ detailed descriptions in L</"VERILATION ARGUMENTS"> for more information.
|
||||
--stats-vars Provide statistics on variables
|
||||
-sv Enable SystemVerilog parsing
|
||||
+systemverilogext+<ext> Synonym for +1800-2017ext+<ext>
|
||||
--threads <threads> Enable multithreading
|
||||
--threads-dpi <mode> Enable multithreaded DPI
|
||||
--threads-max-mtasks <mtasks> Tune maximum mtask partitioning
|
||||
--top-module <topname> Name of top level input module
|
||||
--trace Enable waveform creation
|
||||
--trace-depth <levels> Depth of tracing
|
||||
@ -386,6 +390,9 @@ detailed descriptions in L</"RUNTIME ARGUMENTS"> for more information.
|
||||
+verilator+debug Enable debugging
|
||||
+verilator+debugi+<value> Enable debugging at a level
|
||||
+verilator+help Display help
|
||||
+verilator+prof+threads+file+I<filename> Set profile filename
|
||||
+verilator+prof+threads+start+I<value> Set profile starting point
|
||||
+verilator+prof+threads+window+I<value> Set profile duration
|
||||
+verilator+rand+reset+<value> Set random reset technique
|
||||
+verilator+V Verbose version and config
|
||||
+verilator+version Show version and exit
|
||||
@ -1080,6 +1087,18 @@ Verilog module and line number the statement came from. This allows gprof
|
||||
or oprofile reports to be correlated with the original Verilog source
|
||||
statements. See also L<verilator_profcfunc>.
|
||||
|
||||
=item --prof-threads
|
||||
|
||||
Enable gantt chart data collection for threaded builds.
|
||||
|
||||
Verilator will record the start and end time of each macro-task across a
|
||||
number of calls to eval. (What is a macro-task? See the Verilator internals
|
||||
document.)
|
||||
|
||||
When profiling is enabled, the runtime will emit a blurb of profiling data
|
||||
in non-human-friendly form. The C<verilator_gantt> script will transform
|
||||
this into a nicer visual format and produce some related statistics.
|
||||
|
||||
=item --private
|
||||
|
||||
Opposite of --public. Is the default; this option exists for backwards
|
||||
@ -1134,7 +1153,10 @@ Enable including save and restore functions in the generated model.
|
||||
|
||||
The user code must create a VerilatedSerialize or VerilatedDeserialize
object, then call the << or >> operators on the generated model and any
|
||||
other data the process needs saved/restored. For example:
|
||||
other data the process needs saved/restored. These functions are not
|
||||
thread safe, and are typically called only by a main thread.
|
||||
|
||||
For example:
|
||||
|
||||
void save_model(const char* filenamep) {
|
||||
VerilatedSave os;
|
||||
@ -1173,6 +1195,42 @@ compatibility with other simulators.
|
||||
|
||||
A synonym for C<+1800-2017ext+>I<ext>.
|
||||
|
||||
=item --threads I<threads>
|
||||
|
||||
=item --no-threads
|
||||
|
||||
With --threads 0 or --no-threads, the default, the generated model is not
|
||||
thread safe. With --threads 1, the generated model is single threaded but
|
||||
may run in a multithreaded environment. With --threads N, where N >= 2, the
|
||||
model is generated to run multithreaded on up to N threads. See
|
||||
L</"MULTITHREADING">.
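
As a rough sketch (the module and file names here are placeholders, not
part of this change), a multithreaded build and run might look like:

    verilator --cc --threads 4 our.v --exe sim_main.cpp
    make -j -C obj_dir -f Vour.mk Vour
    ./obj_dir/Vour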
|
||||
|
||||
=item --threads-dpi all
|
||||
|
||||
=item --threads-dpi none
|
||||
|
||||
=item --threads-dpi pure
|
||||
|
||||
When using --dpi with --threads, control what DPI tasks are thread safe.
|
||||
|
||||
With --threads-dpi all, enable Verilator to assume all DPI imports are
|
||||
threadsafe, and to use thread-local storage for communication with DPI,
|
||||
potentially improving performance. Any DPI libraries need appropriate
|
||||
mutexes to avoid undefined behavior.
|
||||
|
||||
With --threads-dpi none, Verilator assumes DPI imports are not thread safe,
|
||||
and Verilator will serialize calls to DPI imports by default, potentially
|
||||
harming performance.
|
||||
|
||||
With --threads-dpi pure, the default, Verilator assumes DPI pure imports
|
||||
are threadsafe, but non-pure DPI imports are not.
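
As a sketch of what "appropriate mutexes" can mean when using --threads-dpi
all, a non-pure DPI import implemented in C++ might protect its own shared
state; the function and variable names below are illustrative only, not part
of this change:

    #include <mutex>

    static std::mutex s_count_mutex;   // Guards s_count (illustrative state)
    static long long s_count = 0;

    // DPI import body; safe to call from any model thread because all
    // access to the shared counter is serialized by the mutex.
    extern "C" long long dpi_count_event() {
        std::lock_guard<std::mutex> guard(s_count_mutex);
        return ++s_count;
    }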
|
||||
|
||||
=item --threads-max-mtasks I<value>
|
||||
|
||||
Rarely needed. When using --threads, specify the number of mtasks the
|
||||
model is to be partitioned into. If unspecified, Verilator approximates a
|
||||
good value.
|
||||
|
||||
=item --top-module I<topname>
|
||||
|
||||
When the input Verilog contains more than one top level module, specifies
|
||||
@ -1464,6 +1522,28 @@ Enable debugging at the provided level.
|
||||
|
||||
Display help and exit.
|
||||
|
||||
=item +verilator+prof+threads+file+I<filename>
|
||||
|
||||
When using --prof-threads, the filename to dump to. Defaults to
|
||||
"profile_threads.dat".
|
||||
|
||||
=item +verilator+prof+threads+start+I<value>
|
||||
|
||||
When using --prof-threads, Verilator will wait until $time is at this
|
||||
value, then start the profiling warmup, then capturing. Generally this
|
||||
should be set to some time that is well within the normal operation of the
|
||||
simulation, i.e. outside of reset. If 0, the dump is disabled. Defaults to
|
||||
1.
|
||||
|
||||
=item +verilator+prof+threads+window+I<value>
|
||||
|
||||
When using --prof-threads, after $time reaches
|
||||
+verilator+prof+threads+start, Verilator will warm up the profiling for
|
||||
this number of eval() calls, then will capture the profiling of this number
|
||||
of eval() calls. Defaults to 2, which makes sense for a
|
||||
single-clock-domain module where it's typical to want to capture one
|
||||
posedge eval() and one negedge eval().
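
As a sketch (the executable name is a placeholder), a profiling run might
combine these runtime arguments as follows:

    ./obj_dir/Vour +verilator+prof+threads+start+100 \
        +verilator+prof+threads+window+4 \
        +verilator+prof+threads+file+my_profile.dat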
|
||||
|
||||
=item +verilator+rand+reset+I<value>
|
||||
|
||||
When a model was Verilated using "-x-initial unique", sets the
|
||||
@ -1635,6 +1715,9 @@ compile times, and --x-assign=fast --x-initial=fast may increase the risk
|
||||
of reset bugs in trade for performance; see the above documentation for
|
||||
these flags.
|
||||
|
||||
If running a Verilated model multithreaded, use C<numactl> to ensure you
are using non-conflicting hardware resources. See L</"MULTITHREADING">.
|
||||
|
||||
Minor Verilog code changes can also give big wins. You should not have any
|
||||
UNOPTFLAT warnings from Verilator. Fixing these warnings can result in
|
||||
huge improvements; one user fixed their one UNOPTFLAT warning by making a
|
||||
@ -2176,6 +2259,89 @@ the names of the .cpp files to compile in from the make variables generated
|
||||
in obj_dir/Vour_classes.mk.
|
||||
|
||||
|
||||
=head1 MULTITHREADING
|
||||
|
||||
Verilator experimentally supports multithreading.
|
||||
|
||||
With --no-threads, the default, the model is not thread safe, and any use
|
||||
of more than one thread calling into one or even different Verilated models
|
||||
may result in unpredictable behavior. This gives the highest single thread
|
||||
performance.
|
||||
|
||||
With --threads 1, the generated model is single threaded, however the
|
||||
support libraries are multithread safe. This allows different
|
||||
instantiations of model(s) to potentially each be run under a different
|
||||
thread. All threading is the responsibility of the user's C++ testbench.
|
||||
|
||||
With --threads N, where N is at least 2, the generated model will be
|
||||
designed to run in parallel on N threads. The thread calling eval()
|
||||
provides one of those threads, and the generated model will create and
|
||||
manage the other N-1 threads. It's the client's responsibility not to
|
||||
oversubscribe the available CPU cores. Under CPU oversubscription, the
|
||||
Verilated model should not livelock nor deadlock, however, you can expect
|
||||
performance to be far worse than it would be with proper stoichiometry of
|
||||
threads and CPU cores.
|
||||
|
||||
The remainder of this section describes behavior with --threads 1 or
--threads N (not --no-threads).
|
||||
|
||||
VL_THREADED is defined when compiling a threaded Verilated module, causing
the Verilated support classes to become thread safe.
|
||||
|
||||
The thread used for constructing a model must be the same thread that
calls eval() into the model; this is called the "eval thread". Certain
global operations, such as saving and tracing, must be performed by a
"main thread". In most cases the eval thread and main thread are the same
thread (i.e. the user's top C++ testbench runs on a single thread), but
this is not required.
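
As a minimal sketch (the model class "Vour" and its "clk" input are
placeholder names, not part of this change), a simple testbench keeps the
eval thread and main thread identical:

    #include "Vour.h"
    #include "verilated.h"

    int main(int argc, char** argv) {
        Verilated::commandArgs(argc, argv);
        Vour* top = new Vour;              // Constructed on the eval thread
        while (!Verilated::gotFinish()) {
            top->clk = !top->clk;          // Placeholder clock toggle
            top->eval();                   // Called from the same thread
        }
        delete top;
        return 0;
    }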
|
||||
|
||||
When running a multithreaded model, the default Linux task scheduler often
|
||||
works against the model, by assuming threads are short lived, and thus
|
||||
often schedules threads using multiple hyperthreads within the same
|
||||
physical core. For best performance use the C<numactl> program to (when the
|
||||
threading count fits) select unique physical cores on the same socket. For
|
||||
example, if a model was Verilated with "--threads 4", we consult
|
||||
|
||||
egrep 'processor|physical id|core id' /proc/cpuinfo
|
||||
|
||||
to select cores 0, 1, 2, and 3 that are all located on the same socket (0)
but different physical cores. (Also useful is "numactl --hardware", or
C<lscpu>, but those don't show Hyperthreading cores.) Then we execute
|
||||
|
||||
numactl -m 0 -C 0,1,2,3 -- verilated_executable_name
|
||||
|
||||
This will limit memory to socket 0, and threads to cores 0, 1, 2, 3
(presumably on socket 0), optimizing performance. Of course this must be
|
||||
adjusted if you want another simulator using e.g. socket 1, or if you
|
||||
Verilated with a different number of threads. To see what CPUs are
|
||||
actually used, use --prof-threads.
|
||||
|
||||
=head2 Multithreaded Verilog and Library Support
|
||||
|
||||
$display/$stop/$finish are delayed until the end of an eval() call in order
|
||||
to maintain ordering between threads. This may result in additional tasks
|
||||
completing after the $stop or $finish.
|
||||
|
||||
If using --coverage, the coverage routines are fully thread safe.
|
||||
|
||||
If using --dpi, Verilator assumes pure DPI imports are thread safe,
balancing performance versus safety. See --threads-dpi.
|
||||
|
||||
If using --savable, the save/restore classes are not multithreaded and
must be called only by the eval thread.
|
||||
|
||||
If using --sc, the SystemC kernel is not thread safe; therefore the eval
thread and main thread must be the same.
|
||||
|
||||
If using --trace, the tracing classes must be constructed and called from
|
||||
the main thread.
|
||||
|
||||
If using --vpi, since SystemVerilog VPI was not architected by IEEE to be
|
||||
multithreaded, Verilator requires that all VPI calls be made only from the
main thread.
|
||||
|
||||
=back
|
||||
|
||||
=head1 CONFIGURATION FILES
|
||||
|
||||
In addition to the command line, warnings and other features may be
|
||||
@ -3636,6 +3802,21 @@ section for more details.
|
||||
Ignoring this warning will only slow simulations; it will simulate
correctly.
|
||||
|
||||
=item UNOPTTHREADS
|
||||
|
||||
Warns that the thread scheduler was unable to partition the design to fill
|
||||
the requested number of threads.
|
||||
|
||||
One workaround is to request fewer threads with C<--threads>.
|
||||
|
||||
Another possible workaround is to allow more MTasks in the runtime, by
|
||||
increasing the value of --threads-max-mtasks. More MTasks will result in
|
||||
more communication and synchronization overhead at runtime; the scheduler
|
||||
attempts to minimize the number of MTasks for this reason.
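
For example (a sketch only; the top file name and the mtask limit are
placeholders), either of the following might quiet the warning, with
different performance trade-offs:

    verilator --cc --threads 2 our.v
    verilator --cc --threads 4 --threads-max-mtasks 100 our.v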
|
||||
|
||||
Ignoring this warning will only slow simulations; it will simulate
correctly.
|
||||
|
||||
=item UNPACKED
|
||||
|
||||
Warns that unpacked structs and unions are not supported.
|
||||
@ -4185,6 +4366,8 @@ performance gain.
|
||||
|
||||
In 2009, major SystemVerilog and DPI language support was added.
|
||||
|
||||
In 2018, Verilator 4.000 was released with multithreaded support.
|
||||
|
||||
Currently, various language features and performance enhancements are added
|
||||
as the need arises. Verilator is now about 3x faster than in 2002, and is
|
||||
faster than many popular commercial simulators.
|
||||
@ -4282,7 +4465,7 @@ License Version 2.0.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<verilator_coverage>, L<verilator_profcfunc>, L<make>,
|
||||
L<verilator_coverage>, L<verilator_gantt>, L<verilator_profcfunc>, L<make>,
|
||||
|
||||
L<verilator --help> which is the source for this document,
|
||||
|
||||
|
559
bin/verilator_gantt
Executable file
@ -0,0 +1,559 @@
|
||||
: # -*-Mode: perl;-*- use perl, wherever it is
|
||||
eval 'exec perl -wS $0 ${1+"$@"}'
|
||||
if 0;
|
||||
# See copyright, etc in below POD section.
|
||||
######################################################################
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use Getopt::Long;
use IO::File;  # Needed for IO::File->new below
use Pod::Usage;
|
||||
use vars qw ($Debug);
|
||||
|
||||
$Debug = 0;
|
||||
my $Opt_File;
|
||||
my $Opt_Time_Per_Char = 0; # rdtsc ticks per char in gantt chart, 0=auto
|
||||
my $opt_vcd = "profile_threads.vcd";
|
||||
|
||||
our %Threads;
|
||||
our %Mtasks;
|
||||
our %Global;
|
||||
|
||||
autoflush STDOUT 1;
|
||||
autoflush STDERR 1;
|
||||
Getopt::Long::config ("no_auto_abbrev");
|
||||
if (! GetOptions (
"help" => \&usage,
"scale=i" => \$Opt_Time_Per_Char,
"debug" => sub { $Debug = 1; },
"vcd=s" => \$opt_vcd,
"no-vcd!" => sub { $opt_vcd = undef; },
"<>" => \&parameter,
|
||||
)) {
|
||||
die "%Error: Bad usage, try 'verilator_gantt --help'\n";
|
||||
}
|
||||
|
||||
$Opt_File = "profile_threads.dat" if !defined $Opt_File;
|
||||
|
||||
process($Opt_File);
|
||||
write_vcd($opt_vcd) if defined $opt_vcd;
|
||||
exit(0);
|
||||
|
||||
#######################################################################
|
||||
|
||||
sub usage {
|
||||
pod2usage(-verbose=>2, -exitval=>2, -output=>\*STDOUT);
|
||||
exit (1);
|
||||
}
|
||||
|
||||
sub parameter {
|
||||
my $param = shift;
|
||||
if (!defined $Opt_File) {
|
||||
$Opt_File = $param;
|
||||
} else {
|
||||
die "%Error: Unknown parameter: $param\n";
|
||||
}
|
||||
}
|
||||
|
||||
#######################################################################
|
||||
|
||||
sub process {
|
||||
my $filename = shift;
|
||||
|
||||
read_data($filename);
|
||||
report();
|
||||
}
|
||||
|
||||
#######################################################################
|
||||
|
||||
sub read_data {
|
||||
my $filename = shift;
|
||||
|
||||
%Global = (rdtsc_cycle_time => 0);
|
||||
|
||||
my $fh = IO::File->new ($filename) or die "%Error: $! $filename,";
|
||||
while (my $line = $fh->getline) {
|
||||
if ($line =~ m/VLPROF mtask\s(\d+)\sstart\s(\d+)\send\s(\d+)\selapsed\s(\d+)\spredict_time\s(\d+)\scpu\s(\d+)\son thread (\d+)/) {
|
||||
my $mtask = $1;
|
||||
my $start = $2;
|
||||
my $end = $3;
|
||||
my $elapsed_time = $4;
|
||||
my $predict_time = $5;
|
||||
my $cpu = $6;
|
||||
my $thread = $7;
|
||||
$Threads{$thread}{$start}{mtask} = $mtask;
|
||||
$Threads{$thread}{$start}{end} = $end;
|
||||
$Threads{$thread}{$start}{cpu} = $cpu;
|
||||
|
||||
if (!exists $Mtasks{$mtask}{elapsed}) {
|
||||
$Mtasks{$mtask}{elapsed} = 0;
|
||||
}
|
||||
$Mtasks{$mtask}{elapsed} += $elapsed_time;
|
||||
$Mtasks{$mtask}{predict} = $predict_time;
|
||||
$Mtasks{$mtask}{end} = max($Mtasks{$mtask}{end}, $end);
|
||||
}
|
||||
elsif ($line =~ /^VLPROFTHREAD/) {}
|
||||
elsif ($line =~ m/VLPROF arg\s+(\S+)\+([0-9.])\s*$/
|
||||
|| $line =~ m/VLPROF arg\s+(\S+)\s+([0-9.])\s*$/) {
|
||||
$Global{args}{$1} = $2;
|
||||
}
|
||||
elsif ($line =~ m/VLPROF stat\s+(\S+)\s+([0-9.]+)/) {
|
||||
$Global{stats}{$1} = $2;
|
||||
}
|
||||
elsif ($line =~ /^#/) {}
|
||||
elsif ($Debug) {
|
||||
chomp $line;
|
||||
print "Unk: $line\n";
|
||||
}
|
||||
# TODO -- this is parsing text printed by a client.
|
||||
# Really, verilator proper should generate this
|
||||
# if it's useful...
|
||||
if ($line =~ m/rdtsc time = (\d+) ticks/) {
|
||||
$Global{rdtsc_cycle_time} = $1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sub report {
|
||||
print "Verilator Gantt report\n";
|
||||
|
||||
print "\nArgument settings:\n";
|
||||
foreach my $arg (sort keys %{$Global{args}}) {
|
||||
my $plus = ($arg =~ /^\+/) ? "+" : " ";
|
||||
printf " %s%s%d\n", $arg, $plus, $Global{args}{$arg};
|
||||
}
|
||||
|
||||
my $nthreads = scalar keys %Threads;
|
||||
$Global{cpus}{cpu_time} = {};
|
||||
foreach my $thread (keys %Threads) {
|
||||
# Make potentially multiple characters per column
|
||||
foreach my $start (keys %{$Threads{$thread}}) {
|
||||
my $cpu = $Threads{$thread}{$start}{cpu};
|
||||
my $elapsed = $Threads{$thread}{$start}{end} - $start;
|
||||
$Global{cpus}{cpu_time}{$cpu} += $elapsed;
|
||||
}
|
||||
}
|
||||
|
||||
my $mt_mtask_time = 0;
|
||||
my $long_mtask_time = 0;
|
||||
my $last_end = 0;
|
||||
foreach my $mtask (keys %Mtasks) {
|
||||
$mt_mtask_time += $Mtasks{$mtask}{elapsed};
|
||||
$last_end = max($last_end, $Mtasks{$mtask}{end});
|
||||
$long_mtask_time = max($long_mtask_time, $Mtasks{$mtask}{elapsed});
|
||||
}
|
||||
$Global{last_end} = $last_end;
|
||||
|
||||
report_graph();
|
||||
|
||||
# If we know cycle time in the same (rdtsc) units,
|
||||
# this will give us an actual utilization number,
|
||||
# (how effectively we keep the cores busy.)
|
||||
#
|
||||
# It also gives us a number we can compare against
|
||||
# serial mode, to estimate the overhead of data sharing,
|
||||
# which will show up in the total elapsed time. (Overhead
|
||||
# of synchronization and scheduling should not.)
|
||||
print "\nAnalysis:\n";
|
||||
printf " Total threads = %d\n", $nthreads;
|
||||
printf " Total mtasks = %d\n", scalar (keys %Mtasks);
|
||||
printf " Total cpus used = %d\n", scalar (keys %{$Global{cpus}});
|
||||
printf " Total yields = %d\n", $Global{stats}{yields};
|
||||
printf " Total eval time = %d rdtsc ticks\n", $Global{last_end};
|
||||
printf " Longest mtask time = %d rdtsc ticks\n", $long_mtask_time;
|
||||
printf " All-thread mtask time = %d rdtsc ticks\n", $mt_mtask_time;
|
||||
my $long_efficiency = $long_mtask_time/($Global{last_end});
|
||||
printf " Longest-thread efficiency = %0.1f%%\n", $long_efficiency*100;
|
||||
my $mt_efficiency = $mt_mtask_time/($Global{last_end}*$nthreads);
|
||||
printf " All-thread efficiency = %0.1f%%\n", $mt_efficiency*100;
|
||||
printf " All-thread speedup = %0.1f\n", $mt_efficiency*$nthreads;
|
||||
if ($Global{rdtsc_cycle_time} > 0) {
|
||||
my $ut = $mt_mtask_time / $Global{rdtsc_cycle_time};
|
||||
print "tot_mtask_cpu=$mt_mtask_time cyc=$Global{rdtsc_cycle_time} ut=$ut\n";
|
||||
}
|
||||
|
||||
my @p2e_ratios;
|
||||
my $min_p2e = 1000000;
|
||||
my $min_mtask;
|
||||
my $max_p2e = -1000000;
|
||||
my $max_mtask;
|
||||
foreach my $mtask (sort keys %Mtasks) {
|
||||
if ($Mtasks{$mtask}{elapsed} > 0) {
|
||||
if ($Mtasks{$mtask}{predict} == 0) {
|
||||
$Mtasks{$mtask}{predict} = 1; # don't log(0) below
|
||||
}
|
||||
my $p2e_ratio = log( $Mtasks{$mtask}{predict} / $Mtasks{$mtask}{elapsed} );
|
||||
#print "log(p2e $mtask) = $p2e_ratio (predict $Mtasks{$mtask}{predict}, elapsed $Mtasks{$mtask}{elapsed})\n";
|
||||
push @p2e_ratios, $p2e_ratio;
|
||||
|
||||
if ($p2e_ratio > $max_p2e) {
|
||||
$max_p2e = $p2e_ratio;
|
||||
$max_mtask = $mtask;
|
||||
}
|
||||
if ($p2e_ratio < $min_p2e) {
|
||||
$min_p2e = $p2e_ratio;
|
||||
$min_mtask = $mtask;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
print "\nStatistics:\n";
|
||||
print " min log(p2e) = $min_p2e from mtask $min_mtask (predict $Mtasks{$min_mtask}{predict}, elapsed $Mtasks{$min_mtask}{elapsed})\n";
|
||||
print " max log(p2e) = $max_p2e from mtask $max_mtask (predict $Mtasks{$max_mtask}{predict}, elapsed $Mtasks{$max_mtask}{elapsed})\n";
|
||||
|
||||
my $stddev = stddev(\@p2e_ratios);
|
||||
my $mean = mean(\@p2e_ratios);
|
||||
print " mean = " . ($mean) . "\n";
|
||||
print " stddev = " . ($stddev) . "\n";
|
||||
print " e ^ stddev = " . exp($stddev). "\n";
|
||||
print "\n";
|
||||
}
|
||||
|
||||
sub report_graph {
|
||||
my $time_per = $Opt_Time_Per_Char;
|
||||
if ($time_per == 0) {
|
||||
$time_per = ($Global{last_end} / 40); # Start with 40 columns
|
||||
while ($time_per > 10) {
|
||||
my ($graph, $conflicts) = _make_graph($time_per);
|
||||
last if !$conflicts;
|
||||
$time_per = int($time_per/2);
|
||||
}
|
||||
# One more step so we can fit more labels
|
||||
$time_per = int($time_per/2);
|
||||
}
|
||||
|
||||
my ($graph, $conflicts) = _make_graph($time_per);
|
||||
|
||||
print "\nThread gantt graph:\n";
|
||||
print " Legend: One character width = $time_per rdtsc ticks\n";
|
||||
print " Legend: '&' = multiple mtasks in this period (character width)\n";
|
||||
|
||||
my $scale = " <-".$Global{last_end}." rdtsc total";
|
||||
for (my $col = length($scale); # -2 for '->' below
|
||||
$col < ($Global{last_end}/$time_per); ++$col) {
|
||||
$scale .= "-";
|
||||
}
|
||||
print " $scale->\n";
|
||||
|
||||
foreach my $thread (sort keys %{$graph}) {
|
||||
print " t: ";
|
||||
_print_graph_line($graph->{$thread}, '');
|
||||
}
|
||||
}
|
||||
|
||||
sub _make_graph {
|
||||
my $time_per = shift;
|
||||
|
||||
my $graph = {}; # {thread}{column}{char=>'x' or chars=>#}
|
||||
my $conflicts = 0;
|
||||
foreach my $thread (keys %Threads) {
|
||||
# Make potentially multiple characters per column
|
||||
foreach my $start (sort {$a <=> $b} keys %{$Threads{$thread}}) {
|
||||
my $end = $Threads{$thread}{$start}{end};
|
||||
my $mtask = $Threads{$thread}{$start}{mtask};
|
||||
my $cpu = $Threads{$thread}{$start}{cpu};
|
||||
|
||||
my $startcol = _time_col($time_per, $start);
|
||||
my $endcol = _time_col($time_per, $end);
|
||||
|
||||
my $label = "[";
|
||||
$label .= "$cpu"; # Maybe make optional in future
|
||||
my $width = $endcol - $startcol + 1;
|
||||
while (length($label) < ($width-1)) { # -1 for ']'
|
||||
$label .= "-";
|
||||
}
|
||||
$label .= "]";
|
||||
$graph->{$thread}[$startcol]{char} .= $label;
|
||||
}
|
||||
if ($Debug) {
|
||||
print "# Multicol: "; _print_graph_line($graph->{$thread}, '|');
|
||||
}
|
||||
# Expand line to one char per column
|
||||
for (my $col = 0; $col <= $#{$graph->{$thread}}; ++$col) {
|
||||
if (my $chars = $graph->{$thread}[$col]{char}) {
|
||||
my $ok = 1;
|
||||
for (my $coladd = 1; $coladd<length($chars); ++$coladd) {
|
||||
if ($graph->{$thread}[$col + $coladd]{char}) {
|
||||
$ok = 0; last;
|
||||
}
|
||||
}
|
||||
if (!$ok) {
|
||||
if ($chars =~ /\[.*\[/) { # Two begins or more
|
||||
$conflicts++;
|
||||
$graph->{$thread}[$col]{char} = "&";
|
||||
} else {
|
||||
$graph->{$thread}[$col]{char} = "[";
|
||||
}
|
||||
for (my $coladd = 1; $coladd<length($chars); ++$coladd) {
|
||||
if ($graph->{$thread}[$col + $coladd]{char}) {
|
||||
last;
|
||||
} else {
|
||||
$graph->{$thread}[$col + $coladd]{char} = 'x';
|
||||
}
|
||||
}
|
||||
} else {
|
||||
my $coladd = 0;
|
||||
foreach my $char (split //, $chars) {
|
||||
$graph->{$thread}[$col+$coladd]{char} = $char;
|
||||
++$coladd;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if ($Debug) {
|
||||
print "# Singlcol: "; _print_graph_line($graph->{$thread}, '|');
|
||||
}
|
||||
}
|
||||
print "# Conflicts $conflicts\n" if $Debug;
|
||||
return ($graph, $conflicts);
|
||||
}
|
||||
|
||||
sub _print_graph_line {
|
||||
my $graph_thread = shift;
|
||||
my $sep = shift;
|
||||
for (my $col = 0; $col <= $#{$graph_thread}; ++$col) {
|
||||
my $c = $graph_thread->[$col]{char}; $c=' ' if !defined $c;
|
||||
print $c, $sep;
|
||||
}
|
||||
print "\n";
|
||||
}
|
||||
|
||||
sub _time_col {
|
||||
my $time_per = shift;
|
||||
my $time = shift;
|
||||
return int($time/$time_per);
|
||||
}
|
||||
|
||||
#######################################################################
|
||||
|
||||
sub write_vcd {
|
||||
my $filename = shift;
|
||||
print "Writing $filename\n";
|
||||
my $fh = IO::File->new(">$filename") or die "%Error: $! $filename,";
|
||||
my $vcd = {values => {}, # {<time>}{<code>} = value
|
||||
sigs => {}, # {<module>}{<sig}} = code
|
||||
code => 0,
|
||||
};
|
||||
|
||||
my %parallelism;
|
||||
foreach my $thread (keys %Threads) {
|
||||
my $mcode = ($vcd->{sigs}{threads}{"thread${thread}_mtask"} ||= $vcd->{code}++);
|
||||
foreach my $start (sort {$a <=> $b} keys %{$Threads{$thread}}) {
|
||||
my $end = $Threads{$thread}{$start}{end};
|
||||
my $mtask = $Threads{$thread}{$start}{mtask};
|
||||
my $cpu = $Threads{$thread}{$start}{cpu};
|
||||
$vcd->{values}{$start}{$mcode} = $mtask;
|
||||
$vcd->{values}{$end}{$mcode} = undef;
|
||||
$parallelism{$start}++;
|
||||
$parallelism{$end}--;
|
||||
|
||||
my $ccode = $vcd->{sigs}{cpus}{"cpu${cpu}_thread"} ||= $vcd->{code}++;
|
||||
$vcd->{values}{$start}{$ccode} = $thread;
|
||||
$vcd->{values}{$end}{$ccode} = undef;
|
||||
|
||||
my $mcode = $vcd->{sigs}{mtasks}{"mtask${mtask}_cpu"} ||= $vcd->{code}++;
|
||||
$vcd->{values}{$start}{$mcode} = $cpu;
|
||||
$vcd->{values}{$end}{$mcode} = undef;
|
||||
}
|
||||
}
|
||||
{
|
||||
my $pcode = ($vcd->{sigs}{Stats}{"parallelism"} ||= $vcd->{code}++);
|
||||
my $value = 0;
|
||||
foreach my $time (sort {$a<=>$b} keys %parallelism) {
|
||||
$value += $parallelism{$time};
|
||||
$vcd->{values}{$time}{$pcode} = $value;
|
||||
}
|
||||
}
|
||||
|
||||
$fh->print('$version Generated by verilator_gantt $end'."\n");
|
||||
$fh->print('$timescale 1ns $end'."\n");
|
||||
$fh->print("\n");
|
||||
|
||||
my %all_codes;
|
||||
$fh->print(' $scope module gantt $end'."\n");
|
||||
foreach my $module (sort keys %{$vcd->{sigs}}) {
|
||||
$fh->printf(' $scope module %s $end'."\n", $module);
|
||||
foreach my $sig (sort keys %{$vcd->{sigs}{$module}}) {
|
||||
my $code = $vcd->{sigs}{$module}{$sig};
|
||||
$fh->printf(' $var wire 32 v%x %s [31:0] $end'."\n",
|
||||
$code, $sig);
|
||||
$all_codes{$code} = 1;
|
||||
}
|
||||
$fh->print(' $upscope $end'."\n");
|
||||
}
|
||||
$fh->print(' $upscope $end'."\n");
|
||||
$fh->print('$enddefinitions $end'."\n");
|
||||
$fh->print("\n");
|
||||
|
||||
my $first = 1;
|
||||
foreach my $time (sort {$a <=> $b} keys %{$vcd->{values}}) {
|
||||
if ($first) {
|
||||
$first = 0;
|
||||
# Start with Z for any signals without time zero data
|
||||
foreach my $code (keys %all_codes) {
|
||||
if (!defined $vcd->{values}{$time}{$code}) {
|
||||
$vcd->{values}{$time}{$code} = undef;
|
||||
}
|
||||
}
|
||||
}
|
||||
$fh->printf("#%d\n", $time);
|
||||
foreach my $code (sort keys %{$vcd->{values}{$time}}) {
|
||||
my $value = $vcd->{values}{$time}{$code};
|
||||
if (defined $value) {
|
||||
$fh->printf("b%b v%x\n", $value, $code);
|
||||
} else {
|
||||
$fh->printf("bz v%x\n", $code);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#######################################################################
|
||||
# Similar to Statistics::Basic functions, but avoid a package dependency
|
||||
|
||||
sub max {
|
||||
my $n = $_[0]; shift;
|
||||
while (defined $_[0]) {
|
||||
$n = $_[0] if !defined $n || $_[0] > $n;
|
||||
shift;
|
||||
}
|
||||
return $n;
|
||||
}
|
||||
|
||||
sub mean {
|
||||
my $arrayref = shift;
|
||||
my $n = 0;
|
||||
my $sum = 0;
|
||||
foreach my $v (@$arrayref) {
|
||||
$sum += $v;
|
||||
$n++;
|
||||
}
|
||||
return undef if !$n;
|
||||
return $sum/$n;
|
||||
}
|
||||
|
||||
sub stddev {
|
||||
my $arrayref = shift;
|
||||
my $n = 0;
|
||||
my $sum = 0;
|
||||
my $sumsq = 0;
|
||||
foreach my $v (@$arrayref) {
|
||||
$sum += $v;
|
||||
$sumsq += $v**2;
|
||||
$n++;
|
||||
}
|
||||
return undef if !$n;
|
||||
return sqrt(($sumsq/$n) - ($sum/$n)**2);
|
||||
}
|
||||
|
||||
#######################################################################
|
||||
__END__
|
||||
|
||||
=pod
|
||||
|
||||
=head1 NAME
|
||||
|
||||
verilator_gantt - Create Gantt chart of multi-threaded execution
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
Creates a visual representation to help analyze Verilator multithreaded
|
||||
simulation performance, by showing when each macro-task starts and ends,
|
||||
and showing when each thread is busy or idle.
|
||||
|
||||
The generated Gantt chart has time on the X-axis. Times shown are to the
scale printed, i.e. a certain amount of time for each character width. The
Y-axis shows threads; each thread's execution is shown on one line. That
line shows a "[" at the position in time when an mtask executes.
|
||||
|
||||
Following the "[" is the cpu number the task executed on, followed by zero
|
||||
or more "-" to make the width of the characters match the scaled execution
|
||||
time, followed by a "]". If the scale is too small, the cpu number and
|
||||
mtask number will not be printed. If the scale is very small, a "&"
|
||||
indicates multiple mtasks started at that time position.
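
For illustration only (not real data), a single thread's line might look
roughly like:

    t: [0----][0--][2------][0-]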
|
||||
|
||||
Also creates a value change dump (VCD) format dump file which may be viewed
|
||||
in a waveform viewer (e.g. C<GTKWave>). See below.
|
||||
|
||||
=head1 USAGE
|
||||
|
||||
Build with --prof-threads.
|
||||
|
||||
Run a sim with +verilator+prof+threads+window+2.
|
||||
|
||||
This will create profile_threads.dat.
|
||||
|
||||
Then run:
|
||||
|
||||
verilator_gantt profile_threads.dat
|
||||
|
||||
The report will be printed on standard output; this also generates
profile_threads.vcd.
|
||||
|
||||
View profile_threads.vcd in a waveform viewer.
|
||||
|
||||
=head1 VCD SIGNALS
|
||||
|
||||
In waveforms there are the following signals. For most signals, the "decimal"
format will remove the leading zeros and make the traces easier to read.
|
||||
|
||||
parallelism: The number of mtasks active at this time, for best performance
|
||||
this will match the thread count. You may want to use an "analog step"
|
||||
format to view this signal.
|
||||
|
||||
cpu#_thread: For the given CPU number, the thread number executing.
|
||||
|
||||
mtask#_cpu: For the given mtask id, the CPU it is executing on.
|
||||
|
||||
thread#_mtask: For the given thread number, the mtask id executing.
|
||||
|
||||
=head1 ARGUMENTS
|
||||
|
||||
=over 4
|
||||
|
||||
=item I<filename>
|
||||
|
||||
The filename to read data from, defaults to "profile_threads.dat".
|
||||
|
||||
=item --help
|
||||
|
||||
Displays this message and program version and exits.
|
||||
|
||||
=item --scale I<n>
|
||||
|
||||
On the X-axis of the generated Gantt chart, each character represents this
|
||||
many time units. (On x86, time units are rdtsc ticks.) Defaults to 0,
|
||||
which will automatically compute a reasonable scale where no two mtasks
|
||||
need to fit into the same character width's worth of scaled time.
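
For example (the value is illustrative), to force a coarser chart with each
character covering 10000 ticks:

    verilator_gantt --scale 10000 profile_threads.dat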
|
||||
|
||||
=item --no-vcd
|
||||
|
||||
=item --vcd I<filename>
|
||||
|
||||
Set the output filename for the VCD dump, or disable it. The default is
profile_threads.vcd.
|
||||
|
||||
=back
|
||||
|
||||
=head1 DISTRIBUTION
|
||||
|
||||
The latest version is available from L<http://www.veripool.org/>.
|
||||
|
||||
Copyright 2018-2018 by Wilson Snyder. Verilator is free software; you can
|
||||
redistribute it and/or modify it under the terms of either the GNU Lesser
|
||||
General Public License Version 3 or the Perl Artistic License Version 2.0.
|
||||
|
||||
=head1 AUTHORS
|
||||
|
||||
Wilson Snyder <wsnyder@wsnyder.org>
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
C<verilator>
|
||||
|
||||
=cut
|
||||
|
||||
######################################################################
|
||||
### Local Variables:
|
||||
### compile-command: "$V4/bin/verilator_gantt $V4/test_regress/obj_vltmt/t_gantt/vlt_sim.log"
|
||||
### End:
|
@ -38,6 +38,7 @@ VerilatedVoidCb Verilated::s_flushCb = NULL;
|
||||
|
||||
// Keep below together in one cache line
|
||||
Verilated::Serialized Verilated::s_s;
|
||||
Verilated::NonSerialized Verilated::s_ns;
|
||||
VL_THREAD_LOCAL Verilated::ThreadLocal Verilated::t_s;
|
||||
|
||||
Verilated::CommandArgValues Verilated::s_args;
|
||||
@ -196,6 +197,17 @@ Verilated::Serialized::Serialized() {
|
||||
s_fatalOnVpiError = true; // retains old default behaviour
|
||||
}
|
||||
|
||||
Verilated::NonSerialized::NonSerialized() {
|
||||
s_profThreadsStart = 1;
|
||||
s_profThreadsWindow = 2;
|
||||
s_profThreadsFilenamep = strdup("profile_threads.dat");
|
||||
}
|
||||
Verilated::NonSerialized::~NonSerialized() {
|
||||
if (s_profThreadsFilenamep) {
|
||||
free(const_cast<char*>(s_profThreadsFilenamep)); s_profThreadsFilenamep=NULL;
|
||||
}
|
||||
}
|
||||
|
||||
//===========================================================================
|
||||
// Random reset -- Only called at init time, so don't inline.
|
||||
|
||||
@ -1648,6 +1660,20 @@ void Verilated::fatalOnVpiError(bool flag) VL_MT_SAFE {
|
||||
VerilatedLockGuard lock(m_mutex);
|
||||
s_s.s_fatalOnVpiError = flag;
|
||||
}
|
||||
void Verilated::profThreadsStart(vluint64_t flag) VL_MT_SAFE {
|
||||
VerilatedLockGuard lock(m_mutex);
|
||||
s_ns.s_profThreadsStart = flag;
|
||||
}
|
||||
void Verilated::profThreadsWindow(vluint64_t flag) VL_MT_SAFE {
|
||||
VerilatedLockGuard lock(m_mutex);
|
||||
s_ns.s_profThreadsWindow = flag;
|
||||
}
|
||||
void Verilated::profThreadsFilenamep(const char* flagp) VL_MT_SAFE {
|
||||
VerilatedLockGuard lock(m_mutex);
|
||||
if (s_ns.s_profThreadsFilenamep) free(const_cast<char*>(s_ns.s_profThreadsFilenamep));
|
||||
s_ns.s_profThreadsFilenamep = strdup(flagp);
|
||||
}
|
||||
|
||||
|
||||
const char* Verilated::catName(const char* n1, const char* n2) VL_MT_SAFE {
|
||||
// Returns new'ed data
|
||||
@ -1800,6 +1826,15 @@ void VerilatedImp::commandArgVl(const std::string& arg) {
|
||||
VL_PRINTF_MT("For help, please see 'verilator --help'\n");
|
||||
VL_FATAL_MT("COMMAND_LINE", 0, "", "Exiting due to command line argument (not an error)");
|
||||
}
|
||||
else if (commandArgVlValue(arg, "+verilator+prof+threads+start+", value/*ref*/)) {
|
||||
Verilated::profThreadsStart(atoll(value.c_str()));
|
||||
}
|
||||
else if (commandArgVlValue(arg, "+verilator+prof+threads+window+", value/*ref*/)) {
|
||||
Verilated::profThreadsWindow(atol(value.c_str()));
|
||||
}
|
||||
else if (commandArgVlValue(arg, "+verilator+prof+threads+file+", value/*ref*/)) {
|
||||
Verilated::profThreadsFilenamep(value.c_str());
|
||||
}
|
||||
else if (commandArgVlValue(arg, "+verilator+rand+reset+", value/*ref*/)) {
|
||||
Verilated::randReset(atoi(value.c_str()));
|
||||
}
|
||||
|
@ -344,6 +344,17 @@ class Verilated {
|
||||
~Serialized() {}
|
||||
} s_s;
|
||||
|
||||
static struct NonSerialized { // Non-serialized information
|
||||
// These are reloaded from command-line settings, so do not need to persist
|
||||
// Fast path
|
||||
vluint64_t s_profThreadsStart; ///< +prof+threads starting time
|
||||
vluint32_t s_profThreadsWindow; ///< +prof+threads window size
|
||||
// Slow path
|
||||
const char* s_profThreadsFilenamep; ///< +prof+threads filename
|
||||
NonSerialized();
|
||||
~NonSerialized();
|
||||
} s_ns;
|
||||
|
||||
// no need to be save-restored (serialized); the
// assumption is that the restore is allowed to pass different arguments
|
||||
static struct CommandArgValues {
|
||||
@ -409,6 +420,14 @@ public:
|
||||
/// Enable/disable vpi fatal
|
||||
static void fatalOnVpiError(bool flag) VL_MT_SAFE;
|
||||
static bool fatalOnVpiError() VL_MT_SAFE { return s_s.s_fatalOnVpiError; }
|
||||
/// --prof-threads related settings
|
||||
static void profThreadsStart(vluint64_t flag) VL_MT_SAFE;
|
||||
static vluint64_t profThreadsStart() VL_MT_SAFE { return s_ns.s_profThreadsStart; }
|
||||
static void profThreadsWindow(vluint64_t flag) VL_MT_SAFE;
|
||||
static vluint32_t profThreadsWindow() VL_MT_SAFE { return s_ns.s_profThreadsWindow; }
|
||||
static void profThreadsFilenamep(const char* flagp) VL_MT_SAFE;
|
||||
static const char* profThreadsFilenamep() VL_MT_SAFE { return s_ns.s_profThreadsFilenamep; }
|
||||
|
||||
/// Flush callback for VCD waves
|
||||
static void flushCb(VerilatedVoidCb cb) VL_MT_SAFE;
|
||||
static void flushCall() VL_MT_SAFE;
|
||||
|
229
include/verilated_threads.cpp
Normal file
@ -0,0 +1,229 @@
|
||||
// -*- mode: C++; c-file-style: "cc-mode" -*-
|
||||
//=============================================================================
|
||||
//
|
||||
// THIS MODULE IS PUBLICLY LICENSED
|
||||
//
|
||||
// Copyright 2012-2018 by Wilson Snyder. This program is free software;
|
||||
// you can redistribute it and/or modify it under the terms of either the GNU
|
||||
// Lesser General Public License Version 3 or the Perl Artistic License Version 2.0.
|
||||
//
|
||||
// This is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
// for more details.
|
||||
//
|
||||
//=============================================================================
|
||||
///
|
||||
/// \file
|
||||
/// \brief Thread pool for verilated modules
|
||||
///
|
||||
//=============================================================================
|
||||
|
||||
#include "verilatedos.h"
|
||||
#include "verilated_threads.h"
|
||||
#include <cstdio>
|
||||
|
||||
std::atomic<vluint64_t> VlNotification::s_yields;
|
||||
|
||||
VL_THREAD_LOCAL VlThreadPool::ProfileTrace* VlThreadPool::t_profilep = NULL;
|
||||
|
||||
//=============================================================================
|
||||
// VlMTaskVertex
|
||||
|
||||
VlMTaskVertex::VlMTaskVertex(vluint32_t upstreamDepCount)
|
||||
: m_upstreamDepsDone(0),
|
||||
m_upstreamDepCount(upstreamDepCount) {
|
||||
assert(atomic_is_lock_free(&m_upstreamDepsDone));
|
||||
}
|
||||
|
||||
//=============================================================================
|
||||
// VlWorkerThread
|
||||
|
||||
VlWorkerThread::VlWorkerThread(VlThreadPool* poolp, bool profiling)
|
||||
: m_poolp(poolp)
|
||||
, m_profiling(profiling)
|
||||
, m_exiting(false)
|
||||
// Must init this last -- after setting up fields that it might read:
|
||||
, m_cthread(startWorker, this) {}
|
||||
|
||||
VlWorkerThread::~VlWorkerThread() {
|
||||
m_exiting.store(true, std::memory_order_release);
|
||||
{
|
||||
VerilatedLockGuard lk(m_mutex);
|
||||
if (sleeping()) {
|
||||
wakeUp();
|
||||
}
|
||||
}
|
||||
// The thread should exit; join it.
|
||||
m_cthread.join();
|
||||
}
|
||||
|
||||
void VlWorkerThread::workerLoop() {
|
||||
if (VL_UNLIKELY(m_profiling)) {
|
||||
m_poolp->setupProfilingClientThread();
|
||||
}
|
||||
|
||||
VlNotification alarm;
|
||||
ExecRec work;
|
||||
work.m_fnp = NULL;
|
||||
|
||||
while (1) {
|
||||
bool sleep = false;
|
||||
if (VL_UNLIKELY(!work.m_fnp)) {
|
||||
// Look for work
|
||||
VerilatedLockGuard lk(m_mutex);
|
||||
if (VL_LIKELY(!m_ready.empty())) {
|
||||
dequeWork(&work);
|
||||
} else {
|
||||
// No work available, prepare to sleep. Pass alarm/work
// into m_sleepAlarm so wakeUp will tell this function.
|
||||
//
|
||||
// Must modify m_sleepAlarm in the same critical section as
|
||||
// the check for ready work, otherwise we could race with
|
||||
// another thread enqueueing work and never be awoken.
|
||||
m_sleepAlarm.first = &alarm;
|
||||
m_sleepAlarm.second = &work;
|
||||
sleep = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Do this here, not above, to avoid a race with the destructor.
|
||||
if (VL_UNLIKELY(m_exiting.load(std::memory_order_acquire)))
|
||||
break;
|
||||
|
||||
if (VL_UNLIKELY(sleep)) {
|
||||
alarm.waitForNotification(); // ZZZzzzzz
|
||||
alarm.reset();
|
||||
}
|
||||
if (VL_LIKELY(work.m_fnp)) {
|
||||
work.m_fnp(work.m_evenCycle, work.m_sym);
|
||||
work.m_fnp = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if (VL_UNLIKELY(m_profiling)) {
|
||||
m_poolp->tearDownProfilingClientThread();
|
||||
}
|
||||
}
|
||||
|
||||
void VlWorkerThread::startWorker(VlWorkerThread* workerp) {
|
||||
workerp->workerLoop();
|
||||
}
|
||||
|
||||
//=============================================================================
|
||||
// VlThreadPool
|
||||
|
||||
VlThreadPool::VlThreadPool(int nThreads, bool profiling)
|
||||
: m_profiling(profiling) {
|
||||
// --threads N passes nThreads=N-1, as the "main" thread counts as 1
|
||||
unsigned cpus = std::thread::hardware_concurrency();
|
||||
if (cpus < nThreads+1) {
|
||||
VL_PRINTF_MT("%%Warning: System has %u CPUs but model Verilated with"
|
||||
" --threads %d; may run slow.\n", cpus, nThreads+1);
|
||||
}
|
||||
// Create'em
|
||||
for (int i=0; i<nThreads; ++i) {
|
||||
m_workers.push_back(new VlWorkerThread(this, profiling));
|
||||
}
|
||||
// Set up a profile buffer for the current thread too -- on the
|
||||
// assumption that it's the same thread that calls eval and may be
|
||||
// donated to run mtasks during the eval.
|
||||
if (VL_UNLIKELY(m_profiling)) {
|
||||
setupProfilingClientThread();
|
||||
}
|
||||
}
|
||||
|
||||
VlThreadPool::~VlThreadPool() {
|
||||
for (int i = 0; i < m_workers.size(); ++i) {
|
||||
// Each ~WorkerThread will wait for its thread to exit.
|
||||
delete m_workers[i];
|
||||
}
|
||||
if (VL_UNLIKELY(m_profiling)) {
|
||||
tearDownProfilingClientThread();
|
||||
}
|
||||
}
|
||||
|
||||
void VlThreadPool::tearDownProfilingClientThread() {
|
||||
assert(t_profilep);
|
||||
delete t_profilep;
|
||||
t_profilep = NULL;
|
||||
}
|
||||
|
||||
void VlThreadPool::setupProfilingClientThread() {
|
||||
assert(!t_profilep);
|
||||
t_profilep = new ProfileTrace;
|
||||
// Reserve some space in the thread-local profiling buffer;
|
||||
// try not to malloc while collecting profiling.
|
||||
t_profilep->reserve(4096);
|
||||
{
|
||||
VerilatedLockGuard lk(m_mutex);
|
||||
m_allProfiles.insert(t_profilep);
|
||||
}
|
||||
}
|
||||
|
||||
void VlThreadPool::profileAppendAll(const VlProfileRec& rec) {
|
||||
VerilatedLockGuard lk(m_mutex);
|
||||
for (ProfileSet::iterator it = m_allProfiles.begin();
|
||||
it != m_allProfiles.end(); ++it) {
|
||||
// Every thread's profile trace gets a copy of rec.
|
||||
(*it)->emplace_back(rec);
|
||||
}
|
||||
}
|
||||
|
||||
void VlThreadPool::profileDump(const char* filenamep, vluint64_t ticksElapsed) {
|
||||
VerilatedLockGuard lk(m_mutex);
|
||||
VL_DEBUG_IF(VL_DBG_MSGF("+prof+threads writing to '%s'\n", filenamep););
|
||||
|
||||
FILE* fp = fopen(filenamep, "w");
|
||||
if (VL_UNLIKELY(!fp)) {
|
||||
VL_FATAL_MT(filenamep, 0, "", "+prof+threads+file file not writable");
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO Perhaps merge with verilated_coverage output format, so can
|
||||
// have a common merging and reporting tool, etc.
|
||||
fprintf(fp, "VLPROFTHREAD 1.0 # Verilator thread profile dump version 1.0\n");
|
||||
fprintf(fp, "VLPROF arg --threads %" VL_PRI64 "u\n",
|
||||
vluint64_t(m_workers.size()+1));
|
||||
fprintf(fp, "VLPROF arg +verilator+prof+threads+start+%" VL_PRI64 "u\n",
|
||||
Verilated::profThreadsStart());
|
||||
fprintf(fp, "VLPROF arg +verilator+prof+threads+window+%u\n",
|
||||
Verilated::profThreadsWindow());
|
||||
fprintf(fp, "VLPROF stat yields %" VL_PRI64 "u\n",
|
||||
VlNotification::yields());
|
||||
|
||||
vluint32_t thread_id = 0;
|
||||
for (ProfileSet::iterator pit = m_allProfiles.begin();
|
||||
pit != m_allProfiles.end(); ++pit) {
|
||||
++thread_id;
|
||||
|
||||
bool printing = false; // False while in warmup phase
|
||||
for (ProfileTrace::iterator eit = (*pit)->begin();
|
||||
eit != (*pit)->end(); ++eit) {
|
||||
switch (eit->m_type) {
|
||||
case VlProfileRec::TYPE_BARRIER:
|
||||
printing = true;
|
||||
break;
|
||||
case VlProfileRec::TYPE_MTASK_RUN:
|
||||
if (!printing) break;
|
||||
fprintf(fp, "VLPROF mtask %d"
|
||||
" start %" VL_PRI64"u end %" VL_PRI64"u elapsed %" VL_PRI64 "u"
|
||||
" predict_time %u cpu %u on thread %u\n",
|
||||
eit->m_mtaskId,
|
||||
eit->m_startTime,
|
||||
eit->m_endTime,
|
||||
(eit->m_endTime - eit->m_startTime),
|
||||
eit->m_predictTime,
|
||||
eit->m_cpu,
|
||||
thread_id);
|
||||
break;
|
||||
default: assert(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
fprintf(fp, "VLPROF stat ticks %" VL_PRI64 "u\n",
|
||||
ticksElapsed);
|
||||
|
||||
fclose(fp);
|
||||
}
|
313
include/verilated_threads.h
Normal file
@ -0,0 +1,313 @@
|
||||
// -*- mode: C++; c-file-style: "cc-mode" -*-
|
||||
//=============================================================================
|
||||
//
|
||||
// THIS MODULE IS PUBLICLY LICENSED
|
||||
//
|
||||
// Copyright 2012-2018 by Wilson Snyder. This program is free software;
|
||||
// you can redistribute it and/or modify it under the terms of either the GNU
|
||||
// Lesser General Public License Version 3 or the Perl Artistic License Version 2.0.
|
||||
//
|
||||
// This is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
// for more details.
|
||||
//
|
||||
//=============================================================================
|
||||
///
|
||||
/// \file
|
||||
/// \brief Thread pool and profiling for Verilated modules
|
||||
///
|
||||
//=============================================================================
|
||||
|
||||
#ifndef _VERILATED_THREADS_H_
|
||||
#define _VERILATED_THREADS_H_
|
||||
|
||||
#include "verilatedos.h"
|
||||
|
||||
#include <atomic>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <sched.h> // For sched_getcpu()
|
||||
|
||||
#include "verilated.h" // for VerilatedMutex and clang annotations
|
||||
|
||||
// VlMTaskVertex and VlThreadpool will work with multiple symbol table types.
|
||||
// Since the type is opaque to VlMTaskVertex and VlThreadPool, represent it
|
||||
// as a void* here.
|
||||
typedef void* VlThrSymTab;
|
||||
|
||||
class VlNotification {
|
||||
// MEMBERS
|
||||
std::atomic<bool> m_notified; // Notification pending
|
||||
static std::atomic<vluint64_t> s_yields; // Statistics
|
||||
|
||||
public:
|
||||
// CONSTRUCTORS
|
||||
VlNotification()
|
||||
: m_notified(false) {
|
||||
assert(atomic_is_lock_free(&m_notified));
|
||||
}
|
||||
~VlNotification() {}
|
||||
|
||||
// METHODS
|
||||
static vluint64_t yields() { return s_yields; }
|
||||
|
||||
// Block until notify() has occurred, then return.
|
||||
// If notify() has already occurred, return immediately.
|
||||
//
|
||||
// This is logically const: the object will remain in notified state
|
||||
// after WaitForNotification() returns, so you could notify more than
|
||||
// one thread of the same event.
|
||||
inline void waitForNotification() {
|
||||
unsigned ct = 0;
|
||||
while (VL_UNLIKELY(!notified())) {
|
||||
VL_CPU_RELAX();
|
||||
ct++;
|
||||
if (VL_UNLIKELY(ct > VL_LOCK_SPINS)) {
|
||||
ct = 0;
|
||||
++s_yields; // Statistics
|
||||
std::this_thread::yield();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The 'inline' keyword here means nothing to the compiler, it's
|
||||
// implicit on methods defined within the class body anyway.
|
||||
//
|
||||
// 'inline' is attached to this method, and others in this file,
|
||||
// to remind humans that some routines in this file are called many
|
||||
// times per cycle in threaded mode. Such routines should be
|
||||
// inlinable; that's why they're declared in the .h and not the .cpp.
|
||||
inline bool notified() {
|
||||
return m_notified.load(std::memory_order_acquire);
|
||||
}
|
||||
// Set notified state. If state is already notified,
|
||||
// it remains so.
|
||||
inline void notify() {
|
||||
m_notified.store(true, std::memory_order_release);
|
||||
}
|
||||
// Reset the state to un-notified state, which is also the
|
||||
// state of a new Notification object.
|
||||
inline void reset() {
|
||||
m_notified.store(false, std::memory_order_relaxed);
|
||||
}
|
||||
};
|
||||
|
||||
typedef void (*VlExecFnp)(bool, VlThrSymTab);
|
||||
|
||||
/// Track dependencies for a single MTask.
|
||||
class VlMTaskVertex {
|
||||
// MEMBERS
|
||||
|
||||
// On even cycles, _upstreamDepsDone increases as upstream
|
||||
// dependencies complete. When it reaches _upstreamDepCount,
|
||||
// this MTaskVertex is ready.
|
||||
//
|
||||
// On odd cycles, _upstreamDepsDone decreases as upstream
|
||||
// dependencies complete, and when it reaches zero this MTaskVertex
|
||||
// is ready.
|
||||
//
|
||||
// An atomic is smaller than a mutex, and lock-free.
|
||||
//
|
||||
// (Why does the size of this class matter? If an mtask has many
|
||||
// downstream mtasks to notify, we hope these will pack into a
|
||||
// small number of cache lines to reduce the cost of pointer chasing
|
||||
// during done-notification. Nobody's quantified that cost though.
|
||||
// If we were really serious about shrinking this class, we could
|
||||
// use 16-bit types here...)
|
||||
std::atomic<vluint32_t> m_upstreamDepsDone;
|
||||
const vluint32_t m_upstreamDepCount;
|
||||
|
||||
public:
|
||||
// CONSTRUCTORS
|
||||
|
||||
// 'upstreamDepCount' is the number of upstream MTaskVertex's
|
||||
// that must notify this MTaskVertex before it will become ready
|
||||
// to run.
|
||||
explicit VlMTaskVertex(vluint32_t upstreamDepCount);
|
||||
~VlMTaskVertex() {}
|
||||
|
||||
// Upstream mtasks must call this when they complete.
|
||||
// Returns true when the current MTaskVertex becomes ready to execute,
|
||||
// false while it's still waiting on more dependencies.
|
||||
inline bool signalUpstreamDone(bool evenCycle) {
|
||||
if (evenCycle) {
|
||||
vluint32_t upstreamDepsDone
|
||||
= 1 + m_upstreamDepsDone.fetch_add(1, std::memory_order_release);
|
||||
assert(upstreamDepsDone <= m_upstreamDepCount);
|
||||
return (upstreamDepsDone == m_upstreamDepCount);
|
||||
} else {
|
||||
vluint32_t upstreamDepsDone_prev
|
||||
= m_upstreamDepsDone.fetch_sub(1, std::memory_order_release);
|
||||
assert(upstreamDepsDone_prev > 0);
|
||||
return (upstreamDepsDone_prev == 1);
|
||||
}
|
||||
}
|
||||
inline bool areUpstreamDepsDone(bool evenCycle) const {
|
||||
vluint32_t target = evenCycle ? m_upstreamDepCount : 0;
|
||||
return m_upstreamDepsDone.load(std::memory_order_acquire) == target;
|
||||
}
|
||||
inline void waitUntilUpstreamDone(bool evenCycle) const {
|
||||
while (VL_UNLIKELY(!areUpstreamDepsDone(evenCycle))) {
|
||||
VL_CPU_RELAX();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Profiling support
|
||||
class VlProfileRec {
|
||||
protected:
|
||||
friend class VlThreadPool;
|
||||
enum VlProfileE {
|
||||
TYPE_MTASK_RUN,
|
||||
TYPE_BARRIER
|
||||
};
|
||||
VlProfileE m_type; // Record type
|
||||
vluint32_t m_mtaskId; // Mtask we're logging
|
||||
vluint32_t m_predictTime; // How long scheduler predicted would take
|
||||
vluint64_t m_startTime; // Tick at start of execution
|
||||
vluint64_t m_endTime; // Tick at end of execution
|
||||
unsigned m_cpu; // Execution CPU number (at start anyways)
|
||||
public:
|
||||
class Barrier {};
|
||||
VlProfileRec() {}
|
||||
explicit VlProfileRec(Barrier) {
|
||||
m_type = TYPE_BARRIER;
|
||||
m_mtaskId = 0;
|
||||
m_predictTime = 0;
|
||||
m_startTime = 0;
|
||||
m_cpu = sched_getcpu();
|
||||
}
|
||||
void startRecord(vluint64_t time, uint32_t mtask, uint32_t predict) {
|
||||
m_type = VlProfileRec::TYPE_MTASK_RUN;
|
||||
m_mtaskId = mtask;
|
||||
m_predictTime = predict;
|
||||
m_startTime = time;
|
||||
m_cpu = sched_getcpu();
|
||||
}
|
||||
void endRecord(vluint64_t time) {
|
||||
m_endTime = time;
|
||||
}
|
||||
};
|
||||
|
||||
class VlThreadPool;
|
||||
|
||||
class VlWorkerThread {
|
||||
private:
|
||||
// TYPES
|
||||
struct ExecRec {
|
||||
VlExecFnp m_fnp; // Function to execute
|
||||
VlThrSymTab m_sym; // Symbol table to execute
|
||||
bool m_evenCycle; // Even/odd for flag alternation
|
||||
ExecRec() : m_fnp(NULL), m_sym(NULL), m_evenCycle(false) {}
|
||||
ExecRec(VlExecFnp fnp, bool evenCycle, VlThrSymTab sym)
|
||||
: m_fnp(fnp), m_sym(sym), m_evenCycle(evenCycle) {}
|
||||
};
|
||||
|
||||
// MEMBERS
|
||||
VerilatedMutex m_mutex;
|
||||
|
||||
// Why a vector? We expect the pending list to be very short, typically
|
||||
// 0 or 1 or 2, so popping from the front shouldn't be
|
||||
// expensive. Revisit if we ever have longer queues...
|
||||
std::vector<ExecRec> m_ready VL_GUARDED_BY(m_mutex);
|
||||
|
||||
VlThreadPool* m_poolp; // Our associated thread pool
|
||||
|
||||
// If values stored are non-NULL, the thread is asleep pending new
|
||||
// work. If the thread is not asleep, both parts of m_sleepAlarm must
|
||||
// be NULL.
|
||||
std::pair<VlNotification*, ExecRec*> m_sleepAlarm VL_GUARDED_BY(m_mutex);
|
||||
|
||||
bool m_profiling; // Is profiling enabled?
|
||||
std::atomic<bool> m_exiting; // Worker thread should exit
|
||||
std::thread m_cthread; // Underlying C++ thread record
|
||||
|
||||
VL_UNCOPYABLE(VlWorkerThread);
|
||||
|
||||
public:
|
||||
// CONSTRUCTORS
|
||||
explicit VlWorkerThread(VlThreadPool* poolp, bool profiling);
|
||||
~VlWorkerThread();
|
||||
|
||||
// METHODS
|
||||
inline void dequeWork(ExecRec* workp) VL_REQUIRES(m_mutex) {
|
||||
// As noted above this is inefficient if our ready list is ever
|
||||
// long (but it shouldn't be)
|
||||
*workp = m_ready.front();
|
||||
m_ready.erase(m_ready.begin());
|
||||
}
|
||||
inline void wakeUp() VL_REQUIRES(m_mutex) {
|
||||
VlNotification* notifyp = m_sleepAlarm.first;
|
||||
m_sleepAlarm.first = NULL; // NULL+NULL means wake
|
||||
m_sleepAlarm.second = NULL;
|
||||
notifyp->notify();
|
||||
}
|
||||
inline bool sleeping() VL_REQUIRES(m_mutex) {
|
||||
return (m_sleepAlarm.first != NULL);
|
||||
}
|
||||
inline void addTask(VlExecFnp fnp, bool evenCycle, VlThrSymTab sym) {
|
||||
VerilatedLockGuard lk(m_mutex);
|
||||
m_ready.emplace_back(fnp, evenCycle, sym);
|
||||
if (VL_LIKELY(sleeping())) { // Generally queue is waiting for work
|
||||
// Awaken thread
|
||||
dequeWork(m_sleepAlarm.second);
|
||||
wakeUp();
|
||||
}
|
||||
}
|
||||
void workerLoop();
|
||||
static void startWorker(VlWorkerThread* workerp);
|
||||
};
|
||||
|
||||
class VlThreadPool {
|
||||
// TYPES
|
||||
typedef std::vector<VlProfileRec> ProfileTrace;
|
||||
typedef std::set<ProfileTrace*> ProfileSet;
|
||||
|
||||
// MEMBERS
|
||||
std::vector<VlWorkerThread*> m_workers; // our workers
|
||||
bool m_profiling; // is profiling enabled?
|
||||
|
||||
// Support profiling -- we can append records of profiling events
|
||||
// to this vector with very low overhead, and then dump them out
|
||||
// later. This prevents the overhead of printf/malloc/IO from
|
||||
// corrupting the profiling data. It's super cheap to append
|
||||
// a VlProfileRec struct on the end of a pre-allocated vector;
|
||||
// this is the only cost we pay in real-time during a profiling cycle.
|
||||
static VL_THREAD_LOCAL ProfileTrace* t_profilep;
|
||||
ProfileSet m_allProfiles VL_GUARDED_BY(m_mutex);
|
||||
VerilatedMutex m_mutex;
|
||||
|
||||
public:
|
||||
// CONSTRUCTORS
|
||||
// Construct a thread pool with 'nThreads' dedicated threads. The thread
|
||||
// pool will create these threads and make them available to execute tasks
|
||||
// via this->workerp(index)->addTask(...)
|
||||
VlThreadPool(int nThreads, bool profiling);
|
||||
~VlThreadPool();
|
||||
|
||||
// METHODS
|
||||
inline int numThreads() const {
|
||||
return m_workers.size();
|
||||
}
|
||||
inline VlWorkerThread* workerp(int index) {
|
||||
assert(index >= 0);
|
||||
assert(index < m_workers.size());
|
||||
return m_workers[index];
|
||||
}
|
||||
inline VlProfileRec* profileAppend() {
|
||||
t_profilep->emplace_back();
|
||||
return &(t_profilep->back());
|
||||
}
|
||||
void profileAppendAll(const VlProfileRec& rec);
|
||||
void profileDump(const char* filenamep, vluint64_t ticksElapsed);
|
||||
// In profiling mode, each executing thread must call
|
||||
// this once to setup profiling state:
|
||||
void setupProfilingClientThread();
|
||||
void tearDownProfilingClientThread();
|
||||
private:
|
||||
VL_UNCOPYABLE(VlThreadPool);
|
||||
};
|
||||
|
||||
#endif
|
215
internals.pod
215
internals.pod
@ -155,6 +155,221 @@ provided and documented in C<V3GraphAlg.cpp>.

=back

=head2 Multithreaded Mode

In --threads mode, the frontend of the Verilator pipeline is the same as
serial mode, up until V3Order.

V3Order builds a fine-grained, statement-level dependency graph that
governs the ordering of code within a single eval() call. In serial mode,
that dependency graph is used to order all statements into a total serial
order. In parallel mode, the same dependency graph is the starting point
for a partitioner (V3Partition).

The partitioner's goal is to coarsen the fine-grained DAG into a coarser
DAG, while maintaining as much available parallelism as possible. Often
the partitioner can transform an input graph with millions of nodes into a
coarsened execution graph with a few dozen nodes, while maintaining enough
parallelism to take advantage of a modern multicore CPU. Runtime
synchronization cost is not prohibitive with so few nodes.

=head3 Partitioning

Our partitioner is similar to the one Vivek Sarkar described in his 1989
paper "Partitioning and Scheduling Parallel Programs for Multiprocessors".

Let's define some terms:

=over 4

=item C<Par Factor>

The available parallelism or "par-factor" of a DAG is the total cost to
execute all nodes, divided by the cost to execute the longest critical
path through the graph. This is the speedup you would get from running the
graph in parallel, assuming infinite CPU cores are available and
communication and synchronization costs are zero. (A worked example
follows this list.)

=item C<Macro Task>

When the partitioner coarsens the graph, it combines nodes together. Each
fine-grained node represents an atomic "task"; combined nodes in the
coarsened graph are "macro-tasks". This term comes from Sarkar. Each
macro-task executes from start to end on one processor, without any
synchronization to any other macro-task during its execution.
(Synchronization only happens before the macro-task begins or after it
ends.)

=item C<Edge Contraction>

Our partitioner, like Sarkar's, primarily relies on "edge contraction" to
coarsen the graph. It starts with one macro-task per atomic task and
iteratively combines pairs of edge-connected macro-tasks.

=item C<Local Critical Path>

Each node in the graph has a "local" critical path. That's the critical
path from the start of the graph to the start of the node, plus the
node's cost, plus the critical path from the end of the node to the end
of the graph.

=back
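
As a hypothetical worked example of par-factor, suppose a DAG's nodes have
a combined execution cost of 10,000 time units, and its longest critical
path costs 2,500 units:

    total cost of all nodes        = 10,000
    cost of longest critical path  =  2,500
    par-factor                     = 10,000 / 2,500 = 4.0

Even with infinite cores and free synchronization, this graph cannot run
more than 4x faster than serial.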

Sarkar calls out an important trade-off: coarsening the graph reduces
runtime synchronization overhead among the macro-tasks, but it tends to
increase the critical path through the graph and thus reduces par-factor.

Sarkar's partitioner, and ours, chooses pairs of macro-tasks to merge such
that the growth in critical path is minimized. Each candidate merge would
result in a new node, which would have some local critical path. We choose
the candidate that would produce the shortest local critical path. Repeat
until par-factor falls to a target threshold. It's a greedy algorithm, and
it's not guaranteed to produce the best partition (which Sarkar proves is
NP-hard).
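
The heart of that loop, in simplified form, looks like the sketch below.
This is an illustration only, not the actual V3Partition code; the types
and helpers (C<MTaskGraph>, C<mergeableEdges>,
C<localCriticalPathAfterMerge>, C<contractEdge>, C<parFactor>) are
invented stand-ins for the partitioner's real bookkeeping.

    #include <cstddef>
    #include <limits>
    #include <vector>

    struct MTaskGraph;  // stand-in for the partitioner's mtask graph
    struct Edge;        // connects two macro-tasks that could be merged

    typedef unsigned long long Cost;
    std::vector<Edge*> mergeableEdges(const MTaskGraph&);
    Cost localCriticalPathAfterMerge(const MTaskGraph&, const Edge*);
    void contractEdge(MTaskGraph&, Edge*);
    double parFactor(const MTaskGraph&);

    void coarsen(MTaskGraph& graph, double targetParFactor) {
        // Keep merging while the graph has more parallelism than we need;
        // each merge reduces synchronization but may lengthen the critical
        // path.
        while (parFactor(graph) > targetParFactor) {
            Edge* bestp = NULL;
            Cost bestCp = std::numeric_limits<Cost>::max();
            std::vector<Edge*> edges = mergeableEdges(graph);
            for (std::vector<Edge*>::const_iterator it = edges.begin();
                 it != edges.end(); ++it) {
                // The candidate whose merged macro-task would have the
                // shortest local critical path wins this round.
                const Cost cp = localCriticalPathAfterMerge(graph, *it);
                if (cp < bestCp) { bestCp = cp; bestp = *it; }
            }
            if (!bestp) break;           // nothing left to merge
            contractEdge(graph, bestp);  // combine the edge-connected pair
        }
    }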

=head3 Estimating Logic Costs

To compute the cost of any given path through the graph, Verilator
estimates an execution cost for each task. Each macro-task has an
execution cost which is simply the sum of its tasks' costs. We assume that
communication overhead and synchronization overhead are zero, so the cost
of any given path through the graph is simply the sum of macro-task
execution costs. Sarkar does almost the same thing, except that he has
nonzero estimates for synchronization costs.

Verilator's cost estimates are assigned by the InstrCountCostVisitor. This
class is perhaps the most fragile piece of the multithread implementation.
It's easy to have a bug where you count something cheap (eg. accessing one
element of a huge array) as if it were expensive (eg. by counting it as if
it were an access to the entire array). Even without such gross bugs, the
estimates this produces are only loosely predictive of actual runtime
cost. Multithread performance would be better with better runtime cost
estimates. This is an area to improve.
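
To make the idea concrete, here is a toy cost estimator in the spirit of
the instruction-count approach. It is illustrative only and is not the
InstrCountCostVisitor; the real visitor walks Verilator's AST node types.
The C<ARRAYSEL> case shows the pitfall mentioned above: charging for the
whole array rather than one element would grossly inflate the estimate.

    struct Node {
        enum Kind { ADD, MUL, ARRAYSEL } kind;
        unsigned elements;   // array size, only meaningful for ARRAYSEL
        const Node* nextp;   // next node in the statement list
    };

    unsigned estimateCost(const Node* nodep) {
        unsigned cost = 0;
        for (; nodep; nodep = nodep->nextp) {
            switch (nodep->kind) {
            case Node::ADD: cost += 1; break;
            case Node::MUL: cost += 3; break;
            case Node::ARRAYSEL:
                cost += 1;   // one element access; adding nodep->elements
                             // here would be the gross overestimate to avoid
                break;
            }
        }
        return cost;
    }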

=head3 Scheduling Macro-Tasks at Runtime

After coarsening the graph, we must schedule the macro-tasks for runtime.
Sarkar describes two options: you can dynamically schedule tasks at
runtime, with a runtime graph follower. Sarkar calls this the
"macro-dataflow model." Verilator does not support this; early experiments
with this approach had poor performance.

The other option is to statically assign macro-tasks to threads, with each
thread running its macro-tasks in a static order. Sarkar describes this in
Chapter 5. Verilator takes this static approach. The only dynamic aspect
is that each macro-task may block before starting, to wait until its
prerequisites on other threads have finished.

The synchronization cost is cheap if the prereqs are done. If they're not,
fragmentation (idle CPU cores waiting) is possible. This is the major
source of overhead in this approach. The --prof-threads switch and the
C<verilator_gantt> script can visualize the time lost to such
fragmentation.
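
Concretely, the code V3EmitC emits for one thread's chain of macro-tasks
looks roughly like the sketch below. It is simplified; the mtask IDs and
the C<Vtop> class name are invented for the example, and the real output
also sets the mtask ID for debugging and flushes the message queue.

    void Vtop::__Vmtask__5(bool even_cycle, void* symtab) {
        Vtop__Syms* __restrict vlSymsp = (Vtop__Syms*)symtab;
        // (assignment of vlTOPp from vlSymsp omitted)
        // Block until mtask 5's cross-thread prerequisites are done
        vlTOPp->__Vm_mt_5.waitUntilUpstreamDone(even_cycle);
        // ... body of mtask 5: calls to its packed leaf AstCFuncs ...
        // Wake a downstream mtask that runs on another thread
        vlTOPp->__Vm_mt_9.signalUpstreamDone(even_cycle);
        // A packed successor on the same thread runs inline, no blocking
        // ... body of mtask 6 ...
        // The last mtask in the chain unblocks the synthetic "final"
        // vertex, which the thread calling eval() waits on
        vlTOPp->__Vm_mt_final.signalUpstreamDone(even_cycle);
    }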

=head3 Locating Variables for Best Spatial Locality

After scheduling all code, we attempt to locate variables in memory such
that variables accessed by a single macro-task are close together in
memory. This provides "spatial locality" -- when we pull in a 64-byte
cache line to access a 2-byte variable, we want the other 62 bytes to be
ones we'll also likely access soon, for best cache performance.

This turns out to be critical for performance. It should allow Verilator
to scale to very large models. We don't rely on our working set fitting
in any CPU cache; instead we essentially "stream" data into caches from
memory. It's not literally streaming, where the address increases
monotonically, but it should have similar performance characteristics,
so long as each macro-task's dataset fits in one core's local caches.

To achieve spatial locality, we tag each variable with the set of
macro-tasks that access it. Let's call this set the "footprint" of that
variable. The variables in a given module have a set of footprints. We can
order those footprints to minimize the distance between them (distance is
the number of macro-tasks that are different across any two footprints)
and then emit all variables into the struct in ordered-footprint order.

The footprint ordering is literally the traveling salesman problem, and we
use a TSP-approximation algorithm to get close to an optimal sort.
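
The distance between two footprints is the size of their symmetric
difference, as in this sketch, which mirrors the cost function of
EmitVarTspSorter in V3EmitC.cpp (C<footprintDistance> is a name invented
here):

    #include <set>
    typedef std::set<int> MTaskIdSet;  // mtask IDs that read/write a variable

    // Number of elements in set_a that do not appear in set_b
    static int diffs(const MTaskIdSet& set_a, const MTaskIdSet& set_b) {
        int diffs = 0;
        for (MTaskIdSet::const_iterator it = set_a.begin();
             it != set_a.end(); ++it) {
            if (set_b.find(*it) == set_b.end()) ++diffs;
        }
        return diffs;
    }

    // TSP edge cost between two footprints: the symmetric difference size
    static int footprintDistance(const MTaskIdSet& a, const MTaskIdSet& b) {
        return diffs(a, b) + diffs(b, a);
    }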

This is an old idea. Simulators designed at DEC in the early 1990s used
similar techniques to optimize both single-thread and multi-thread modes.
(Verilator does not optimize variable placement for spatial locality in
serial mode; that is a possible area for improvement.)

=head3 Improving Multithreaded Performance Further (a TODO list)

=over 4

=item C<Wave Scheduling>

To allow the verilated model to run in parallel with the testbench, it
might be nice to support "wave" scheduling, in which work on a cycle
begins before eval() is called or continues after eval() returns. For now
all work on a cycle happens during the eval() call, leaving Verilator's
threads idle while the testbench (everything outside eval()) is working.
This would involve fundamental changes within the partitioner; however,
it's probably the best bet for hiding testbench latency.

=item C<Efficient Dynamic Scheduling>

To scale to more than a few threads, we may revisit a fully dynamic
scheduler. For large (>16 core) systems it might make sense to dedicate an
entire core to scheduling, so that scheduler data structures would fit in
its L1 cache and thus the cost of traversing priority-ordered ready lists
would not be prohibitive.

=item C<Static Scheduling with Runtime Repack>

We could modify the static scheduling approach by gathering actual
macro-task execution times at run time, and dynamically re-packing the
macro-tasks into the threads, also at run time. Say, re-pack once every
10,000 cycles or so. This has the potential to do better than our static
estimates about macro-task run times. It could potentially react to CPU
cores that aren't performing equally, due to NUMA, thermal throttling,
nonuniform competing memory traffic, or the like.

=item C<Clock Domain Balancing>

Right now Verilator makes no attempt to balance clock domains across
macro-tasks. For a multi-domain model, that could lead to bad Gantt chart
fragmentation. This could be improved if it's a real problem in practice.

=item C<Other Forms of MTask Balancing>

The largest source of runtime overhead is idle CPUs, which happens due to
variance between our predicted runtime for each MTask and its actual
runtime. That variance is magnified if MTasks are homogeneous, containing
similar repeating logic which was generally close together in source code
and which is still packed together even after going through Verilator's
digestive tract.

If Verilator could avoid doing that, and instead took source logic that
was close together and distributed it across MTasks, that would increase
the diversity of any given MTask, which should reduce variance in the
cost estimates.

One way to do that might be to make various "tie breaker" comparison
routines in the sources rely more heavily on randomness, and generally try
harder not to keep input nodes together when we have the option to
scramble things.

=item C<Performance Regression>

It would be nice if we had a regression of large designs, with some
diversity of design styles, to test on both single- and multi-threaded
modes. This would help to avoid performance regressions, and also to
evaluate the optimizations while minimizing the impact of parasitic noise.

=item C<Per-Instance Classes>

If we have multiple instances of the same module, and they partition
differently (likely; we make no attempt to partition them the same), then
the variable sort will be suboptimal for either instance. A possible
improvement would be to emit a unique class for each instance of a module,
and sort its variables optimally for that instance's code stream.

=back

=head2 Verilated Flow

The evaluation loop outputted by Verilator is designed to allow a single

@ -64,6 +64,7 @@ sub test {
|
||||
run("test -e $prefix/bin/verilator");
|
||||
run("test -e $prefix/bin/verilator_bin");
|
||||
run("test -e $prefix/bin/verilator_bin_dbg");
|
||||
run("test -e $prefix/bin/verilator_gantt");
|
||||
run("test -e $prefix/bin/verilator_profcfunc");
|
||||
}
|
||||
|
||||
|
@ -217,6 +217,7 @@ RAW_OBJS = \
|
||||
V3Order.o \
|
||||
V3Os.o \
|
||||
V3Param.o \
|
||||
V3Partition.o \
|
||||
V3PreShell.o \
|
||||
V3Premit.o \
|
||||
V3Reloop.o \
|
||||
|
@ -29,16 +29,24 @@
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include <map>
|
||||
#include VL_INCLUDE_UNORDERED_SET
|
||||
|
||||
#include "V3Ast__gen_classes.h" // From ./astgen
|
||||
// Things like:
|
||||
// class V3AstNode;
|
||||
|
||||
// Forward declarations
|
||||
class V3Graph;
|
||||
class ExecMTask;
|
||||
|
||||
// Hint class so we can choose constructors
|
||||
class VFlagLogicPacked {};
|
||||
class VFlagBitPacked {};
|
||||
class VFlagChildDType {}; // Used by parser.y to select constructor that sets childDType
|
||||
|
||||
// Used as key for another map, needs operator<, hence not an unordered_set
|
||||
typedef std::set<int> MTaskIdSet; // Set of mtaskIds for Var sorting
|
||||
|
||||
//######################################################################
|
||||
|
||||
// For broken() function, return error string if have a match
|
||||
|
@ -31,6 +31,8 @@
|
||||
#include "V3Ast.h"
|
||||
#include "V3File.h"
|
||||
#include "V3Global.h"
|
||||
#include "V3Graph.h"
|
||||
#include "V3PartitionGraph.h" // Just for mtask dumping
|
||||
|
||||
//======================================================================
|
||||
// Special methods
|
||||
@ -151,22 +153,26 @@ AstNodeBiop* AstEqWild::newTyped(FileLine* fl, AstNode* lhsp, AstNode* rhsp) {
|
||||
}
|
||||
}
|
||||
|
||||
AstExecGraph::AstExecGraph(FileLine* fileline)
|
||||
: AstNode(fileline) {
|
||||
m_depGraphp = new V3Graph;
|
||||
}
|
||||
AstExecGraph::~AstExecGraph() {
|
||||
delete m_depGraphp; VL_DANGLING(m_depGraphp);
|
||||
}
|
||||
|
||||
bool AstVar::isSigPublic() const {
|
||||
return (m_sigPublic || (v3Global.opt.allPublic() && !isTemp() && !isGenVar()));
|
||||
}
|
||||
|
||||
bool AstVar::isScQuad() const {
|
||||
return (isSc() && isQuad() && !isScBv() && !isScBigUint());
|
||||
}
|
||||
|
||||
bool AstVar::isScBv() const {
|
||||
return ((isSc() && width() >= v3Global.opt.pinsBv()) || m_attrScBv);
|
||||
}
|
||||
|
||||
bool AstVar::isScUint() const {
|
||||
return ((isSc() && v3Global.opt.pinsScUint() && width() >= 2 && width() <= 64) && !isScBv());
|
||||
}
|
||||
|
||||
bool AstVar::isScBigUint() const {
|
||||
return ((isSc() && v3Global.opt.pinsScBigUint() && width() >= 65 && width() <= 512) && !isScBv());
|
||||
}
|
||||
@ -441,6 +447,16 @@ AstVar* AstVar::scVarRecurse(AstNode* nodep) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
string AstVar::mtasksString() const {
|
||||
std::ostringstream os;
|
||||
os<<" all: ";
|
||||
for (MTaskIdSet::const_iterator it = m_mtaskIds.begin();
|
||||
it != m_mtaskIds.end(); ++it) {
|
||||
os<<*it<<" ";
|
||||
}
|
||||
return os.str();
|
||||
}
|
||||
|
||||
AstNodeDType* AstNodeDType::dtypeDimensionp(int dimension) {
|
||||
// dimension passed from AstArraySel::dimension
|
||||
// Dimension 0 means the VAR itself, 1 is the closest SEL to the AstVar,
|
||||
@ -970,6 +986,11 @@ void AstSliceSel::dump(std::ostream& str) {
|
||||
str<<" decl"<<declRange();
|
||||
}
|
||||
}
|
||||
void AstMTaskBody::dump(std::ostream& str) {
|
||||
this->AstNode::dump(str);
|
||||
str<<" ";
|
||||
m_execMTaskp->dump(str);
|
||||
}
|
||||
void AstTypeTable::dump(std::ostream& str) {
|
||||
this->AstNode::dump(str);
|
||||
for (int i=0; i<(int)(AstBasicDTypeKwd::_ENUM_MAX); ++i) {
|
||||
|
@ -1124,6 +1124,7 @@ private:
|
||||
bool m_noSubst:1; // Do not substitute out references
|
||||
bool m_trace:1; // Trace this variable
|
||||
AstVarAttrClocker m_attrClocker;
|
||||
MTaskIdSet m_mtaskIds; // MTaskID's that read or write this var
|
||||
|
||||
void init() {
|
||||
m_input=false; m_output=false; m_tristate=false; m_declOutput=false;
|
||||
@ -1323,6 +1324,10 @@ public:
|
||||
if (varType()==AstVarType::INPUT || varType()==AstVarType::OUTPUT) m_varType = AstVarType::WIRE;
|
||||
}
|
||||
static AstVar* scVarRecurse(AstNode* nodep);
|
||||
void addProducingMTaskId(int id) { m_mtaskIds.insert(id); }
|
||||
void addConsumingMTaskId(int id) { m_mtaskIds.insert(id); }
|
||||
const MTaskIdSet& mtaskIds() const { return m_mtaskIds; }
|
||||
string mtasksString() const;
|
||||
};
|
||||
|
||||
class AstDefParam : public AstNode {
|
||||
@ -5698,6 +5703,44 @@ public:
|
||||
AstNode* bodysp() const { return op1p(); } // op1= expressions to print
|
||||
};
|
||||
|
||||
class AstMTaskBody : public AstNode {
|
||||
// Hold statements for each MTask
|
||||
private:
|
||||
ExecMTask* m_execMTaskp;
|
||||
public:
|
||||
explicit AstMTaskBody(FileLine* flp)
|
||||
: AstNode(flp)
|
||||
, m_execMTaskp(NULL) {}
|
||||
ASTNODE_NODE_FUNCS(MTaskBody);
|
||||
virtual const char* broken() const { BROKEN_RTN(!m_execMTaskp); return NULL; }
|
||||
AstNode* stmtsp() const { return op1p(); }
|
||||
void addStmtsp(AstNode* nodep) { addOp1p(nodep); }
|
||||
ExecMTask* execMTaskp() const { return m_execMTaskp; }
|
||||
void execMTaskp(ExecMTask* execMTaskp) { m_execMTaskp = execMTaskp; }
|
||||
virtual void dump(std::ostream& str=std::cout);
|
||||
};
|
||||
|
||||
class AstExecGraph : public AstNode {
|
||||
// For parallel execution, this node contains a dependency graph. Each
|
||||
// node in the graph is an ExecMTask, which contains a body for the
|
||||
// mtask, which contains a set of AstActive's, each of which calls a
|
||||
// leaf AstCFunc. whew!
|
||||
//
|
||||
// The mtask bodies are also children of this node, so we can visit
|
||||
// them without traversing the graph (it's not always needed to
|
||||
// traverse the graph.)
|
||||
private:
|
||||
V3Graph *m_depGraphp; // contains ExecMTask's
|
||||
public:
|
||||
explicit AstExecGraph(FileLine* fileline);
|
||||
ASTNODE_NODE_FUNCS_NO_DTOR(ExecGraph)
|
||||
virtual ~AstExecGraph();
|
||||
virtual const char* broken() const { BROKEN_RTN(!m_depGraphp); return NULL; }
|
||||
const V3Graph* depGraphp() const { return m_depGraphp; }
|
||||
V3Graph* mutableDepGraphp() { return m_depGraphp; }
|
||||
void addMTaskBody(AstMTaskBody* bodyp) { addOp1p(bodyp); }
|
||||
};
|
||||
|
||||
class AstSplitPlaceholder : public AstNode {
|
||||
public:
|
||||
// Dummy node used within V3Split; never exists outside of V3Split.
|
||||
@ -5749,12 +5792,14 @@ private:
|
||||
AstTypeTable* m_typeTablep; // Reference to top type table, for faster lookup
|
||||
AstPackage* m_dollarUnitPkgp;
|
||||
AstCFunc* m_evalp; // The '_eval' function
|
||||
AstExecGraph* m_execGraphp; // Execution MTask graph for threads>1 mode
|
||||
public:
|
||||
AstNetlist()
|
||||
: AstNode(new FileLine("AstRoot",0))
|
||||
, m_typeTablep(NULL)
|
||||
, m_dollarUnitPkgp(NULL)
|
||||
, m_evalp(NULL) { }
|
||||
, m_evalp(NULL)
|
||||
, m_execGraphp(NULL) { }
|
||||
ASTNODE_NODE_FUNCS(Netlist)
|
||||
virtual const char* broken() const {
|
||||
BROKEN_RTN(m_dollarUnitPkgp && !m_dollarUnitPkgp->brokeExists());
|
||||
@ -5784,6 +5829,8 @@ public:
|
||||
return m_dollarUnitPkgp; }
|
||||
AstCFunc* evalp() const { return m_evalp; }
|
||||
void evalp(AstCFunc* evalp) { m_evalp = evalp; }
|
||||
AstExecGraph* execGraphp() const { return m_execGraphp; }
|
||||
void execGraphp(AstExecGraph* graphp) { m_execGraphp = graphp; }
|
||||
};
|
||||
|
||||
//######################################################################
|
||||
|
@ -68,6 +68,7 @@ private:
|
||||
AstCFunc* m_settleFuncp; // Top settlement function we are creating
|
||||
AstSenTree* m_lastSenp; // Last sensitivity match, so we can detect duplicates.
|
||||
AstIf* m_lastIfp; // Last sensitivity if active to add more under
|
||||
AstMTaskBody* m_mtaskBodyp; // Current mtask body
|
||||
|
||||
// METHODS
|
||||
VL_DEBUG_FUNC; // Declare debug()
|
||||
@ -338,6 +339,30 @@ private:
|
||||
// Only empty blocks should be leftover on the non-top. Killem.
|
||||
if (nodep->stmtsp()) nodep->v3fatalSrc("Non-empty lower active");
|
||||
nodep->unlinkFrBack()->deleteTree(); VL_DANGLING(nodep);
|
||||
} else if (m_mtaskBodyp) {
|
||||
UINFO(4," TR ACTIVE "<<nodep<<endl);
|
||||
AstNode* stmtsp = nodep->stmtsp()->unlinkFrBackWithNext();
|
||||
if (nodep->hasClocked()) {
|
||||
if (nodep->hasInitial()) nodep->v3fatalSrc("Initial block should not have clock sensitivity");
|
||||
if (m_lastSenp && nodep->sensesp()->sameTree(m_lastSenp)) {
|
||||
UINFO(4," sameSenseTree\n");
|
||||
} else {
|
||||
clearLastSen();
|
||||
m_lastSenp = nodep->sensesp();
|
||||
// Make a new if statement
|
||||
m_lastIfp = makeActiveIf(m_lastSenp);
|
||||
m_mtaskBodyp->addStmtsp(m_lastIfp);
|
||||
}
|
||||
// Move statements to if
|
||||
m_lastIfp->addIfsp(stmtsp);
|
||||
} else if (nodep->hasInitial() || nodep->hasSettle()) {
|
||||
nodep->v3fatalSrc("MTask should not include initial/settle logic.");
|
||||
} else {
|
||||
// Combo logic. Move statements to mtask func.
|
||||
clearLastSen();
|
||||
m_mtaskBodyp->addStmtsp(stmtsp);
|
||||
}
|
||||
nodep->unlinkFrBack()->deleteTree(); VL_DANGLING(nodep);
|
||||
} else {
|
||||
UINFO(4," ACTIVE "<<nodep<<endl);
|
||||
AstNode* stmtsp = nodep->stmtsp()->unlinkFrBackWithNext();
|
||||
@ -372,6 +397,20 @@ private:
|
||||
nodep->unlinkFrBack()->deleteTree(); VL_DANGLING(nodep);
|
||||
}
|
||||
}
|
||||
virtual void visit(AstExecGraph* nodep) {
|
||||
for (m_mtaskBodyp = VN_CAST(nodep->op1p(), MTaskBody);
|
||||
m_mtaskBodyp;
|
||||
m_mtaskBodyp = VN_CAST(m_mtaskBodyp->nextp(), MTaskBody)) {
|
||||
clearLastSen();
|
||||
iterate(m_mtaskBodyp);
|
||||
}
|
||||
clearLastSen();
|
||||
// Move the ExecGraph into _eval. Its location marks the
|
||||
// spot where the graph will execute, relative to other
|
||||
// (serial) logic in the cycle.
|
||||
nodep->unlinkFrBack();
|
||||
addToEvalLoop(nodep);
|
||||
}
|
||||
|
||||
//--------------------
|
||||
// Default: Just iterate
|
||||
@ -391,6 +430,7 @@ public:
|
||||
m_lastSenp = NULL;
|
||||
m_lastIfp = NULL;
|
||||
m_scopep = NULL;
|
||||
m_mtaskBodyp = NULL;
|
||||
//
|
||||
iterate(nodep);
|
||||
// Allow downstream modules to find _eval()
|
||||
|
432
src/V3EmitC.cpp
432
src/V3EmitC.cpp
@ -34,6 +34,8 @@
|
||||
#include "V3EmitC.h"
|
||||
#include "V3EmitCBase.h"
|
||||
#include "V3Number.h"
|
||||
#include "V3PartitionGraph.h"
|
||||
#include "V3TSP.h"
|
||||
|
||||
#define VL_VALUE_STRING_MAX_WIDTH 8192 // We use a static char array in VL_VALUE_STRING
|
||||
|
||||
@ -103,7 +105,13 @@ public:
|
||||
puts("["+cvtToStr(arrayp->elementsConst())+"]");
|
||||
}
|
||||
}
|
||||
|
||||
void emitVarCmtChg(const AstVar* varp, string* curVarCmtp) {
|
||||
string newVarCmt = varp->mtasksString();
|
||||
if (*curVarCmtp != newVarCmt) {
|
||||
*curVarCmtp = newVarCmt;
|
||||
puts("// Begin mtask footprint "+*curVarCmtp+"\n");
|
||||
}
|
||||
}
|
||||
void emitTypedefs(AstNode* firstp) {
|
||||
bool first = true;
|
||||
for (AstNode* loopp=firstp; loopp; loopp = loopp->nextp()) {
|
||||
@ -783,6 +791,50 @@ public:
|
||||
virtual ~EmitCStmts() {}
|
||||
};
|
||||
|
||||
//######################################################################
|
||||
// Establish mtask variable sort order in mtasks mode
|
||||
|
||||
class EmitVarTspSorter : public V3TSP::TspStateBase {
|
||||
private:
|
||||
// MEMBERS
|
||||
const MTaskIdSet& m_mtaskIds; // Mtask we're ordering
|
||||
static unsigned m_serialNext; // Unique ID to establish serial order
|
||||
unsigned m_serial; // Serial ordering
|
||||
public:
|
||||
// CONSTRUCTORS
|
||||
explicit EmitVarTspSorter(const MTaskIdSet& mtaskIds)
|
||||
: m_mtaskIds(mtaskIds),
|
||||
m_serial(++m_serialNext) {}
|
||||
virtual ~EmitVarTspSorter() {}
|
||||
// METHODS
|
||||
bool operator<(const TspStateBase& other) const {
|
||||
return operator<(dynamic_cast<const EmitVarTspSorter&>(other));
|
||||
}
|
||||
bool operator<(const EmitVarTspSorter& other) const {
|
||||
return m_serial < other.m_serial;
|
||||
}
|
||||
const MTaskIdSet& mtaskIds() const { return m_mtaskIds; }
|
||||
virtual int cost(const TspStateBase* otherp) const {
|
||||
return cost(dynamic_cast<const EmitVarTspSorter*>(otherp));
|
||||
}
|
||||
virtual int cost(const EmitVarTspSorter* otherp) const {
|
||||
int cost = diffs(m_mtaskIds, otherp->m_mtaskIds);
|
||||
cost += diffs(otherp->m_mtaskIds, m_mtaskIds);
|
||||
return cost;
|
||||
}
|
||||
// Returns the number of elements in set_a that don't appear in set_b
|
||||
static int diffs(const MTaskIdSet& set_a, const MTaskIdSet& set_b) {
|
||||
int diffs = 0;
|
||||
for (MTaskIdSet::iterator it = set_a.begin();
|
||||
it != set_a.end(); ++it) {
|
||||
if (set_b.find(*it) == set_b.end()) ++diffs;
|
||||
}
|
||||
return diffs;
|
||||
}
|
||||
};
|
||||
|
||||
unsigned EmitVarTspSorter::m_serialNext = 0;
|
||||
|
||||
//######################################################################
|
||||
// Internal EmitC implementation
|
||||
|
||||
@ -873,6 +925,91 @@ class EmitCImp : EmitCStmts {
|
||||
return ofp;
|
||||
}
|
||||
|
||||
// Returns the number of cross-thread dependencies into mtaskp.
|
||||
// If >0, mtaskp must test whether its prereqs are done before starting,
|
||||
// and may need to block.
|
||||
static uint32_t packedMTaskMayBlock(const ExecMTask* mtaskp) {
|
||||
uint32_t result = 0;
|
||||
for (V3GraphEdge* edgep = mtaskp->inBeginp(); edgep; edgep = edgep->inNextp()) {
|
||||
const ExecMTask* prevp = dynamic_cast<ExecMTask*>(edgep->fromp());
|
||||
if (prevp->thread() != mtaskp->thread()) {
|
||||
++result;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void emitMTaskBody(AstMTaskBody* nodep) {
|
||||
ExecMTask* curExecMTaskp = nodep->execMTaskp();
|
||||
if (packedMTaskMayBlock(curExecMTaskp)) {
|
||||
puts("vlTOPp->__Vm_mt_" + cvtToStr(curExecMTaskp->id())
|
||||
+ ".waitUntilUpstreamDone(even_cycle);\n");
|
||||
}
|
||||
|
||||
string recName;
|
||||
if (v3Global.opt.profThreads()) {
|
||||
recName = "__Vprfthr_" + cvtToStr(curExecMTaskp->id());
|
||||
puts("VlProfileRec* " + recName + " = NULL;\n");
|
||||
// Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
|
||||
puts("if (VL_UNLIKELY(vlTOPp->__Vm_profile_cycle_start)) {\n");
|
||||
puts( recName + " = vlTOPp->__Vm_threadPoolp->profileAppend();\n");
|
||||
puts( recName + "->startRecord(VL_RDTSC_Q() - vlTOPp->__Vm_profile_cycle_start,");
|
||||
puts( " "+cvtToStr(curExecMTaskp->id())+ ",");
|
||||
puts( " "+cvtToStr(curExecMTaskp->cost())+");\n");
|
||||
puts("}\n");
|
||||
}
|
||||
puts("Verilated::mtaskId(" + cvtToStr(curExecMTaskp->id()) + ");\n");
|
||||
|
||||
// The actual body of calls to leaf functions
|
||||
iterateAndNextNull(nodep->stmtsp());
|
||||
|
||||
if (v3Global.opt.profThreads()) {
|
||||
// Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
|
||||
puts("if (VL_UNLIKELY("+recName+")) {\n");
|
||||
puts( recName + "->endRecord(VL_RDTSC_Q() - vlTOPp->__Vm_profile_cycle_start);\n");
|
||||
puts("}\n");
|
||||
}
|
||||
|
||||
// Flush message queue
|
||||
puts("Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);\n");
|
||||
|
||||
// For any downstream mtask that's on another thread, bump its
|
||||
// counter and maybe notify it.
|
||||
for (V3GraphEdge* edgep = curExecMTaskp->outBeginp();
|
||||
edgep; edgep = edgep->outNextp()) {
|
||||
const ExecMTask* nextp = dynamic_cast<ExecMTask*>(edgep->top());
|
||||
if (nextp->thread() != curExecMTaskp->thread()) {
|
||||
puts("vlTOPp->__Vm_mt_"+cvtToStr(nextp->id())
|
||||
+ ".signalUpstreamDone(even_cycle);\n");
|
||||
}
|
||||
}
|
||||
|
||||
// Run the next mtask inline
|
||||
const ExecMTask* nextp = curExecMTaskp->packNextp();
|
||||
if (nextp) {
|
||||
emitMTaskBody(nextp->bodyp());
|
||||
} else {
|
||||
// Unblock the fake "final" mtask
|
||||
puts("vlTOPp->__Vm_mt_final.signalUpstreamDone(even_cycle);\n");
|
||||
}
|
||||
}
|
||||
|
||||
virtual void visit(AstMTaskBody* nodep) {
|
||||
ExecMTask* mtp = nodep->execMTaskp();
|
||||
puts("\n");
|
||||
puts("void ");
|
||||
puts(modClassName(m_modp)+"::"+mtp->cFuncName());
|
||||
puts("(bool even_cycle, void* symtab) {\n");
|
||||
|
||||
// Declare and set vlSymsp
|
||||
puts(EmitCBaseVisitor::symClassVar() + " = ("
|
||||
+ EmitCBaseVisitor::symClassName() + "*)symtab;\n");
|
||||
puts(EmitCBaseVisitor::symTopAssign()+"\n");
|
||||
|
||||
emitMTaskBody(nodep);
|
||||
puts("}\n");
|
||||
}
|
||||
|
||||
//---------------------------------------
|
||||
// VISITORS
|
||||
using EmitCStmts::visit; // Suppress hidden overloaded virtual function warning
|
||||
@ -973,6 +1110,54 @@ class EmitCImp : EmitCStmts {
|
||||
emitVarReset(varp);
|
||||
}
|
||||
|
||||
virtual void visit(AstExecGraph* nodep) {
|
||||
if (nodep != v3Global.rootp()->execGraphp()) {
|
||||
nodep->v3fatalSrc("ExecGraph should be a singleton!");
|
||||
}
|
||||
// The location of the AstExecGraph within the containing _eval()
|
||||
// function is where we want to invoke the graph and wait for it to
|
||||
// complete. Do that now.
|
||||
//
|
||||
// Don't recurse to children -- this isn't the place to emit
|
||||
// function definitions for the nested CFuncs. We'll do that at the
|
||||
// end.
|
||||
puts("vlTOPp->__Vm_even_cycle = !vlTOPp->__Vm_even_cycle;\n");
|
||||
|
||||
// Build the list of initial mtasks to start
|
||||
std::vector<const ExecMTask*> execMTasks;
|
||||
|
||||
// Start each root mtask
|
||||
for (const V3GraphVertex* vxp = nodep->depGraphp()->verticesBeginp();
|
||||
vxp; vxp = vxp->verticesNextp()) {
|
||||
const ExecMTask* etp = dynamic_cast<const ExecMTask*>(vxp);
|
||||
if (etp->threadRoot()) execMTasks.push_back(etp);
|
||||
}
|
||||
if (execMTasks.size() >
|
||||
static_cast<unsigned>(v3Global.opt.threads())) {
|
||||
nodep->v3fatalSrc("More root mtasks than available threads");
|
||||
}
|
||||
|
||||
if (!execMTasks.empty()) {
|
||||
for (uint32_t i = 0; i < execMTasks.size(); ++i) {
|
||||
bool runInline = (i == execMTasks.size() - 1);
|
||||
if (runInline) {
|
||||
// The thread calling eval() will run this mtask inline,
|
||||
// along with its packed successors.
|
||||
puts(execMTasks[i]->cFuncName()
|
||||
+ "(vlTOPp->__Vm_even_cycle, vlSymsp);\n");
|
||||
puts("Verilated::mtaskId(0);\n");
|
||||
} else {
|
||||
// The other N-1 go to the thread pool.
|
||||
puts("vlTOPp->__Vm_threadPoolp->workerp("
|
||||
+ cvtToStr(i)+")->addTask("
|
||||
+ execMTasks[i]->cFuncName()
|
||||
+ ", vlTOPp->__Vm_even_cycle, vlSymsp);\n");
|
||||
}
|
||||
}
|
||||
puts("vlTOPp->__Vm_mt_final.waitUntilUpstreamDone(vlTOPp->__Vm_even_cycle);\n");
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------
|
||||
// ACCESSORS
|
||||
|
||||
@ -995,6 +1180,8 @@ class EmitCImp : EmitCStmts {
|
||||
void emitStaticDecl(AstNodeModule* modp);
|
||||
void emitSettleLoop(const std::string& eval_call, bool initial);
|
||||
void emitWrapEval(AstNodeModule* modp);
|
||||
void emitMTaskState();
|
||||
void emitMTaskVertexCtors(bool* firstp);
|
||||
void emitInt(AstNodeModule* modp);
|
||||
void maybeSplit(AstNodeModule* modp);
|
||||
|
||||
@ -1534,6 +1721,36 @@ void EmitCImp::emitCoverageDecl(AstNodeModule* modp) {
|
||||
}
|
||||
}
|
||||
|
||||
void EmitCImp::emitMTaskVertexCtors(bool* firstp) {
|
||||
AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
|
||||
if (!execGraphp) v3Global.rootp()->v3fatalSrc("Should have an execGraphp");
|
||||
const V3Graph* depGraphp = execGraphp->depGraphp();
|
||||
|
||||
unsigned finalEdgesInCt = 0;
|
||||
for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
|
||||
vxp; vxp = vxp->verticesNextp()) {
|
||||
const ExecMTask* mtp = dynamic_cast<const ExecMTask*>(vxp);
|
||||
unsigned edgesInCt = packedMTaskMayBlock(mtp);
|
||||
if (packedMTaskMayBlock(mtp) > 0) {
|
||||
emitCtorSep(firstp);
|
||||
puts("__Vm_mt_"+cvtToStr(mtp->id())+"("+cvtToStr(edgesInCt)+")");
|
||||
}
|
||||
// Each mtask with no packed successor will become a dependency
|
||||
// for the final node:
|
||||
if (!mtp->packNextp()) ++finalEdgesInCt;
|
||||
}
|
||||
|
||||
emitCtorSep(firstp);
|
||||
puts("__Vm_mt_final(" + cvtToStr(finalEdgesInCt) + ")");
|
||||
|
||||
// This will flip to 'true' before the start of the 0th cycle.
|
||||
emitCtorSep(firstp); puts("__Vm_threadPoolp(NULL)");
|
||||
if (v3Global.opt.profThreads()) {
|
||||
emitCtorSep(firstp); puts("__Vm_profile_cycle_start(0)");
|
||||
}
|
||||
emitCtorSep(firstp); puts("__Vm_even_cycle(false)");
|
||||
}
|
||||
|
||||
void EmitCImp::emitCtorImp(AstNodeModule* modp) {
|
||||
puts("\n");
|
||||
bool first = true;
|
||||
@ -1544,6 +1761,9 @@ void EmitCImp::emitCtorImp(AstNodeModule* modp) {
|
||||
first = false; // VL_CTOR_IMP includes the first ':'
|
||||
}
|
||||
emitVarCtors(&first);
|
||||
if (modp->isTop() && v3Global.opt.mtasks()) {
|
||||
emitMTaskVertexCtors(&first);
|
||||
}
|
||||
puts(" {\n");
|
||||
emitCellCtors(modp);
|
||||
emitSensitives();
|
||||
@ -1556,6 +1776,39 @@ void EmitCImp::emitCtorImp(AstNodeModule* modp) {
|
||||
putsDecoration("// Reset structure values\n");
|
||||
puts("_ctor_var_reset();\n");
|
||||
emitTextSection(AstType::atScCtor);
|
||||
|
||||
if (modp->isTop() && v3Global.opt.mtasks()) {
|
||||
// TODO-- For now each top module creates its own ThreadPool here,
|
||||
// and deletes it in the destructor. If A and B are each top level
|
||||
// modules, each creates a separate thread pool. This allows
|
||||
// A.eval() and B.eval() to run concurrently without any
|
||||
// interference -- so long as the physical machine has enough cores
|
||||
// to support both pools and all testbench threads.
|
||||
//
|
||||
// In the future, we might want to let the client provide a
|
||||
// threadpool to the constructor. This would allow two or more
|
||||
// models to share a single threadpool.
|
||||
//
|
||||
// For example: suppose models A and B are each compiled to run on
|
||||
// 4 threads. The client might create a single thread pool with 3
|
||||
// threads and pass it to both models. If the client can ensure tht
|
||||
// A.eval() and B.eval() do NOT run concurrently, there will be no
|
||||
// contention for the threads. This mode is missing for now. (Is
|
||||
// there demand for such a setup?)
|
||||
puts("__Vm_threadPoolp = new VlThreadPool("
|
||||
// Note we create N-1 threads in the thread pool. The thread
|
||||
// that calls eval() becomes the final Nth thread for the
|
||||
// duration of the eval call.
|
||||
+ cvtToStr(v3Global.opt.threads() - 1)
|
||||
+ ", " + cvtToStr(v3Global.opt.profThreads())
|
||||
+ ");\n");
|
||||
|
||||
if (v3Global.opt.profThreads()) {
|
||||
puts("__Vm_profile_cycle_start = 0;\n");
|
||||
puts("__Vm_profile_time_finished = 0;\n");
|
||||
puts("__Vm_profile_window_ct = 0;");
|
||||
}
|
||||
}
|
||||
puts("}\n");
|
||||
}
|
||||
|
||||
@ -1597,6 +1850,9 @@ void EmitCImp::emitCoverageImp(AstNodeModule* modp) {
|
||||
void EmitCImp::emitDestructorImp(AstNodeModule* modp) {
|
||||
puts("\n");
|
||||
puts(modClassName(modp)+"::~"+modClassName(modp)+"() {\n");
|
||||
if (modp->isTop() && v3Global.opt.mtasks()) {
|
||||
puts("delete __Vm_threadPoolp; __Vm_threadPoolp = NULL;\n");
|
||||
}
|
||||
emitTextSection(AstType::atScDtor);
|
||||
if (modp->isTop()) puts("delete __VlSymsp; __VlSymsp=NULL;\n");
|
||||
puts("}\n");
|
||||
@ -1796,9 +2052,47 @@ void EmitCImp::emitWrapEval(AstNodeModule* modp) {
|
||||
if (v3Global.opt.threads() == 1) {
|
||||
uint32_t mtaskId = 0;
|
||||
putsDecoration("// MTask "+cvtToStr(mtaskId)+" start\n");
|
||||
puts("VL_DEBUG_IF(VL_DBG_MSGF(\"MTask starting, mtaskId="+cvtToStr(mtaskId)+"\\n\"););\n");
|
||||
puts("VL_DEBUG_IF(VL_DBG_MSGF(\"MTask"+cvtToStr(mtaskId)+" starting\\n\"););\n");
|
||||
puts("Verilated::mtaskId("+cvtToStr(mtaskId)+");\n");
|
||||
}
|
||||
|
||||
if (v3Global.opt.mtasks()
|
||||
&& v3Global.opt.profThreads()) {
|
||||
puts("if (VL_UNLIKELY((Verilated::profThreadsStart() != __Vm_profile_time_finished)\n");
|
||||
puts( " && (VL_TIME_Q() > Verilated::profThreadsStart())\n");
|
||||
puts( " && (Verilated::profThreadsWindow() >= 1))) {\n");
|
||||
// Within a profile (either starting, middle, or end)
|
||||
puts( "if (vlTOPp->__Vm_profile_window_ct == 0) {\n"); // Opening file?
|
||||
// Start profile on this cycle. We'll capture a window worth, then
|
||||
// only analyze the next window worth. The idea is that the first window
|
||||
// capture will hit some cache-cold stuff (eg printf) but it'll be warm
|
||||
// by the time we hit the second window, we hope.
|
||||
puts( "vlTOPp->__Vm_profile_cycle_start = VL_RDTSC_Q();\n");
|
||||
// "* 2" as first half is warmup, second half is collection
|
||||
puts( "vlTOPp->__Vm_profile_window_ct = Verilated::profThreadsWindow() * 2 + 1;\n");
|
||||
puts( "}\n");
|
||||
puts( "--vlTOPp->__Vm_profile_window_ct;\n");
|
||||
puts( "if (vlTOPp->__Vm_profile_window_ct == (Verilated::profThreadsWindow())) {\n");
|
||||
// This barrier record in every threads' profile demarcates the
|
||||
// cache-warm-up cycles before the barrier from the actual profile
|
||||
// cycles afterward.
|
||||
puts( "vlTOPp->__Vm_threadPoolp->profileAppendAll(");
|
||||
puts( "VlProfileRec(VlProfileRec::Barrier()));\n");
|
||||
puts( "vlTOPp->__Vm_profile_cycle_start = VL_RDTSC_Q();\n");
|
||||
puts( "}\n");
|
||||
puts( "else if (vlTOPp->__Vm_profile_window_ct == 0) {\n");
|
||||
// Ending file.
|
||||
puts( "vluint64_t elapsed = VL_RDTSC_Q() - vlTOPp->__Vm_profile_cycle_start;\n");
|
||||
puts( "vlTOPp->__Vm_threadPoolp->profileDump(Verilated::profThreadsFilenamep(), elapsed);\n");
|
||||
// This turns off the test to enter the profiling code, but still
|
||||
// allows the user to collect another profile by changing
|
||||
// profThreadsStart
|
||||
puts( "__Vm_profile_time_finished = Verilated::profThreadsStart();\n");
|
||||
puts( "vlTOPp->__Vm_profile_cycle_start = 0;\n");
|
||||
puts( "}\n");
|
||||
puts("}\n");
|
||||
}
|
||||
|
||||
emitSettleLoop(
|
||||
(string("VL_DEBUG_IF(VL_DBG_MSGF(\"+ Clock loop\\n\"););\n")
|
||||
+ (v3Global.opt.trace() ? "vlSymsp->__Vm_activity = true;\n" : "")
|
||||
@ -1832,10 +2126,13 @@ void EmitCStmts::emitVarList(AstNode* firstp, EisWhich which, const string& pref
|
||||
// Put out a list of signal declarations
|
||||
// in order of 0:clocks, 1:vluint8, 2:vluint16, 4:vluint32, 5:vluint64, 6:wide, 7:arrays
|
||||
// This aids cache packing and locality
|
||||
// Largest->smallest reduces the number of pad variables.
|
||||
// But for now, Smallest->largest makes it more likely a small offset will allow access to the signal.
|
||||
// TODO: Move this sort to an earlier visitor stage.
|
||||
//
|
||||
// Largest->smallest reduces the number of pad variables. Also
|
||||
// experimented with alternating between large->small and small->large
|
||||
// on successive Mtask groups, but then when a new mtask gets added may
|
||||
// cause a huge delta.
|
||||
//
|
||||
// TODO: Move this sort to an earlier visitor stage.
|
||||
VarSortMap varAnonMap;
|
||||
VarSortMap varNonanonMap;
|
||||
|
||||
@ -1891,8 +2188,9 @@ void EmitCStmts::emitVarList(AstNode* firstp, EisWhich which, const string& pref
|
||||
|
||||
void EmitCStmts::emitVarSort(const VarSortMap& vmap, VarVec* sortedp) {
|
||||
UASSERT(sortedp->empty(), "Sorted should be initially empty");
|
||||
{
|
||||
// Plain old serial mode. Sort by size, from small to large.
|
||||
if (!v3Global.opt.mtasks()) {
|
||||
// Plain old serial mode. Sort by size, from small to large,
|
||||
// to optimize for both packing and small offsets in code.
|
||||
for (VarSortMap::const_iterator it = vmap.begin();
|
||||
it != vmap.end(); ++it) {
|
||||
for (VarVec::const_iterator jt = it->second.begin();
|
||||
@ -1900,12 +2198,52 @@ void EmitCStmts::emitVarSort(const VarSortMap& vmap, VarVec* sortedp) {
|
||||
sortedp->push_back(*jt);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// MacroTask mode. Sort by MTask-affinity group first, size second.
|
||||
typedef std::map<MTaskIdSet, VarSortMap> MTaskVarSortMap;
|
||||
MTaskVarSortMap m2v;
|
||||
for (VarSortMap::const_iterator it = vmap.begin(); it != vmap.end(); ++it) {
|
||||
int size_class = it->first;
|
||||
const VarVec& vec = it->second;
|
||||
for (VarVec::const_iterator jt = vec.begin(); jt != vec.end(); ++jt) {
|
||||
const AstVar* varp = *jt;
|
||||
m2v[varp->mtaskIds()][size_class].push_back(varp);
|
||||
}
|
||||
}
|
||||
|
||||
// Create a TSP sort state for each MTaskIdSet footprint
|
||||
V3TSP::StateVec states;
|
||||
for (MTaskVarSortMap::iterator it = m2v.begin(); it != m2v.end(); ++it) {
|
||||
states.push_back(new EmitVarTspSorter(it->first));
|
||||
}
|
||||
|
||||
// Do the TSP sort
|
||||
V3TSP::StateVec sorted_states;
|
||||
V3TSP::tspSort(states, &sorted_states);
|
||||
|
||||
for (V3TSP::StateVec::iterator it = sorted_states.begin();
|
||||
it != sorted_states.end(); ++it) {
|
||||
const EmitVarTspSorter* statep = dynamic_cast<const EmitVarTspSorter*>(*it);
|
||||
const VarSortMap& localVmap = m2v[statep->mtaskIds()];
|
||||
// use rbegin/rend to sort size large->small
|
||||
for (VarSortMap::const_reverse_iterator jt = localVmap.rbegin();
|
||||
jt != localVmap.rend(); ++jt) {
|
||||
const VarVec& vec = jt->second;
|
||||
for (VarVec::const_iterator kt = vec.begin();
|
||||
kt != vec.end(); ++kt) {
|
||||
sortedp->push_back(*kt);
|
||||
}
|
||||
}
|
||||
delete statep; VL_DANGLING(statep);
|
||||
}
|
||||
}
|
||||
|
||||
void EmitCStmts::emitSortedVarList(const VarVec& anons,
|
||||
const VarVec& nonanons,
|
||||
const string& prefixIfImp) {
|
||||
string curVarCmt = "";
|
||||
// Output anons
|
||||
{
|
||||
int anonMembers = anons.size();
|
||||
@ -1933,6 +2271,7 @@ void EmitCStmts::emitSortedVarList(const VarVec& anons,
|
||||
if (anonL1s != 1) puts("struct {\n");
|
||||
for (int l0=0; l0<lim && it != anons.end(); ++l0) {
|
||||
const AstVar* varp = *it;
|
||||
emitVarCmtChg(varp, &curVarCmt);
|
||||
emitVarDecl(varp, prefixIfImp);
|
||||
++it;
|
||||
}
|
||||
@ -1945,12 +2284,14 @@ void EmitCStmts::emitSortedVarList(const VarVec& anons,
|
||||
// Leftovers, just in case off by one error somewhere above
|
||||
for (; it != anons.end(); ++it) {
|
||||
const AstVar* varp = *it;
|
||||
emitVarCmtChg(varp, &curVarCmt);
|
||||
emitVarDecl(varp, prefixIfImp);
|
||||
}
|
||||
}
|
||||
// Output nonanons
|
||||
for (VarVec::const_iterator it = nonanons.begin(); it != nonanons.end(); ++it) {
|
||||
const AstVar* varp = *it;
|
||||
emitVarCmtChg(varp, &curVarCmt);
|
||||
emitVarDecl(varp, prefixIfImp);
|
||||
}
|
||||
}
|
||||
@ -1986,6 +2327,59 @@ void EmitCImp::emitIntFuncDecls(AstNodeModule* modp) {
|
||||
if (funcp->ifdef()!="") puts("#endif // "+funcp->ifdef()+"\n");
|
||||
}
|
||||
}
|
||||
|
||||
if (modp->isTop() && v3Global.opt.mtasks()) {
|
||||
// Emit the mtask func prototypes.
|
||||
AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
|
||||
if (!execGraphp) v3Global.rootp()->v3fatalSrc("Root should have an execGraphp");
|
||||
const V3Graph* depGraphp = execGraphp->depGraphp();
|
||||
for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
|
||||
vxp; vxp = vxp->verticesNextp()) {
|
||||
const ExecMTask* mtp = dynamic_cast<const ExecMTask*>(vxp);
|
||||
if (mtp->threadRoot()) {
|
||||
// Emit function declaration for this mtask
|
||||
ofp()->putsPrivate(true);
|
||||
puts("static void "); puts(mtp->cFuncName());
|
||||
puts("(bool even_cycle, void* symtab);\n");
|
||||
}
|
||||
}
|
||||
// No AstCFunc for this one, as it's synthetic. Just write it:
|
||||
puts("static void __Vmtask__final(bool even_cycle, void* symtab);\n");
|
||||
}
|
||||
}
|
||||
|
||||
void EmitCImp::emitMTaskState() {
|
||||
ofp()->putsPrivate(true);
|
||||
AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
|
||||
if (!execGraphp) v3Global.rootp()->v3fatalSrc("Root should have an execGraphp");
|
||||
|
||||
const V3Graph* depGraphp = execGraphp->depGraphp();
|
||||
for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
|
||||
vxp; vxp = vxp->verticesNextp()) {
|
||||
const ExecMTask* mtp = dynamic_cast<const ExecMTask*>(vxp);
|
||||
if (packedMTaskMayBlock(mtp) > 0) {
|
||||
puts("VlMTaskVertex __Vm_mt_" + cvtToStr(mtp->id()) + ";\n");
|
||||
}
|
||||
}
|
||||
// This fake mtask depends on all the real ones. We use it to block
|
||||
// eval() until all mtasks are done.
|
||||
//
|
||||
// In the future we might allow _eval() to return before the graph is
|
||||
// fully done executing, for "half wave" scheduling. For now we wait
|
||||
// for all mtasks though.
|
||||
puts("VlMTaskVertex __Vm_mt_final;\n");
|
||||
puts("VlThreadPool* __Vm_threadPoolp;\n");
|
||||
|
||||
if (v3Global.opt.profThreads()) {
|
||||
// rdtsc() at current cycle start
|
||||
puts("vluint64_t __Vm_profile_cycle_start;\n");
|
||||
// Time we finished analysis
|
||||
puts("vluint64_t __Vm_profile_time_finished;\n");
|
||||
// Track our position in the cache warmup and actual profile window
|
||||
puts("vluint32_t __Vm_profile_window_ct;\n");
|
||||
}
|
||||
|
||||
puts("bool __Vm_even_cycle;\n");
|
||||
}
|
||||
|
||||
void EmitCImp::emitInt(AstNodeModule* modp) {
|
||||
@ -2000,6 +2394,9 @@ void EmitCImp::emitInt(AstNodeModule* modp) {
|
||||
} else {
|
||||
puts("#include \"verilated.h\"\n");
|
||||
}
|
||||
if (v3Global.opt.mtasks()) {
|
||||
puts("#include \"verilated_threads.h\"\n");
|
||||
}
|
||||
if (v3Global.opt.savable()) {
|
||||
puts("#include \"verilated_save.h\"\n");
|
||||
}
|
||||
@ -2084,6 +2481,9 @@ void EmitCImp::emitInt(AstNodeModule* modp) {
|
||||
puts("bool __Vm_inhibitSim; ///< Set true to disable evaluation of module\n");
|
||||
}
|
||||
}
|
||||
if (modp->isTop() && v3Global.opt.mtasks()) {
|
||||
emitMTaskState();
|
||||
}
|
||||
emitCoverageDecl(modp); // may flip public/private
|
||||
|
||||
puts("\n// PARAMETERS\n");
|
||||
@ -2291,6 +2691,24 @@ void EmitCImp::main(AstNodeModule* modp, bool slow, bool fast) {
|
||||
}
|
||||
}
|
||||
|
||||
if (fast && modp->isTop() && v3Global.opt.mtasks()) {
|
||||
// Make a final pass and emit function definitions for the mtasks
|
||||
// in the ExecGraph
|
||||
AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
|
||||
const V3Graph* depGraphp = execGraphp->depGraphp();
|
||||
for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
|
||||
vxp; vxp = vxp->verticesNextp()) {
|
||||
const ExecMTask* mtaskp = dynamic_cast<const ExecMTask*>(vxp);
|
||||
if (mtaskp->threadRoot()) {
|
||||
maybeSplit(modp);
|
||||
// Only define one function for all the mtasks packed on
|
||||
// a given thread. We'll name this function after the
|
||||
// root mtask though it contains multiple mtasks' worth
|
||||
// of logic.
|
||||
iterate(mtaskp->bodyp());
|
||||
}
|
||||
}
|
||||
}
|
||||
delete m_ofp; m_ofp=NULL;
|
||||
}
|
||||
|
||||
|
@ -94,6 +94,9 @@ public:
|
||||
putMakeClassEntry(of, "verilated_vcd_sc.cpp");
|
||||
}
|
||||
}
|
||||
if (v3Global.opt.mtasks()) {
|
||||
putMakeClassEntry(of, "verilated_threads.cpp");
|
||||
}
|
||||
}
|
||||
else if (support==2 && slow) {
|
||||
}
|
||||
|
@ -131,7 +131,7 @@ public:
|
||||
"ALWCOMBORDER", "ASSIGNDLY", "ASSIGNIN",
|
||||
"BLKANDNBLK", "BLKLOOPINIT", "BLKSEQ", "BSSPACE",
|
||||
"CASEINCOMPLETE", "CASEOVERLAP", "CASEWITHX", "CASEX", "CDCRSTLOGIC", "CLKDATA",
|
||||
"CMPCONST", "COLONPLUS", "COMBDLY", "DEFPARAM", "DECLFILENAME",
|
||||
"CMPCONST", "COLONPLUS", "COMBDLY", "DEFPARAM", "DECLFILENAME",
|
||||
"ENDLABEL", "GENCLK",
|
||||
"IFDEPTH", "IMPERFECTSCH", "IMPLICIT", "IMPURE",
|
||||
"INCABSPATH", "INFINITELOOP", "INITIALDLY",
|
||||
|
@ -37,6 +37,8 @@
|
||||
#include VL_INCLUDE_UNORDERED_MAP
|
||||
|
||||
#include "V3Global.h"
|
||||
#include "V3PartitionGraph.h"
|
||||
#include "V3GraphPathChecker.h"
|
||||
#include "V3LifePost.h"
|
||||
#include "V3Stats.h"
|
||||
#include "V3Ast.h"
|
||||
@ -78,6 +80,11 @@ private:
|
||||
iterate(nodep->funcp());
|
||||
}
|
||||
}
|
||||
virtual void visit(AstExecGraph* nodep) {
|
||||
// Can just iterate across the MTask bodies in any order. Order
|
||||
// isn't important for LifePostElimVisitor's simple substitution.
|
||||
iterateChildren(nodep);
|
||||
}
|
||||
virtual void visit(AstCFunc* nodep) {
|
||||
if (!m_tracingCall && !nodep->entryPoint()) return;
|
||||
m_tracingCall = false;
|
||||
@ -101,11 +108,17 @@ public:
|
||||
// and a sequence number within the mtask:
|
||||
|
||||
struct LifeLocation {
|
||||
const ExecMTask* mtaskp;
|
||||
uint32_t sequence;
|
||||
public:
|
||||
LifeLocation() : sequence(0) {}
|
||||
LifeLocation(uint32_t sequence_) : sequence(sequence_) {}
|
||||
LifeLocation() : mtaskp(NULL), sequence(0) {}
|
||||
LifeLocation(const ExecMTask* mtaskp_, uint32_t sequence_)
|
||||
: mtaskp(mtaskp_), sequence(sequence_) {}
|
||||
bool operator< (const LifeLocation& b) const {
|
||||
unsigned a_id = mtaskp ? mtaskp->id() : 0;
|
||||
unsigned b_id = b.mtaskp ? b.mtaskp->id() : 0;
|
||||
if (a_id < b_id) { return true; }
|
||||
if (b_id < a_id) { return false; }
|
||||
return sequence < b.sequence;
|
||||
}
|
||||
};
|
||||
@ -130,6 +143,9 @@ private:
|
||||
|
||||
// STATE
|
||||
uint32_t m_sequence; // Sequence number of assigns/varrefs,
|
||||
// // local to the current MTask.
|
||||
const ExecMTask* m_execMTaskp; // Current ExecMTask being processed,
|
||||
// // or NULL for serial code.
|
||||
V3Double0 m_statAssnDel; // Statistic tracking
|
||||
bool m_tracingCall; // Currently tracing a CCall to a CFunc
|
||||
|
||||
@ -143,11 +159,15 @@ private:
|
||||
typedef vl_unordered_map<const AstVarScope*, LifePostLocation> PostLocMap;
|
||||
PostLocMap m_assignposts; // AssignPost dly var locations
|
||||
|
||||
const V3Graph* m_mtasksGraphp; // Mtask tracking graph
|
||||
vl_unique_ptr<GraphPathChecker> m_checker;
|
||||
|
||||
// METHODS
|
||||
VL_DEBUG_FUNC; // Declare debug()
|
||||
|
||||
static bool before(const LifeLocation& a, const LifeLocation& b) {
|
||||
return a.sequence < b.sequence;
|
||||
bool before(const LifeLocation& a, const LifeLocation& b) {
|
||||
if (a.mtaskp == b.mtaskp) return a.sequence < b.sequence;
|
||||
return m_checker->pathExistsFrom(a.mtaskp, b.mtaskp);
|
||||
}
|
||||
bool outsideCriticalArea(LifeLocation loc,
|
||||
const std::set<LifeLocation>& dlyVarAssigns,
|
||||
@ -159,6 +179,13 @@ private:
|
||||
// Otherwise, loc could fall in the "critical" area where the
|
||||
// substitution affects the result of the operation at loc, so
|
||||
// return false.
|
||||
if (!loc.mtaskp && assignPostLoc.mtaskp) {
|
||||
// This is threaded mode; 'loc' is something that happens at
|
||||
// initial/settle time, or perhaps in _eval() but outside of
|
||||
// the mtask graph.
|
||||
// In either case, it's not in the critical area.
|
||||
return true;
|
||||
}
|
||||
if (before(assignPostLoc, loc)) return true;
|
||||
for (std::set<LifeLocation>::iterator it = dlyVarAssigns.begin();
|
||||
it != dlyVarAssigns.end(); ++it) {
|
||||
@ -239,6 +266,17 @@ private:
|
||||
// within the mtask) where each varscope is read, and written.
|
||||
iterateChildren(nodep);
|
||||
|
||||
if (v3Global.opt.mtasks()) {
|
||||
if (!m_mtasksGraphp) {
|
||||
nodep->v3fatalSrc("Should have initted m_mtasksGraphp by now");
|
||||
}
|
||||
m_checker.reset(new GraphPathChecker(m_mtasksGraphp));
|
||||
} else {
|
||||
if (m_mtasksGraphp) {
|
||||
nodep->v3fatalSrc("Did not expect any m_mtasksGraphp in serial mode");
|
||||
}
|
||||
}
|
||||
|
||||
// Find all assignposts. Determine which ones can be
|
||||
// eliminated. Remove those, and mark their dly vars' user4 field
|
||||
// to indicate we should replace these dly vars with their original
|
||||
@ -252,7 +290,8 @@ private:
|
||||
// Consumption/generation of a variable,
|
||||
AstVarScope* vscp = nodep->varScopep();
|
||||
if (!vscp) nodep->v3fatalSrc("Scope not assigned");
|
||||
LifeLocation loc(++m_sequence);
|
||||
|
||||
LifeLocation loc(m_execMTaskp, ++m_sequence);
|
||||
if (nodep->lvalue()) {
|
||||
m_writes[vscp].insert(loc);
|
||||
} else {
|
||||
@ -275,7 +314,7 @@ private:
|
||||
if (m_assignposts.find(dlyVarp) != m_assignposts.end()) {
|
||||
nodep->v3fatalSrc("LifePostLocation attempted duplicate dlyvar map addition");
|
||||
}
|
||||
LifeLocation loc(++m_sequence);
|
||||
LifeLocation loc(m_execMTaskp, ++m_sequence);
|
||||
m_assignposts[dlyVarp] = LifePostLocation(loc, nodep);
|
||||
}
|
||||
}
|
||||
@ -291,6 +330,18 @@ private:
|
||||
iterate(nodep->funcp());
|
||||
}
|
||||
}
|
||||
virtual void visit(AstExecGraph* nodep) {
|
||||
// Treat the ExecGraph like a call to each mtask body
|
||||
m_mtasksGraphp = nodep->depGraphp();
|
||||
for (V3GraphVertex* mtaskVxp = m_mtasksGraphp->verticesBeginp();
|
||||
mtaskVxp; mtaskVxp = mtaskVxp->verticesNextp()) {
|
||||
ExecMTask* mtaskp = dynamic_cast<ExecMTask*>(mtaskVxp);
|
||||
m_execMTaskp = mtaskp;
|
||||
m_sequence = 0;
|
||||
iterate(mtaskp->bodyp());
|
||||
}
|
||||
m_execMTaskp = NULL;
|
||||
}
|
||||
virtual void visit(AstCFunc* nodep) {
|
||||
if (!m_tracingCall && !nodep->entryPoint()) return;
|
||||
m_tracingCall = false;
|
||||
@ -305,7 +356,9 @@ public:
|
||||
// CONSTRUCTORS
|
||||
explicit LifePostDlyVisitor(AstNetlist* nodep)
|
||||
: m_sequence(0)
|
||||
, m_tracingCall(false) {
|
||||
, m_execMTaskp(NULL)
|
||||
, m_tracingCall(false)
|
||||
, m_mtasksGraphp(NULL) {
|
||||
iterate(nodep);
|
||||
}
|
||||
virtual ~LifePostDlyVisitor() {
|
||||
|
@ -661,6 +661,9 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
|
||||
else if ( !strcmp (sw, "-debug-abort") ) { abort(); } // Undocumented, see also --debug-sigsegv
|
||||
else if ( onoff (sw, "-debug-check", flag/*ref*/) ){ m_debugCheck = flag; }
|
||||
else if ( onoff (sw, "-debug-leak", flag/*ref*/) ){ m_debugLeak = flag; }
|
||||
else if ( onoff (sw, "-debug-nondeterminism", flag/*ref*/) ){ m_debugNondeterminism = flag; }
|
||||
else if ( onoff (sw, "-debug-partition", flag/*ref*/) ){ m_debugPartition = flag; } // Undocumented
|
||||
else if ( onoff (sw, "-debug-self-test", flag/*ref*/) ){ m_debugSelfTest = flag; } // Undocumented
|
||||
else if ( !strcmp (sw, "-debug-sigsegv") ) { throwSigsegv(); } // Undocumented, see also --debug-abort
|
||||
else if ( !strcmp (sw, "-debug-fatalsrc") ) { v3fatalSrc("--debug-fatal-src"); } // Undocumented, see also --debug-abort
|
||||
else if ( onoff (sw, "-decoration", flag/*ref*/) ) { m_decoration = flag; }
|
||||
@ -678,6 +681,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
|
||||
else if ( !strcmp (sw, "-private") ) { m_public = false; }
|
||||
else if ( onoff (sw, "-prof-cfuncs", flag/*ref*/) ) { m_profCFuncs = flag; }
|
||||
else if ( onoff (sw, "-profile-cfuncs", flag/*ref*/) ) { m_profCFuncs = flag; } // Undocumented, for backward compat
|
||||
else if ( onoff (sw, "-prof-threads", flag/*ref*/) ) { m_profThreads = flag; }
|
||||
else if ( onoff (sw, "-public", flag/*ref*/) ) { m_public = flag; }
|
||||
else if ( !strncmp(sw, "-pvalue+", strlen("-pvalue+"))) { addParameter(string(sw+strlen("-pvalue+")), false); }
|
||||
else if ( onoff (sw, "-relative-cfuncs", flag/*ref*/) ) { m_relativeCFuncs = flag; }
|
||||
@ -689,6 +693,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
|
||||
else if ( onoff (sw, "-stats", flag/*ref*/) ) { m_stats = flag; }
|
||||
else if ( onoff (sw, "-stats-vars", flag/*ref*/) ) { m_statsVars = flag; m_stats |= flag; }
|
||||
else if ( !strcmp (sw, "-sv") ) { m_defaultLanguage = V3LangCode::L1800_2005; }
|
||||
else if ( onoff (sw, "-threads-coarsen", flag/*ref*/)) { m_threadsCoarsen = flag; } // Undocumented, debug
|
||||
else if ( onoff (sw, "-trace", flag/*ref*/) ) { m_trace = flag; }
|
||||
else if ( onoff (sw, "-trace-dups", flag/*ref*/) ) { m_traceDups = flag; }
|
||||
else if ( onoff (sw, "-trace-params", flag/*ref*/) ) { m_traceParams = flag; }
|
||||
@ -1013,6 +1018,20 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
|
||||
shift; m_threads = atoi(argv[i]);
|
||||
if (m_threads < 0) fl->v3fatal("--threads must be >= 0: "<<argv[i]);
|
||||
}
|
||||
else if ( !strcmp (sw, "-threads-dpi") && (i+1)<argc) {
|
||||
shift;
|
||||
if (!strcmp(argv[i], "all")) { m_threadsDpiPure=true; m_threadsDpiUnpure=true; }
|
||||
else if (!strcmp(argv[i], "none")) { m_threadsDpiPure=false; m_threadsDpiUnpure=false; }
|
||||
else if (!strcmp(argv[i], "pure")) { m_threadsDpiPure=true; m_threadsDpiUnpure=false; }
|
||||
else {
|
||||
fl->v3fatal("Unknown setting for --threads-dpi: "<<argv[i]);
|
||||
}
|
||||
}
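In short: "all" makes both pure and unpure DPI imports eligible to run concurrently, "none" serializes every import call, and "pure" lets only pure imports run concurrently. A hedged sketch of how the two flags set above might be consulted (the helper below is hypothetical, not a Verilator function):

    // Hypothetical illustration only.
    static bool dpiImportMayRunConcurrently(bool importIsPure,
                                            bool threadsDpiPure, bool threadsDpiUnpure) {
        return importIsPure ? threadsDpiPure : threadsDpiUnpure;
    }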
|
||||
else if ( !strcmp (sw, "-threads-max-mtasks") ) {
|
||||
shift; m_threadsMaxMTasks = atoi(argv[i]);
|
||||
if (m_threadsMaxMTasks < 1)
|
||||
fl->v3fatal("--threads-max-mtasks must be >= 1: "<<argv[i]);
|
||||
}
|
||||
else if ( !strcmp (sw, "-top-module") && (i+1)<argc ) {
|
||||
shift; m_topModule = argv[i];
|
||||
}
|
||||
@ -1223,6 +1242,9 @@ V3Options::V3Options() {
|
||||
m_coverageUser = false;
|
||||
m_debugCheck = false;
|
||||
m_debugLeak = true;
|
||||
m_debugNondeterminism = false;
|
||||
m_debugPartition = false;
|
||||
m_debugSelfTest = false;
|
||||
m_decoration = true;
|
||||
m_exe = false;
|
||||
m_ignc = false;
|
||||
@ -1237,6 +1259,7 @@ V3Options::V3Options() {
|
||||
m_pinsScBigUint = false;
|
||||
m_pinsUint8 = false;
|
||||
m_profCFuncs = false;
|
||||
m_profThreads = false;
|
||||
m_preprocOnly = false;
|
||||
m_preprocNoLine = false;
|
||||
m_public = false;
|
||||
@ -1249,6 +1272,10 @@ V3Options::V3Options() {
|
||||
m_statsVars = false;
|
||||
m_systemC = false;
|
||||
m_threads = 0;
|
||||
m_threadsDpiPure = true;
|
||||
m_threadsDpiUnpure = false;
|
||||
m_threadsCoarsen = true;
|
||||
m_threadsMaxMTasks = 0;
|
||||
m_trace = false;
|
||||
m_traceDups = false;
|
||||
m_traceParams = true;
|
||||
|
@ -75,7 +75,10 @@ class V3Options {
|
||||
bool m_coverageUnderscore;// main switch: --coverage-underscore
|
||||
bool m_coverageUser; // main switch: --coverage-func
|
||||
bool m_debugCheck; // main switch: --debug-check
|
||||
bool m_debugLeak; // main switch: --debug-leak
|
||||
bool m_debugLeak; // main switch: --debug-leak
|
||||
bool m_debugNondeterminism; // main switch: --debug-nondeterminism
|
||||
bool m_debugPartition; // main switch: --debug-partition
|
||||
bool m_debugSelfTest; // main switch: --debug-self-test
|
||||
bool m_decoration; // main switch: --decoration
|
||||
bool m_exe; // main switch: --exe
|
||||
bool m_ignc; // main switch: --ignc
|
||||
@ -87,6 +90,7 @@ class V3Options {
|
||||
bool m_pinsScBigUint;// main switch: --pins-sc-biguint
|
||||
bool m_pinsUint8; // main switch: --pins-uint8
|
||||
bool m_profCFuncs; // main switch: --prof-cfuncs
|
||||
bool m_profThreads; // main switch: --prof-threads
|
||||
bool m_public; // main switch: --public
|
||||
bool m_relativeCFuncs; // main switch: --relative-cfuncs
|
||||
bool m_relativeIncludes; // main switch: --relative-includes
|
||||
@ -96,6 +100,9 @@ class V3Options {
|
||||
bool m_skipIdentical;// main switch: --skip-identical
|
||||
bool m_stats; // main switch: --stats
|
||||
bool m_statsVars; // main switch: --stats-vars
|
||||
bool m_threadsCoarsen; // main switch: --threads-coarsen
|
||||
bool m_threadsDpiPure; // main switch: --threads-dpi all/pure
|
||||
bool m_threadsDpiUnpure; // main switch: --threads-dpi all
|
||||
bool m_trace; // main switch: --trace
|
||||
bool m_traceDups; // main switch: --trace-dups
|
||||
bool m_traceParams; // main switch: --trace-params
|
||||
@ -117,6 +124,7 @@ class V3Options {
|
||||
int m_outputSplitCTrace;// main switch: --output-split-ctrace
|
||||
int m_pinsBv; // main switch: --pins-bv
|
||||
int m_threads; // main switch: --threads (0 == --no-threads)
|
||||
int m_threadsMaxMTasks; // main switch: --threads-max-mtasks
|
||||
int m_traceDepth; // main switch: --trace-depth
|
||||
int m_traceMaxArray;// main switch: --trace-max-array
|
||||
int m_traceMaxWidth;// main switch: --trace-max-width
|
||||
@ -232,8 +240,14 @@ class V3Options {
|
||||
bool coverageUser() const { return m_coverageUser; }
|
||||
bool debugCheck() const { return m_debugCheck; }
|
||||
bool debugLeak() const { return m_debugLeak; }
|
||||
bool debugNondeterminism() const { return m_debugNondeterminism; }
|
||||
bool debugPartition() const { return m_debugPartition; }
|
||||
bool debugSelfTest() const { return m_debugSelfTest; }
|
||||
bool decoration() const { return m_decoration; }
|
||||
bool exe() const { return m_exe; }
|
||||
bool threadsDpiPure() const { return m_threadsDpiPure; }
|
||||
bool threadsDpiUnpure() const { return m_threadsDpiUnpure; }
|
||||
bool threadsCoarsen() const { return m_threadsCoarsen; }
|
||||
bool trace() const { return m_trace; }
|
||||
bool traceDups() const { return m_traceDups; }
|
||||
bool traceParams() const { return m_traceParams; }
|
||||
@ -246,6 +260,7 @@ class V3Options {
|
||||
bool pinsScBigUint() const { return m_pinsScBigUint; }
|
||||
bool pinsUint8() const { return m_pinsUint8; }
|
||||
bool profCFuncs() const { return m_profCFuncs; }
|
||||
bool profThreads() const { return m_profThreads; }
|
||||
bool allPublic() const { return m_public; }
|
||||
bool lintOnly() const { return m_lintOnly; }
|
||||
bool ignc() const { return m_ignc; }
|
||||
@ -267,6 +282,7 @@ class V3Options {
|
||||
int outputSplitCTrace() const { return m_outputSplitCTrace; }
|
||||
int pinsBv() const { return m_pinsBv; }
|
||||
int threads() const { return m_threads; }
|
||||
int threadsMaxMTasks() const { return m_threadsMaxMTasks; }
|
||||
bool mtasks() const { return (m_threads > 1); }
|
||||
int traceDepth() const { return m_traceDepth; }
|
||||
int traceMaxArray() const { return m_traceMaxArray; }
|
||||
|
267  src/V3Order.cpp
@ -89,19 +89,22 @@
|
||||
#include <sstream>
|
||||
#include <memory>
|
||||
|
||||
#include "V3Global.h"
|
||||
#include "V3File.h"
|
||||
#include "V3Ast.h"
|
||||
#include "V3Const.h"
|
||||
#include "V3EmitCBase.h"
|
||||
#include "V3EmitV.h"
|
||||
#include "V3File.h"
|
||||
#include "V3Global.h"
|
||||
#include "V3Graph.h"
|
||||
#include "V3GraphStream.h"
|
||||
#include "V3List.h"
|
||||
#include "V3Partition.h"
|
||||
#include "V3PartitionGraph.h"
|
||||
#include "V3SenTree.h"
|
||||
#include "V3Stats.h"
|
||||
#include "V3EmitCBase.h"
|
||||
#include "V3Const.h"
|
||||
|
||||
#include "V3Order.h"
|
||||
#include "V3OrderGraph.h"
|
||||
#include "V3EmitV.h"
|
||||
|
||||
#include VL_INCLUDE_UNORDERED_MAP
|
||||
#include VL_INCLUDE_UNORDERED_SET
|
||||
@ -423,10 +426,15 @@ class ProcessMoveBuildGraph {
|
||||
// OrderVisitor. It produces a slightly coarsened graph to drive the
|
||||
// code scheduling.
|
||||
//
|
||||
// * The new graph contains nodes of type OrderMoveVertex.
|
||||
// * For the serial code scheduler, the new graph contains
|
||||
// nodes of type OrderMoveVertex.
|
||||
//
|
||||
// * For the threaded code scheduler, the new graph contains
|
||||
// nodes of type MTaskMoveVertex.
|
||||
//
|
||||
// * The difference in output type is abstracted away by the
|
||||
// 'T_MoveVertex' template parameter.
|
||||
// 'T_MoveVertex' template parameter; ProcessMoveBuildGraph otherwise
|
||||
// works the same way for both cases.
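A standalone sketch (illustrative only, not Verilator code; the names are hypothetical) of the pattern this comment describes: the graph-building logic is written once against a small vertex-maker interface, so the serial and threaded schedulers differ only in the vertex type they plug in.

    #include <vector>

    template <class T_MoveVertex>
    class MoveGraphBuilderSketch {
    public:
        // The maker hides which concrete vertex type gets created.
        class MoveVertexMaker {
        public:
            virtual T_MoveVertex* makeVertexp(int logicId) = 0;
            virtual void freeVertexp(T_MoveVertex* vtxp) = 0;
            virtual ~MoveVertexMaker() {}
        };
        explicit MoveGraphBuilderSketch(MoveVertexMaker* makerp) : m_makerp(makerp) {}
        void build(int nLogic) {
            for (int i = 0; i < nLogic; ++i) {
                // A maker may return NULL to exclude a node, as the mtask
                // maker below does for initial/settle logic.
                T_MoveVertex* vtxp = m_makerp->makeVertexp(i);
                if (vtxp) m_vertices.push_back(vtxp);
            }
        }
    private:
        MoveVertexMaker* m_makerp;
        std::vector<T_MoveVertex*> m_vertices;
    };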
|
||||
|
||||
// TYPES
|
||||
typedef std::pair<const V3GraphVertex*, const AstSenTree*> VxDomPair;
|
||||
@ -563,7 +571,7 @@ private:
|
||||
};
|
||||
|
||||
//######################################################################
|
||||
// OrderMoveVertexMaker
|
||||
// OrderMoveVertexMaker and related
|
||||
|
||||
class OrderMoveVertexMaker
|
||||
: public ProcessMoveBuildGraph<OrderMoveVertex>::MoveVertexMaker {
|
||||
@ -595,6 +603,64 @@ private:
|
||||
VL_UNCOPYABLE(OrderMoveVertexMaker);
|
||||
};
|
||||
|
||||
class OrderMTaskMoveVertexMaker
|
||||
: public ProcessMoveBuildGraph<MTaskMoveVertex>::MoveVertexMaker {
|
||||
V3Graph* m_pomGraphp;
|
||||
public:
|
||||
explicit OrderMTaskMoveVertexMaker(V3Graph* pomGraphp)
|
||||
: m_pomGraphp(pomGraphp) {}
|
||||
MTaskMoveVertex* makeVertexp(OrderLogicVertex* lvertexp,
|
||||
const OrderEitherVertex* varVertexp,
|
||||
const AstScope* scopep,
|
||||
const AstSenTree* domainp) {
|
||||
// Exclude initial/settle logic from the mtasks graph.
|
||||
// We'll output time-zero logic separately.
|
||||
if (domainp->hasInitial() || domainp->hasSettle()) {
|
||||
return NULL;
|
||||
}
|
||||
return new MTaskMoveVertex(m_pomGraphp, lvertexp, varVertexp, scopep, domainp);
|
||||
}
|
||||
void freeVertexp(MTaskMoveVertex* freeMep) {
|
||||
freeMep->unlinkDelete(m_pomGraphp);
|
||||
}
|
||||
private:
|
||||
VL_UNCOPYABLE(OrderMTaskMoveVertexMaker);
|
||||
};
|
||||
|
||||
class OrderVerticesByDomainThenScope {
|
||||
PartPtrIdMap m_ids;
|
||||
public:
|
||||
virtual bool operator()(const V3GraphVertex* lhsp,
|
||||
const V3GraphVertex* rhsp) const {
|
||||
const MTaskMoveVertex* l_vxp = dynamic_cast<const MTaskMoveVertex*>(lhsp);
|
||||
const MTaskMoveVertex* r_vxp = dynamic_cast<const MTaskMoveVertex*>(rhsp);
|
||||
vluint64_t l_id = m_ids.findId(l_vxp->domainp());
|
||||
vluint64_t r_id = m_ids.findId(r_vxp->domainp());
|
||||
if (l_id < r_id) return true;
|
||||
if (l_id > r_id) return false;
|
||||
l_id = m_ids.findId(l_vxp->scopep());
|
||||
r_id = m_ids.findId(r_vxp->scopep());
|
||||
return l_id < r_id;
|
||||
}
|
||||
};
|
||||
|
||||
class MTaskVxIdLessThan {
|
||||
public:
|
||||
MTaskVxIdLessThan() {}
|
||||
virtual ~MTaskVxIdLessThan() {}
|
||||
|
||||
// Sort vertex's, which must be AbstractMTask's, into a deterministic
|
||||
// order by comparing their serial IDs.
|
||||
virtual bool operator()(const V3GraphVertex* lhsp,
|
||||
const V3GraphVertex* rhsp) const {
|
||||
const AbstractMTask* lmtaskp =
|
||||
dynamic_cast<const AbstractLogicMTask*>(lhsp);
|
||||
const AbstractMTask* rmtaskp =
|
||||
dynamic_cast<const AbstractLogicMTask*>(rhsp);
|
||||
return lmtaskp->id() < rmtaskp->id();
|
||||
}
|
||||
};
|
||||
|
||||
//######################################################################
|
||||
// Order class functions
|
||||
|
||||
@ -701,6 +767,7 @@ private:
|
||||
void processDomainsIterate(OrderEitherVertex* vertexp);
|
||||
void processEdgeReport();
|
||||
|
||||
// processMove* routines schedule serial execution
|
||||
void processMove();
|
||||
void processMoveClear();
|
||||
void processMoveBuildGraph();
|
||||
@ -711,6 +778,18 @@ private:
|
||||
AstActive* processMoveOneLogic(const OrderLogicVertex* lvertexp,
|
||||
AstCFunc*& newFuncpr, int& newStmtsr);
|
||||
|
||||
// processMTask* routines schedule threaded execution
|
||||
struct MTaskState {
|
||||
typedef std::list<const OrderLogicVertex*> Logics;
|
||||
AstMTaskBody* m_mtaskBodyp;
|
||||
Logics m_logics;
|
||||
ExecMTask* m_execMTaskp;
|
||||
MTaskState() : m_mtaskBodyp(NULL), m_execMTaskp(NULL) {}
|
||||
};
|
||||
void processMTasks();
|
||||
typedef enum {LOGIC_INITIAL, LOGIC_SETTLE} InitialLogicE;
|
||||
void processMTasksInitial(InitialLogicE logic_type);
|
||||
|
||||
string cfuncName(AstNodeModule* modp, AstSenTree* domainp, AstScope* scopep, AstNode* forWhatp) {
|
||||
modp->user3Inc();
|
||||
int funcnum = modp->user3();
|
||||
@ -1726,6 +1805,173 @@ AstActive* OrderVisitor::processMoveOneLogic(const OrderLogicVertex* lvertexp,
|
||||
return activep;
|
||||
}
|
||||
|
||||
void OrderVisitor::processMTasksInitial(InitialLogicE logic_type) {
|
||||
// Emit initial/settle logic. Initial blocks won't be part of the
|
||||
    // mtask partition, and aren't eligible for parallelism.
|
||||
//
|
||||
int initStmts = 0;
|
||||
AstCFunc* initCFunc = NULL;
|
||||
AstScope* lastScopep = NULL;
|
||||
for (V3GraphVertex* initVxp = m_graph.verticesBeginp();
|
||||
initVxp; initVxp = initVxp->verticesNextp()) {
|
||||
OrderLogicVertex* initp = dynamic_cast<OrderLogicVertex*>(initVxp);
|
||||
if (!initp) continue;
|
||||
if ((logic_type == LOGIC_INITIAL)
|
||||
&& !initp->domainp()->hasInitial()) continue;
|
||||
if ((logic_type == LOGIC_SETTLE)
|
||||
&& !initp->domainp()->hasSettle()) continue;
|
||||
if (initp->scopep() != lastScopep) {
|
||||
// Start new cfunc, don't let the cfunc cross scopes
|
||||
initCFunc = NULL;
|
||||
lastScopep = initp->scopep();
|
||||
}
|
||||
AstActive* newActivep = processMoveOneLogic(initp, initCFunc/*ref*/, initStmts/*ref*/);
|
||||
if (newActivep) m_scopetopp->addActivep(newActivep);
|
||||
}
|
||||
}
|
||||
|
||||
void OrderVisitor::processMTasks() {
|
||||
// For nondeterminism debug:
|
||||
V3Partition::hashGraphDebug(&m_graph, "V3Order's m_graph");
|
||||
|
||||
processMTasksInitial(LOGIC_INITIAL);
|
||||
processMTasksInitial(LOGIC_SETTLE);
|
||||
|
||||
// We already produced a graph of every var, input, logic, and settle
|
||||
// block and all dependencies; this is 'm_graph'.
|
||||
//
|
||||
// Now, starting from m_graph, make a slightly-coarsened graph representing
|
||||
// only logic, and discarding edges we know we can ignore.
|
||||
// This is quite similar to the 'm_pomGraph' of the serial code gen:
|
||||
V3Graph logicGraph;
|
||||
OrderMTaskMoveVertexMaker create_mtask_vertex(&logicGraph);
|
||||
ProcessMoveBuildGraph<MTaskMoveVertex> mtask_pmbg(
|
||||
&m_graph, &logicGraph, &create_mtask_vertex);
|
||||
mtask_pmbg.build();
|
||||
|
||||
// Needed? We do this for m_pomGraph in serial mode, so do it here too:
|
||||
logicGraph.removeRedundantEdges(&V3GraphEdge::followAlwaysTrue);
|
||||
|
||||
// Partition logicGraph into LogicMTask's. The partitioner will annotate
|
||||
// each vertex in logicGraph with a 'color' which is really an mtask ID
|
||||
// in this context.
|
||||
V3Partition partitioner(&logicGraph);
|
||||
V3Graph mtasks;
|
||||
partitioner.go(&mtasks);
|
||||
|
||||
vl_unordered_map<unsigned /*mtask id*/, MTaskState> mtaskStates;
|
||||
|
||||
// Iterate through the entire logicGraph. For each logic node,
|
||||
// attach it to a per-MTask ordered list of logic nodes.
|
||||
// This is the order we'll execute logic nodes within the MTask.
|
||||
//
|
||||
// MTasks may span scopes and domains, so sort by both here:
|
||||
GraphStream<OrderVerticesByDomainThenScope> emit_logic(&logicGraph);
|
||||
const V3GraphVertex* moveVxp;
|
||||
while ((moveVxp = emit_logic.nextp())) {
|
||||
const MTaskMoveVertex* movep =
|
||||
dynamic_cast<const MTaskMoveVertex*>(moveVxp);
|
||||
unsigned mtaskId = movep->color();
|
||||
UASSERT(mtaskId > 0,
|
||||
"Every MTaskMoveVertex should have an mtask assignment >0");
|
||||
if (movep->logicp()) {
|
||||
// Add this logic to the per-mtask order
|
||||
mtaskStates[mtaskId].m_logics.push_back(movep->logicp());
|
||||
|
||||
// Since we happen to be iterating over every logic node,
|
||||
// take this opportunity to annotate each AstVar with the id's
|
||||
// of mtasks that consume it and produce it. We'll use this
|
||||
// information in V3EmitC when we lay out var's in memory.
|
||||
const OrderLogicVertex* logicp = movep->logicp();
|
||||
for (const V3GraphEdge* edgep = logicp->inBeginp();
|
||||
edgep; edgep = edgep->inNextp()) {
|
||||
const OrderVarVertex* pre_varp =
|
||||
dynamic_cast<const OrderVarVertex*>(edgep->fromp());
|
||||
if (!pre_varp) continue;
|
||||
AstVar* varp = pre_varp->varScp()->varp();
|
||||
// varp depends on logicp, so logicp produces varp,
|
||||
// and vice-versa below
|
||||
varp->addProducingMTaskId(mtaskId);
|
||||
}
|
||||
for (const V3GraphEdge* edgep = logicp->outBeginp();
|
||||
edgep; edgep = edgep->outNextp()) {
|
||||
const OrderVarVertex* post_varp
|
||||
= dynamic_cast<const OrderVarVertex*>(edgep->top());
|
||||
if (!post_varp) continue;
|
||||
AstVar* varp = post_varp->varScp()->varp();
|
||||
varp->addConsumingMTaskId(mtaskId);
|
||||
}
|
||||
// TODO? We ignore IO vars here, so those will have empty mtask
|
||||
// signatures. But we could also give those mtask signatures.
|
||||
}
|
||||
}
|
||||
|
||||
// Create the AstExecGraph node which represents the execution
|
||||
// of the MTask graph.
|
||||
FileLine* rootFlp = new FileLine("AstRoot", 0);
|
||||
AstExecGraph* execGraphp = new AstExecGraph(rootFlp);
|
||||
m_scopetopp->addActivep(execGraphp);
|
||||
v3Global.rootp()->execGraphp(execGraphp);
|
||||
|
||||
// Create CFuncs and bodies for each MTask.
|
||||
GraphStream<MTaskVxIdLessThan> emit_mtasks(&mtasks);
|
||||
const V3GraphVertex* mtaskVxp;
|
||||
while ((mtaskVxp = emit_mtasks.nextp())) {
|
||||
const AbstractLogicMTask* mtaskp =
|
||||
dynamic_cast<const AbstractLogicMTask*>(mtaskVxp);
|
||||
|
||||
// Create a body for this mtask
|
||||
AstMTaskBody* bodyp = new AstMTaskBody(rootFlp);
|
||||
MTaskState& state = mtaskStates[mtaskp->id()];
|
||||
state.m_mtaskBodyp = bodyp;
|
||||
|
||||
// Create leaf CFunc's to run this mtask's logic,
|
||||
// and create a set of AstActive's to call those CFuncs.
|
||||
// Add the AstActive's into the AstMTaskBody.
|
||||
const AstSenTree* last_domainp = NULL;
|
||||
AstCFunc* leafCFuncp = NULL;
|
||||
int leafStmts = 0;
|
||||
for (MTaskState::Logics::iterator it = state.m_logics.begin();
|
||||
it != state.m_logics.end(); ++it) {
|
||||
const OrderLogicVertex* logicp = *it;
|
||||
if (logicp->domainp() != last_domainp) {
|
||||
// Start a new leaf function.
|
||||
leafCFuncp = NULL;
|
||||
}
|
||||
last_domainp = logicp->domainp();
|
||||
|
||||
AstActive* newActivep = processMoveOneLogic(logicp, leafCFuncp/*ref*/, leafStmts/*ref*/);
|
||||
if (newActivep) bodyp->addStmtsp(newActivep);
|
||||
}
|
||||
|
||||
// Translate the LogicMTask graph into the corresponding ExecMTask
|
||||
// graph, which will outlive V3Order and persist for the remainder
|
||||
// of verilator's processing.
|
||||
// - The LogicMTask graph points to MTaskMoveVertex's
|
||||
// and OrderLogicVertex's which are ephemeral to V3Order.
|
||||
// - The ExecMTask graph and the AstMTaskBody's produced here
|
||||
// persist until code generation time.
|
||||
state.m_execMTaskp =
|
||||
new ExecMTask(execGraphp->mutableDepGraphp(),
|
||||
bodyp, mtaskp->id());
|
||||
// Cross-link each ExecMTask and MTaskBody
|
||||
// Q: Why even have two objects?
|
||||
// A: One is an AstNode, the other is a GraphVertex,
|
||||
// to combine them would involve multiple inheritance...
|
||||
state.m_mtaskBodyp->execMTaskp(state.m_execMTaskp);
|
||||
for (V3GraphEdge* inp = mtaskp->inBeginp();
|
||||
inp; inp = inp->inNextp()) {
|
||||
const V3GraphVertex* fromVxp = inp->fromp();
|
||||
const AbstractLogicMTask* fromp =
|
||||
dynamic_cast<const AbstractLogicMTask*>(fromVxp);
|
||||
MTaskState& fromState = mtaskStates[fromp->id()];
|
||||
new V3GraphEdge(execGraphp->mutableDepGraphp(),
|
||||
fromState.m_execMTaskp, state.m_execMTaskp, 1);
|
||||
}
|
||||
execGraphp->addMTaskBody(bodyp);
|
||||
}
|
||||
}
|
||||
|
||||
//######################################################################
|
||||
// OrderVisitor - Top processing
|
||||
|
||||
@ -1762,7 +2008,7 @@ void OrderVisitor::process() {
|
||||
|
||||
if (debug() && v3Global.opt.dumpTree()) processEdgeReport();
|
||||
|
||||
{
|
||||
if (!v3Global.opt.mtasks()) {
|
||||
UINFO(2," Construct Move Graph...\n");
|
||||
processMoveBuildGraph();
|
||||
if (debug()>=4) m_pomGraph.dumpDotFilePrefixed("ordermv_start"); // Different prefix (ordermv) as it's not the same graph
|
||||
@ -1771,6 +2017,9 @@ void OrderVisitor::process() {
|
||||
|
||||
UINFO(2," Move...\n");
|
||||
processMove();
|
||||
} else {
|
||||
UINFO(2," Set up mtasks...\n");
|
||||
processMTasks();
|
||||
}
|
||||
|
||||
// Any SC inputs feeding a combo domain must be marked, so we can make them sc_sensitive
|
||||
|
@ -21,6 +21,7 @@
|
||||
//
|
||||
// V3GraphVertex
|
||||
// OrderMoveVertex
|
||||
// MTaskMoveVertex
|
||||
// OrderEitherVertex
|
||||
// OrderInputsVertex
|
||||
// OrderSettleVertex
|
||||
@ -47,6 +48,7 @@
|
||||
#include "verilatedos.h"
|
||||
#include "V3Ast.h"
|
||||
#include "V3Graph.h"
|
||||
#include VL_INCLUDE_UNORDERED_MAP
|
||||
|
||||
class OrderVisitor;
|
||||
class OrderMoveVertex;
|
||||
@ -363,6 +365,57 @@ public:
|
||||
void domScopep(OrderMoveDomScope* ds) { m_domScopep=ds; }
|
||||
};
|
||||
|
||||
// Similar to OrderMoveVertex, but modified for threaded code generation.
|
||||
class MTaskMoveVertex : public V3GraphVertex {
|
||||
// This could be more compact, since we know m_varp and m_logicp
|
||||
// cannot both be set. Each MTaskMoveVertex represents a logic node
|
||||
// or a var node, it can't be both.
|
||||
OrderLogicVertex* m_logicp; // Logic represented by this vertex
|
||||
const OrderEitherVertex* m_varp; // Var represented by this vertex
|
||||
const AstScope* m_scopep;
|
||||
const AstSenTree* m_domainp;
|
||||
|
||||
protected:
|
||||
friend class OrderVisitor;
|
||||
friend class MTaskMoveVertexMaker;
|
||||
public:
|
||||
MTaskMoveVertex(V3Graph* graphp, OrderLogicVertex* logicp,
|
||||
const OrderEitherVertex* varp,
|
||||
const AstScope* scopep, const AstSenTree* domainp)
|
||||
: V3GraphVertex(graphp), m_logicp(logicp),
|
||||
m_varp(varp), m_scopep(scopep), m_domainp(domainp) {
|
||||
UASSERT(!(logicp && varp),
|
||||
"MTaskMoveVertex: logicp and varp may not both be set!\n");
|
||||
}
|
||||
virtual ~MTaskMoveVertex() {}
|
||||
virtual MTaskMoveVertex* clone(V3Graph* graphp) const {
|
||||
v3fatalSrc("Unsupported"); return NULL; }
|
||||
virtual OrderVEdgeType type() const { return OrderVEdgeType::VERTEX_MOVE; }
|
||||
virtual string dotColor() const {
|
||||
if (logicp()) return logicp()->dotColor();
|
||||
else return "yellow";
|
||||
}
|
||||
virtual string name() const {
|
||||
string nm;
|
||||
if (logicp()) {
|
||||
nm = logicp()->name();
|
||||
nm += (string("\\nMV:")
|
||||
+" d="+cvtToStr((void*)logicp()->domainp())
|
||||
+" s="+cvtToStr((void*)logicp()->scopep())
|
||||
// "color()" represents the mtask ID.
|
||||
+"\\nt="+cvtToStr(color()));
|
||||
} else {
|
||||
nm = "nolog\\nt="+cvtToStr(color());
|
||||
}
|
||||
return nm;
|
||||
}
|
||||
// ACCESSORS
|
||||
OrderLogicVertex* logicp() const { return m_logicp; }
|
||||
const OrderEitherVertex* varp() const { return m_varp; }
|
||||
const AstScope* scopep() const { return m_scopep; }
|
||||
const AstSenTree* domainp() const { return m_domainp; }
|
||||
};
|
||||
|
||||
//######################################################################
|
||||
// Edge types
|
||||
|
||||
|
2759  src/V3Partition.cpp (new file; diff suppressed because it is too large)
99  src/V3Partition.h (new file)
@ -0,0 +1,99 @@
|
||||
// -*- mode: C++; c-file-style: "cc-mode" -*-
|
||||
//*************************************************************************
|
||||
// DESCRIPTION: Verilator: Threading's logic to mtask partitioner
|
||||
//
|
||||
// Code available from: http://www.veripool.org/verilator
|
||||
//
|
||||
//*************************************************************************
|
||||
//
|
||||
// Copyright 2003-2018 by Wilson Snyder. This program is free software; you can
|
||||
// redistribute it and/or modify it under the terms of either the GNU
|
||||
// Lesser General Public License Version 3 or the Perl Artistic License
|
||||
// Version 2.0.
|
||||
//
|
||||
// Verilator is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
//*************************************************************************
|
||||
|
||||
#ifndef _V3PARTITION_H_
|
||||
#define _V3PARTITION_H_
|
||||
|
||||
#include "config_build.h"
|
||||
#include "verilatedos.h"
|
||||
#include <list>
|
||||
|
||||
#include "V3Graph.h"
|
||||
#include "V3OrderGraph.h"
|
||||
|
||||
class LogicMTask;
|
||||
typedef vl_unordered_map<const MTaskMoveVertex*, LogicMTask*> Vx2MTaskMap;
|
||||
|
||||
//*************************************************************************
|
||||
/// V3Partition takes the fine-grained logic graph from V3Order and
|
||||
/// collapses it into a coarse-grained graph of AbstractLogicMTask's, each
|
||||
/// of which contains a set of the logic nodes from the fine-grained
|
||||
/// graph.
|
||||
|
||||
class V3Partition {
|
||||
// MEMBERS
|
||||
V3Graph* m_fineDepsGraphp; // Fine-grained dependency graph
|
||||
public:
|
||||
// CONSTRUCTORS
|
||||
explicit V3Partition(V3Graph* fineDepsGraphp)
|
||||
: m_fineDepsGraphp(fineDepsGraphp) {}
|
||||
~V3Partition() {}
|
||||
|
||||
// METHODS
|
||||
|
||||
// Fill in the provided empty graph with AbstractLogicMTask's and their
|
||||
// interdependencies.
|
||||
void go(V3Graph* mtasksp);
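    // A short usage sketch, mirroring the call site added in V3Order.cpp's
    // processMTasks() above (variable names are illustrative):
    //     V3Graph logicGraph;                    // fine-grained logic dependencies (input)
    //     V3Graph mtasks;                        // coarse AbstractLogicMTask graph (output)
    //     V3Partition partitioner(&logicGraph);
    //     partitioner.go(&mtasks);               // fills 'mtasks' with mtasks + their edges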
|
||||
|
||||
static void selfTest();
|
||||
|
||||
// Print out a hash of the shape of graphp. Only needed to debug the
|
||||
// origin of some nondeterminism; otherwise this is pretty useless.
|
||||
static void hashGraphDebug(const V3Graph* graphp, const char* debugName);
|
||||
|
||||
// Print debug stats about graphp whose nodes must be AbstractMTask's.
|
||||
static void debugMTaskGraphStats(const V3Graph* graphp, const string& name);
|
||||
|
||||
// Operate on the final ExecMTask graph, immediately prior to code
|
||||
// generation time.
|
||||
static void finalize();
|
||||
private:
|
||||
static void finalizeCosts(V3Graph* execMTaskGraphp);
|
||||
static void setupMTaskDeps(V3Graph* mtasksp, const Vx2MTaskMap* vx2mtaskp);
|
||||
|
||||
VL_DEBUG_FUNC; // Declare debug()
|
||||
VL_UNCOPYABLE(V3Partition);
|
||||
};
|
||||
|
||||
//*************************************************************************
|
||||
// Map a pointer into an id, for e.g. nodep to mtask mappings
|
||||
|
||||
class PartPtrIdMap {
|
||||
private:
|
||||
// TYPES
|
||||
typedef vl_unordered_map <const void*, vluint64_t> PtrMap;
|
||||
// MEMBERS
|
||||
mutable vluint64_t m_nextId;
|
||||
mutable PtrMap m_id;
|
||||
public:
|
||||
// CONSTRUCTORS
|
||||
PartPtrIdMap() : m_nextId(0) {}
|
||||
// METHODS
|
||||
vluint64_t findId(const void* ptrp) const {
|
||||
PtrMap::iterator it = m_id.find(ptrp);
|
||||
if (it != m_id.end()) {
|
||||
return it->second;
|
||||
}
|
||||
m_id[ptrp] = m_nextId;
|
||||
return m_nextId++;
|
||||
}
|
||||
};
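A minimal usage sketch (not part of the diff): ids are handed out in first-encounter order, so ordering objects by findId() is repeatable from run to run, unlike ordering by raw pointer values, which depend on allocation addresses.

    // Illustrative only; assumes this header is included.
    int a, b;                            // stand-ins for AST objects
    PartPtrIdMap ids;
    vluint64_t aId = ids.findId(&a);     // 0 - first pointer seen
    vluint64_t bId = ids.findId(&b);     // 1 - second pointer seen
    bool stable = (aId < bId);           // comparison result is deterministic across runs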
|
||||
|
||||
#endif // Guard
|
108  src/V3PartitionGraph.h (new file)
@ -0,0 +1,108 @@
|
||||
// -*- mode: C++; c-file-style: "cc-mode" -*-
|
||||
//*************************************************************************
|
||||
// DESCRIPTION: Verilator: Threading's graph structures
|
||||
//
|
||||
// Code available from: http://www.veripool.org/verilator
|
||||
//
|
||||
//*************************************************************************
|
||||
//
|
||||
// Copyright 2003-2018 by Wilson Snyder. This program is free software; you can
|
||||
// redistribute it and/or modify it under the terms of either the GNU
|
||||
// Lesser General Public License Version 3 or the Perl Artistic License
|
||||
// Version 2.0.
|
||||
//
|
||||
// Verilator is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
//*************************************************************************
|
||||
|
||||
#ifndef _V3PARTITIONGRAPH_H_
|
||||
#define _V3PARTITIONGRAPH_H_
|
||||
|
||||
#include "config_build.h"
|
||||
#include "verilatedos.h"
|
||||
#include <list>
|
||||
|
||||
#include "V3Graph.h"
|
||||
#include "V3OrderGraph.h"
|
||||
|
||||
//*************************************************************************
|
||||
// MTasks and graph structures
|
||||
|
||||
class AbstractMTask : public V3GraphVertex {
|
||||
public:
|
||||
AbstractMTask(V3Graph* graphp) : V3GraphVertex(graphp) {}
|
||||
virtual ~AbstractMTask() {}
|
||||
virtual uint32_t id() const = 0;
|
||||
virtual uint32_t cost() const = 0;
|
||||
};
|
||||
|
||||
class AbstractLogicMTask : public AbstractMTask {
|
||||
public:
|
||||
// TYPES
|
||||
typedef std::list<MTaskMoveVertex*> VxList;
|
||||
// CONSTRUCTORS
|
||||
AbstractLogicMTask(V3Graph* graphp) : AbstractMTask(graphp) {}
|
||||
virtual ~AbstractLogicMTask() {}
|
||||
// METHODS
|
||||
// Set of logic vertices in this mtask. Order is not significant.
|
||||
virtual const VxList* vertexListp() const = 0;
|
||||
virtual uint32_t id() const = 0; // Unique id of this mtask.
|
||||
virtual uint32_t cost() const = 0;
|
||||
};
|
||||
|
||||
class ExecMTask : public AbstractMTask {
|
||||
private:
|
||||
AstMTaskBody* m_bodyp; // Task body
|
||||
uint32_t m_id; // Unique id of this mtask.
|
||||
uint32_t m_priority; // Predicted critical path from the start of
|
||||
// this mtask to the ends of the graph that are reachable from this
|
||||
// mtask. In abstract time units.
|
||||
uint32_t m_cost; // Predicted runtime of this mtask, in the same
|
||||
// abstract time units as priority().
|
||||
uint32_t m_thread; // Thread for static (pack_mtasks) scheduling,
|
||||
// or 0xffffffff if not yet assigned.
|
||||
const ExecMTask* m_packNextp; // Next for static (pack_mtasks) scheduling
|
||||
bool m_threadRoot; // Is root thread
|
||||
VL_UNCOPYABLE(ExecMTask);
|
||||
public:
|
||||
ExecMTask(V3Graph* graphp, AstMTaskBody* bodyp, uint32_t id)
|
||||
: AbstractMTask(graphp),
|
||||
m_bodyp(bodyp),
|
||||
m_id(id),
|
||||
m_priority(0),
|
||||
m_cost(0),
|
||||
m_thread(0xffffffff),
|
||||
m_packNextp(NULL),
|
||||
m_threadRoot(false) {}
|
||||
AstMTaskBody* bodyp() const { return m_bodyp; }
|
||||
virtual uint32_t id() const { return m_id; }
|
||||
uint32_t priority() const { return m_priority; }
|
||||
void priority(uint32_t pri) { m_priority = pri; }
|
||||
virtual uint32_t cost() const { return m_cost; }
|
||||
void cost(uint32_t cost) { m_cost = cost; }
|
||||
void thread(uint32_t thread) { m_thread = thread; }
|
||||
uint32_t thread() const { return m_thread; }
|
||||
void packNextp(const ExecMTask* nextp) { m_packNextp = nextp; }
|
||||
const ExecMTask* packNextp() const { return m_packNextp; }
|
||||
bool threadRoot() const { return m_threadRoot; }
|
||||
void threadRoot(bool threadRoot) { m_threadRoot = threadRoot; }
|
||||
string cFuncName() const {
|
||||
// If this MTask maps to a C function, this should be the name
|
||||
return string("__Vmtask")+"__"+cvtToStr(m_id);
|
||||
}
|
||||
string name() const { return string("mt")+cvtToStr(id()); }
|
||||
void dump(std::ostream& str) const {
|
||||
str <<name()<<"."<<((void*)this);
|
||||
if (priority() || cost()) str <<" [pr="<<priority()<<" c="<<cvtToStr(cost())<<"]";
|
||||
if (thread() != 0xffffffff) str <<" th="<<thread();
|
||||
if (threadRoot()) str <<" [ROOT]";
|
||||
if (packNextp()) str <<" nx="<<packNextp()->name();
|
||||
}
|
||||
};
|
||||
inline std::ostream& operator<<(std::ostream& os, const ExecMTask& rhs) {
|
||||
rhs.dump(os); return os; }
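The priority() field above is documented as the predicted critical path from an mtask to the reachable ends of the graph. A simplified sketch (illustrative only, with stand-in types; the real computation presumably lives in the suppressed V3Partition.cpp) of how such a value can be filled in by walking the graph in reverse topological order:

    #include <stdint.h>
    #include <algorithm>
    #include <vector>

    struct MTaskSketch {
        uint32_t cost;                       // predicted runtime of this mtask
        uint32_t priority;                   // critical path to a graph end
        std::vector<MTaskSketch*> outs;      // downstream dependent mtasks
        MTaskSketch() : cost(0), priority(0) {}
    };

    // 'order' must hold the mtasks in reverse topological order (sinks first).
    void computePriorities(const std::vector<MTaskSketch*>& order) {
        for (size_t i = 0; i < order.size(); ++i) {
            MTaskSketch* mtp = order[i];
            uint32_t downstream = 0;
            for (size_t j = 0; j < mtp->outs.size(); ++j)
                downstream = std::max(downstream, mtp->outs[j]->priority);
            mtp->priority = mtp->cost + downstream;
        }
    }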
|
||||
|
||||
#endif // Guard
|
@ -182,6 +182,7 @@ private:
|
||||
AstNode* m_chgSubParentp;// Which node has call to m_chgSubFuncp
|
||||
int m_chgSubStmts; // Statements under function being built
|
||||
AstVarScope* m_activityVscp; // Activity variable
|
||||
uint32_t m_activityNumber; // Count of fields in activity variable
|
||||
uint32_t m_code; // Trace ident code# being assigned
|
||||
V3Graph m_graph; // Var/CFunc tracking
|
||||
TraceActivityVertex* m_alwaysVtxp; // "Always trace" vertex
|
||||
@ -297,7 +298,7 @@ private:
|
||||
|
||||
void assignActivity() {
|
||||
// Select activity numbers and put into each CFunc vertex
|
||||
uint32_t activityNumber = 1; // Note 0 indicates "slow"
|
||||
m_activityNumber = 1; // Note 0 indicates "slow"
|
||||
for (V3GraphVertex* itp = m_graph.verticesBeginp(); itp; itp=itp->verticesNextp()) {
|
||||
if (TraceActivityVertex* vvertexp = dynamic_cast<TraceActivityVertex*>(itp)) {
|
||||
if (!vvertexp->activityCodeValid()) {
|
||||
@ -306,17 +307,39 @@ private:
|
||||
// This makes us need less activityNumbers and so speeds up the fast path.
|
||||
vvertexp->activityCode(TraceActivityVertex::ACTIVITY_SLOW);
|
||||
} else {
|
||||
vvertexp->activityCode(activityNumber++);
|
||||
vvertexp->activityCode(m_activityNumber++);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Insert global variable
|
||||
if (!activityNumber) activityNumber++; // For simplicity, always create it
|
||||
int activityBits = VL_WORDS_I(activityNumber)*VL_WORDSIZE; // For tighter code; round to next 32 bit point.
|
||||
AstVar* newvarp = new AstVar (m_chgFuncp->fileline(), AstVarType::MODULETEMP,
|
||||
"__Vm_traceActivity", VFlagBitPacked(), activityBits);
|
||||
AstVar* newvarp;
|
||||
if (v3Global.opt.mtasks()) {
|
||||
// Create a vector of bytes, not bits, for the tracing vector,
|
||||
// so that we can set them atomically without locking.
|
||||
//
|
||||
// TODO: It would be slightly faster to have a bit vector per
|
||||
// chain of packed MTasks, but we haven't packed the MTasks yet.
|
||||
// If we support fully threaded tracing in the future, it would
|
||||
// make sense to improve this at that time.
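As an aside, a tiny sketch (illustrative only, not part of this commit) of the data-race argument behind the byte-per-flag choice:

    #include <stdint.h>

    uint8_t  activityBytes[64];   // one byte per flag: each thread stores only its own entry
    uint32_t activityBits = 0;    // packed bits: setting one bit is a read-modify-write

    void setByteFlag(unsigned code) { activityBytes[code] = 1; }       // plain store, no lock needed
    void setBitFlag(unsigned code)  { activityBits |= (1u << code); }  // racy if two threads collide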
|
||||
AstNodeDType* newScalarDtp
|
||||
= new AstBasicDType(m_chgFuncp->fileline(), VFlagLogicPacked(), 1);
|
||||
v3Global.rootp()->typeTablep()->addTypesp(newScalarDtp);
|
||||
AstNodeDType* newArrDtp = new AstUnpackArrayDType(
|
||||
m_chgFuncp->fileline(),
|
||||
newScalarDtp,
|
||||
new AstRange(m_chgFuncp->fileline(),
|
||||
VNumRange(m_activityNumber-1, 0, false)));
|
||||
v3Global.rootp()->typeTablep()->addTypesp(newArrDtp);
|
||||
newvarp = new AstVar(m_chgFuncp->fileline(),
|
||||
AstVarType::MODULETEMP,
|
||||
"__Vm_traceActivity", newArrDtp);
|
||||
} else {
|
||||
// For tighter code; round to next 32 bit point.
|
||||
int activityBits = VL_WORDS_I(m_activityNumber)*VL_WORDSIZE;
|
||||
newvarp = new AstVar(m_chgFuncp->fileline(), AstVarType::MODULETEMP,
|
||||
"__Vm_traceActivity", VFlagBitPacked(), activityBits);
|
||||
}
|
||||
m_topModp->addStmtp(newvarp);
|
||||
AstVarScope* newvscp = new AstVarScope(newvarp->fileline(), m_highScopep, newvarp);
|
||||
m_highScopep->addVarp(newvscp);
|
||||
@ -329,15 +352,23 @@ private:
|
||||
FileLine* fl = vvertexp->insertp()->fileline();
|
||||
uint32_t acode = vvertexp->activityCode();
|
||||
vvertexp->insertp()->addNextHere
|
||||
(new AstAssign (fl,
|
||||
new AstSel (fl, new AstVarRef(fl, m_activityVscp, true),
|
||||
acode, 1),
|
||||
new AstConst (fl, AstConst::LogicTrue())));
|
||||
(new AstAssign(fl, selectActivity(fl, acode, true),
|
||||
new AstConst(fl, AstConst::LogicTrue())));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
AstNode* selectActivity(FileLine* flp, uint32_t acode, bool lvalue) {
|
||||
if (v3Global.opt.mtasks()) {
|
||||
return new AstArraySel(
|
||||
flp, new AstVarRef(flp, m_activityVscp, lvalue), acode);
|
||||
} else {
|
||||
return new AstSel(
|
||||
flp, new AstVarRef(flp, m_activityVscp, lvalue), acode, 1);
|
||||
}
|
||||
}
|
||||
|
||||
AstCFunc* newCFunc(AstCFuncType type, const string& name, AstCFunc* basep) {
|
||||
AstCFunc* funcp = new AstCFunc(basep->fileline(), name, basep->scopep());
|
||||
funcp->slow(basep->slow());
|
||||
@ -453,8 +484,7 @@ private:
|
||||
AstNode* condp = NULL;
|
||||
for (ActCodeSet::const_iterator csit = actset.begin(); csit!=actset.end(); ++csit) {
|
||||
uint32_t acode = *csit;
|
||||
AstNode* selp = new AstSel (fl, new AstVarRef(fl, m_activityVscp, false),
|
||||
acode, 1);
|
||||
AstNode* selp = selectActivity(fl, acode, false);
|
||||
if (condp) condp = new AstOr (fl, condp, selp);
|
||||
else condp = selp;
|
||||
}
|
||||
@ -473,11 +503,19 @@ private:
|
||||
|
||||
// Clear activity after tracing completes
|
||||
FileLine* fl = m_chgFuncp->fileline();
|
||||
AstNode* clrp = new AstAssign (fl,
|
||||
new AstVarRef(fl, m_activityVscp, true),
|
||||
new AstConst(fl, V3Number(fl, m_activityVscp->width())));
|
||||
m_fullFuncp->addFinalsp(clrp->cloneTree(true));
|
||||
m_chgFuncp->addFinalsp(clrp);
|
||||
if (v3Global.opt.mtasks()) {
|
||||
for (uint32_t i = 0; i < m_activityNumber; ++i) {
|
||||
AstNode* clrp = new AstAssign(fl, selectActivity(fl, i, true),
|
||||
new AstConst(fl, AstConst::LogicFalse()));
|
||||
m_fullFuncp->addFinalsp(clrp->cloneTree(true));
|
||||
m_chgFuncp->addFinalsp(clrp);
|
||||
}
|
||||
} else {
|
||||
AstNode* clrp = new AstAssign(fl, new AstVarRef(fl, m_activityVscp, true),
|
||||
new AstConst(fl, V3Number(fl, m_activityVscp->width())));
|
||||
m_fullFuncp->addFinalsp(clrp->cloneTree(true));
|
||||
m_chgFuncp->addFinalsp(clrp);
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t assignDeclCode(AstTraceDecl* nodep) {
|
||||
@ -699,6 +737,7 @@ public:
|
||||
m_chgSubFuncp = NULL;
|
||||
m_chgSubParentp = NULL;
|
||||
m_chgSubStmts = 0;
|
||||
m_activityNumber = 0;
|
||||
m_code = 0;
|
||||
m_finding = false;
|
||||
m_funcNum = 0;
|
||||
|
@ -73,6 +73,7 @@
|
||||
#include "V3Param.h"
|
||||
#include "V3Parse.h"
|
||||
#include "V3ParseSym.h"
|
||||
#include "V3Partition.h"
|
||||
#include "V3PreShell.h"
|
||||
#include "V3Premit.h"
|
||||
#include "V3Reloop.h"
|
||||
@ -524,6 +525,14 @@ void process () {
|
||||
V3EmitC::emitcSyms();
|
||||
V3EmitC::emitcTrace();
|
||||
}
|
||||
if (!v3Global.opt.xmlOnly()
|
||||
&& v3Global.opt.mtasks()) {
|
||||
// Finalize our MTask cost estimates and pack the mtasks into
|
||||
// threads. Must happen pre-EmitC which relies on the packing
|
||||
// order. Must happen post-V3LifePost which changes the relative
|
||||
// costs of mtasks.
|
||||
V3Partition::finalize();
|
||||
}
|
||||
if (!v3Global.opt.xmlOnly()) { // Unfortunately we have some lint checks in emitc.
|
||||
V3EmitC::emitc();
|
||||
}
|
||||
@ -607,8 +616,11 @@ int main(int argc, char** argv, char** env) {
|
||||
VHashSha1::selfTest();
|
||||
AstBasicDTypeKwd::selfTest();
|
||||
V3Graph::selfTest();
|
||||
V3TSP::selfTest();
|
||||
V3ScoreboardBase::selfTest();
|
||||
if (v3Global.opt.debugSelfTest()) {
|
||||
V3TSP::selfTest();
|
||||
V3ScoreboardBase::selfTest();
|
||||
V3Partition::selfTest();
|
||||
}
|
||||
|
||||
// Read first filename
|
||||
v3Global.readFiles();
|
||||
|
@ -44,7 +44,7 @@ endif
|
||||
|
||||
.PHONY: test
|
||||
test:
|
||||
$(PERL) driver.pl $(DRIVER_FLAGS) --vlt --dist
|
||||
$(PERL) driver.pl $(DRIVER_FLAGS) --vlt --vltmt --dist
|
||||
|
||||
######################################################################
|
||||
|
||||
@ -61,6 +61,9 @@ nc:
|
||||
vlt:
|
||||
$(PERL) driver.pl $(DRIVER_FLAGS) --vlt --stop
|
||||
|
||||
vltmt:
|
||||
$(PERL) driver.pl $(DRIVER_FLAGS) --vltmt --stop
|
||||
|
||||
######################################################################
|
||||
|
||||
random:
|
||||
|
@ -45,6 +45,7 @@ our %All_Scenarios
|
||||
nc => ["simulator", "nc"],
|
||||
vcs => ["simulator", "vcs"],
|
||||
vlt => ["simulator", "vlt_all", "vlt"],
|
||||
vltmt => ["simulator", "vlt_all", "vltmt"],
|
||||
);
|
||||
|
||||
#======================================================================
|
||||
@ -104,6 +105,7 @@ if (! GetOptions (
|
||||
"ms!" => sub { $opt_scenarios{ms} = $_[1]; },
|
||||
"nc!" => sub { $opt_scenarios{nc} = $_[1]; },
|
||||
"vlt!" => sub { $opt_scenarios{vlt} = $_[1]; },
|
||||
"vltmt!" => sub { $opt_scenarios{vltmt} = $_[1]; },
|
||||
"vcs!" => sub { $opt_scenarios{vcs} = $_[1]; },
|
||||
"<>" => \¶meter,
|
||||
)) {
|
||||
@ -322,6 +324,7 @@ sub new {
|
||||
$self->{scenario} ||= "ghdl" if $self->{ghdl};
|
||||
$self->{scenario} ||= "vcs" if $self->{vcs};
|
||||
$self->{scenario} ||= "vlt" if $self->{vlt};
|
||||
$self->{scenario} ||= "vltmt" if $self->{vltmt};
|
||||
$self->{scenario} ||= "nc" if $self->{nc};
|
||||
$self->{scenario} ||= "ms" if $self->{ms};
|
||||
$self->{scenario} ||= "iv" if $self->{iv};
|
||||
@ -407,6 +410,7 @@ sub new {
|
||||
ms_run_flags => [split(/\s+/,"-lib $self->{obj_dir}/work -c -do 'run -all;quit' ")],
|
||||
# Verilator
|
||||
vlt => 0,
|
||||
vltmt => 0,
|
||||
verilator_flags => ["-cc",
|
||||
"-Mdir $self->{obj_dir}",
|
||||
"-OD", # As currently disabled unless -O3
|
||||
@ -420,7 +424,7 @@ sub new {
|
||||
%$self};
|
||||
bless $self, $class;
|
||||
|
||||
$self->{vlt_all} = $self->{vlt}; # Any Verilator scenario
|
||||
$self->{vlt_all} = $self->{vlt} || $self->{vltmt}; # Any Verilator scenario
|
||||
|
||||
$self->{VM_PREFIX} ||= "V".$self->{name};
|
||||
$self->{stats} ||= "$self->{obj_dir}/V".$self->{name}."__stats.txt";
|
||||
@ -593,6 +597,8 @@ sub compile_vlt_flags {
|
||||
unshift @verilator_flags, "--gdbbt" if $opt_gdbbt;
|
||||
unshift @verilator_flags, "--x-assign unique"; # More likely to be buggy
|
||||
unshift @verilator_flags, "--trace" if $opt_trace;
|
||||
unshift @verilator_flags, "--threads 3" if $param{vltmt};
|
||||
unshift @verilator_flags, "--debug-partition" if $param{vltmt};
|
||||
if (defined $opt_optimize) {
|
||||
my $letters = "";
|
||||
if ($opt_optimize =~ /[a-zA-Z]/) {
|
||||
@ -746,6 +752,11 @@ sub compile {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if ($self->{vltmt} && !$self->cfg_with_threaded) {
|
||||
$self->skip("Test requires Verilator configured with threads\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!$param{fails} && $param{verilator_make_gcc}
|
||||
&& $param{make_main}) {
|
||||
$self->_make_main();
|
||||
@ -2045,7 +2056,11 @@ Run Synopsys VCS simulator tests.
|
||||
|
||||
=item --vlt
|
||||
|
||||
Run Verilator tests. Default unless another scenario flag is provided.
|
||||
Run Verilator tests in single-threaded mode. Default unless another scenario flag is provided.
|
||||
|
||||
=item --vltmt
|
||||
|
||||
Run Verilator tests in multithreaded mode.
|
||||
|
||||
=back
|
||||
|
||||
|
22  test_regress/t/t_a_selftest.pl (new executable file)
@ -0,0 +1,22 @@
|
||||
#!/usr/bin/perl
|
||||
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
|
||||
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
|
||||
#
|
||||
# Copyright 2003 by Wilson Snyder. This program is free software; you can
|
||||
# redistribute it and/or modify it under the terms of either the GNU
|
||||
# Lesser General Public License Version 3 or the Perl Artistic License
|
||||
# Version 2.0.
|
||||
|
||||
scenarios(vlt_all => 1);
|
||||
|
||||
top_filename("t/t_EXAMPLE.v");
|
||||
|
||||
compile(
|
||||
verilator_flags2 => ['--debug-self-test'],
|
||||
verilator_make_gcc => 0,
|
||||
make_top_shell => 0,
|
||||
make_main => 0,
|
||||
);
|
||||
|
||||
ok(1);
|
||||
1;
|
@ -15,7 +15,8 @@ compile(
|
||||
|
||||
if ($Self->{vlt_all}) {
|
||||
file_grep ($Self->{stats}, qr/Optimizations, Tables created\s+(\d+)/i, 10);
|
||||
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i, 8);
|
||||
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i,
|
||||
($Self->{vltmt} ? 0 : 8));
|
||||
}
|
||||
|
||||
execute(
|
||||
|
21  test_regress/t/t_dpi_threads.pl (new executable file)
@ -0,0 +1,21 @@
|
||||
#!/usr/bin/perl
|
||||
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
|
||||
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
|
||||
#
|
||||
# Copyright 2018 by Wilson Snyder. This program is free software; you can
|
||||
# redistribute it and/or modify it under the terms of either the GNU
|
||||
# Lesser General Public License Version 3 or the Perl Artistic License
|
||||
# Version 2.0.
|
||||
|
||||
scenarios(vltmt => 1);
|
||||
|
||||
compile(
|
||||
v_flags2 => ["t/t_dpi_threads_c.cpp --no-threads-coarsen"],
|
||||
);
|
||||
|
||||
execute(
|
||||
check_finished => 1,
|
||||
);
|
||||
|
||||
ok(1);
|
||||
1;
|
62  test_regress/t/t_dpi_threads.v (new file)
@ -0,0 +1,62 @@
|
||||
// DESCRIPTION: Verilator: Verilog Test module
|
||||
//
|
||||
// Copyright 2018 by Wilson Snyder. This program is free software; you can
|
||||
// redistribute it and/or modify it under the terms of either the GNU
|
||||
// Lesser General Public License Version 3 or the Perl Artistic License
|
||||
// Version 2.0.
|
||||
|
||||
import "DPI-C" dpii_sys_task = function void \$dpii_sys ();
|
||||
import "DPI-C" dpii_failure = function int \$dpii_failure ();
|
||||
|
||||
module t (clk);
|
||||
input clk;
|
||||
integer cyc;
|
||||
integer failure;
|
||||
|
||||
initial cyc = 0;
|
||||
|
||||
`ifndef verilator
|
||||
`error "Only Verilator supports PLI-ish DPI calls."
|
||||
`endif
|
||||
|
||||
always @ (posedge clk) begin
|
||||
if (cyc == 2) begin
|
||||
failure = $dpii_failure();
|
||||
$write("* failure = %0d\n", failure);
|
||||
if (failure > 0) begin
|
||||
$stop;
|
||||
end
|
||||
$write("*-* All Finished *-*\n");
|
||||
$finish;
|
||||
end
|
||||
cyc <= cyc + 1;
|
||||
end
|
||||
|
||||
// The purpose of this test is to confirm that the DPI-call serialization
|
||||
// code in V3Partition does ensure that these DPI calls do not run
|
||||
// concurrently.
|
||||
//
|
||||
// Alternatively, the test may be run with "--threads-dpi all" in which case
|
||||
// it should confirm that the calls do run concurrently and do detect a
|
||||
// collision (they should, if the test is set up right.) This is
|
||||
// t_dpi_threads_collide.pl.
|
||||
//
|
||||
// Q) Is it a risk that the partitioner will merge or serialize these always
|
||||
// blocks, just by luck, even if the DPI-call serialization code fails?
|
||||
//
|
||||
// A) Yes, that's why t_dpi_threads_collide.pl also passes
|
||||
   // --no-threads-coarsen to disable MTask coarsening. This ensures that
|
||||
// the MTask graph at the end of FixDataHazards (where we resolve DPI
|
||||
// hazards) is basically the final MTasks graph, and that data hazards
|
||||
// which persist beyond FixDataHazards should persist in the final
|
||||
// generated C code.
|
||||
|
||||
always @ (posedge clk) begin
|
||||
$dpii_sys();
|
||||
end
|
||||
|
||||
always @ (posedge clk) begin
|
||||
$dpii_sys();
|
||||
end
|
||||
|
||||
endmodule
|
78  test_regress/t/t_dpi_threads_c.cpp (new file)
@ -0,0 +1,78 @@
|
||||
// -*- mode: C++; c-file-style: "cc-mode" -*-
|
||||
//*************************************************************************
|
||||
//
|
||||
// Copyright 2018-2018 by Wilson Snyder. This program is free software; you can
|
||||
// redistribute it and/or modify it under the terms of either the GNU
|
||||
// Lesser General Public License Version 3 or the Perl Artistic License.
|
||||
// Version 2.0.
|
||||
//
|
||||
// Verilator is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
//*************************************************************************
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
#include <unistd.h>
|
||||
#include "svdpi.h"
|
||||
|
||||
//======================================================================
|
||||
|
||||
#if defined(VERILATOR)
|
||||
# ifdef T_DPI_THREADS_COLLIDE
|
||||
# include "Vt_dpi_threads_collide__Dpi.h"
|
||||
# else
|
||||
# include "Vt_dpi_threads__Dpi.h"
|
||||
# endif
|
||||
#elif defined(VCS)
|
||||
# include "../vc_hdrs.h"
|
||||
#elif defined(CADENCE)
|
||||
# define NEED_EXTERNS
|
||||
#else
|
||||
# error "Unknown simulator for DPI test"
|
||||
#endif
|
||||
|
||||
#ifdef NEED_EXTERNS
|
||||
extern "C" {
|
||||
extern void dpii_sys_task();
|
||||
extern int dpii_failure();
|
||||
}
|
||||
#endif
|
||||
|
||||
//======================================================================
|
||||
|
||||
struct state {
|
||||
std::atomic<bool> task_is_running;
|
||||
std::atomic<int> failure;
|
||||
state() : task_is_running(false)
|
||||
, failure(false) {}
|
||||
};
|
||||
|
||||
static state st;
|
||||
|
||||
void dpii_sys_task() {
|
||||
bool other_task_running = atomic_exchange(&st.task_is_running, true);
|
||||
if (other_task_running) {
|
||||
// Another task is running. This is a collision.
|
||||
st.failure = 1;
|
||||
std::cerr << "t_dpi_threads_c.cpp dpii_sys_task() saw threads collide.\n";
|
||||
} else {
|
||||
std::cerr << "t_dpi_threads_c.cpp dpii_sys_task() no collision. @" << &st.task_is_running << "\n";
|
||||
}
|
||||
|
||||
// Spend some time in the DPI call, so that if we can have a collision
|
||||
// we probably will. Technically this is not guaranteed to detect every
|
||||
// race. However, one second is so much greater than the expected
|
||||
// runtime of everything else in the test, it really should pick up on
|
||||
// races just about all of the time.
|
||||
sleep(1);
|
||||
|
||||
atomic_exchange(&st.task_is_running, false);
|
||||
}
|
||||
|
||||
int dpii_failure() {
|
||||
return st.failure;
|
||||
}
|
28  test_regress/t/t_dpi_threads_collide.pl (new executable file)
@ -0,0 +1,28 @@
|
||||
#!/usr/bin/perl
|
||||
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
|
||||
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
|
||||
#
|
||||
# Copyright 2018 by Wilson Snyder. This program is free software; you can
|
||||
# redistribute it and/or modify it under the terms of either the GNU
|
||||
# Lesser General Public License Version 3 or the Perl Artistic License
|
||||
# Version 2.0.
|
||||
|
||||
scenarios(vltmt => 1);
|
||||
|
||||
top_filename("t/t_dpi_threads.v");
|
||||
|
||||
compile(
|
||||
v_flags2 => ["t/t_dpi_threads_c.cpp --threads-dpi all --no-threads-coarsen"],
|
||||
);
|
||||
|
||||
# Similar to t_dpi_threads, which confirms that Verilator can prevent a
|
||||
# race between DPI import calls, this test confirms that the race exists
|
||||
# and that the DPI C code can detect it under --threads-dpi all
|
||||
# mode.
|
||||
#
|
||||
execute(
|
||||
fails => 1,
|
||||
);
|
||||
|
||||
ok(1);
|
||||
1;
|
@ -43,7 +43,10 @@ gen($Self->{top_filename}, 6000);
|
||||
compile(
|
||||
verilator_flags2=>["-x-assign fast --x-initial fast",
|
||||
"-Wno-UNOPTTHREADS",
|
||||
],
|
||||
# The slow V3Partition asserts are just too slow
|
||||
# in this test. They're disabled just for performance
|
||||
# reasons:
|
||||
"--no-debug-partition"],
|
||||
);
|
||||
|
||||
execute(
|
||||
|
74  test_regress/t/t_gantt.pl (new executable file)
@ -0,0 +1,74 @@
|
||||
#!/usr/bin/perl
|
||||
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
|
||||
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
|
||||
#
|
||||
# Copyright 2003 by Wilson Snyder. This program is free software; you can
|
||||
# redistribute it and/or modify it under the terms of either the GNU
|
||||
# Lesser General Public License Version 3 or the Perl Artistic License
|
||||
# Version 2.0.
|
||||
|
||||
use IO::File;
|
||||
|
||||
# Test for bin/verilator_gantt,
|
||||
#
|
||||
# Only needed in multithreaded regression.
|
||||
scenarios(vltmt => 1);
|
||||
|
||||
# It doesn't really matter what test
|
||||
# we use, so long as it runs several cycles,
|
||||
# enough for the profiling to happen:
|
||||
top_filename("t/t_gen_alw.v");
|
||||
|
||||
compile(
|
||||
v_flags2 => ["--prof-threads"]
|
||||
);
|
||||
|
||||
execute(
|
||||
all_run_flags => ["+verilator+prof+threads+start+2",
|
||||
" +verilator+prof+threads+window+2",
|
||||
" +verilator+prof+threads+file+$Self->{obj_dir}/profile_threads.dat",
|
||||
],
|
||||
check_finished => 1,
|
||||
);
|
||||
|
||||
# For now, verilator_gantt still reads from STDIN
|
||||
# (probably it should take a file, gantt.dat like verilator_profcfunc)
|
||||
# The profiling data still goes direct to the runtime's STDOUT
|
||||
# (maybe that should go to a separate file - gantt.dat?)
|
||||
run(cmd => ["$ENV{VERILATOR_ROOT}/bin/verilator_gantt",
|
||||
"$Self->{obj_dir}/profile_threads.dat",
|
||||
"--vcd $Self->{obj_dir}/profile_threads.vcd",
|
||||
"> $Self->{obj_dir}/gantt.log"]);
|
||||
|
||||
# We should have three lines of gantt chart, each with
|
||||
# an even number of mtask-bars (eg "[123--]")
|
||||
my $gantt_line_ct = 0;
|
||||
my $global_mtask_ct = 0;
|
||||
{
|
||||
my $fh = IO::File->new("<$Self->{obj_dir}/gantt.log")
|
||||
or error("$! $Self->{obj_dir}/gantt.log");
|
||||
while (my $line = ($fh && $fh->getline)) {
|
||||
if ($line !~ m/^ t:/) { next; }
|
||||
$gantt_line_ct++;
|
||||
my $this_thread_mtask_ct = 0;
|
||||
my @mtasks = split(/\[/, $line);
|
||||
shift @mtasks; # throw the '>> ' away
|
||||
foreach my $mtask (@mtasks) {
|
||||
        # Format of each mtask is "[123--]" where the hyphens,
|
||||
        # number, or ] may or may not appear; it depends on exact timing.
|
||||
$this_thread_mtask_ct++;
|
||||
$global_mtask_ct++;
|
||||
}
|
||||
if ($this_thread_mtask_ct % 2 != 0) { error("odd number of mtasks found"); }
|
||||
}
|
||||
}
|
||||
if ($gantt_line_ct != 3) { error("wrong number of gantt lines"); }
|
||||
if ($global_mtask_ct == 0) { error("wrong number of mtasks, should be > 0"); }
|
||||
print "Found $gantt_line_ct lines of gantt data with $global_mtask_ct mtasks\n"
|
||||
if $Self->{verbose};
|
||||
|
||||
# Diff to itself, just to check parsing
|
||||
vcd_identical("$Self->{obj_dir}/profile_threads.vcd", "$Self->{obj_dir}/profile_threads.vcd");
|
||||
|
||||
ok(1);
|
||||
1;
|
@ -117,6 +117,10 @@ compile(
|
||||
);
|
||||
|
||||
execute(
|
||||
all_run_flags => ["+verilator+prof+threads+start+100",
|
||||
" +verilator+prof+threads+window+2",
|
||||
" +verilator+prof+threads+file+$Self->{obj_dir}/profile_threads.dat",
|
||||
],
|
||||
check_finished => 1,
|
||||
);
|
||||
|
||||
|
@ -13,6 +13,7 @@ foreach my $prog (
|
||||
"../bin/verilator",
|
||||
"../bin/verilator_coverage",
|
||||
"../bin/verilator_difftree",
|
||||
"../bin/verilator_gantt",
|
||||
"../bin/verilator_profcfunc",
|
||||
) {
|
||||
run(fails => 1,
|
||||
|
@ -38,7 +38,8 @@ sub checkRelativeRefs {
|
||||
if ($Self->{vlt_all}) {
|
||||
# We expect to combine sequent functions across multiple instances of
|
||||
# l2, l3, l4, l5. If this number drops, please confirm this has not broken.
|
||||
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i, 52);
|
||||
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i,
|
||||
($Self->{vltmt} ? 84 : 52));
|
||||
|
||||
# Expect absolute refs in CFuncs for t (top module) and l1 (because it
|
||||
# has only one instance)

@ -18,7 +18,8 @@ compile(
if ($Self->{vlt_all}) {
    # Fewer optimizations than t_inst_tree_inl0_pub1 which allows
    # relative CFuncs:
    file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i, 31);
    file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i,
               ($Self->{vltmt} ? 0 : 31));

    # Should not find any 'this->' except some 'this->__VlSymsp'
    my @files = `ls $Self->{obj_dir}/*.cpp`;
@ -7,8 +7,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.

scenarios(simulator => 1);
$Self->cfg_with_threaded or skip("No thread support");
scenarios(vltmt => 1);

top_filename("t/t_threads_counter.v");


@ -7,8 +7,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.

scenarios(simulator => 1);
$Self->cfg_with_threaded or skip("No thread support");
scenarios(vltmt => 1);

top_filename("t/t_threads_counter.v");

23
test_regress/t/t_threads_counter_4.pl
Executable file
@ -0,0 +1,23 @@
#!/usr/bin/perl
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2003-2009 by Wilson Snyder. This program is free software; you can
# redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.

scenarios(vltmt => 1);

top_filename("t/t_threads_counter.v");

compile(
    verilator_flags2 => ['--cc --threads 4'],
    );

execute(
    check_finished => 1,
    );

ok(1);
1;
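
A hypothetical parameterized variant of this driver, varying only the --threads value, might look like the sketch below; the existing tests keep one fixed value per file, so this is illustration only.

    # Sketch only, not in the tree; reuses the compile()/execute() calls above.
    my $threads = 4;
    compile(verilator_flags2 => ["--cc --threads $threads"]);
    execute(check_finished => 1);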
25
test_regress/t/t_threads_nondeterminism.pl
Executable file
@ -0,0 +1,25 @@
#!/usr/bin/perl
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2003-2009 by Wilson Snyder. This program is free software; you can
# redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.

scenarios(vltmt => 1);

top_filename("t/t_threads_counter.v");

compile(
    verilator_flags2 => ['--cc --threads 2 --debug-nondeterminism'],
    );

execute(
    check_finished => 1,
    );

file_grep("$Self->{obj_dir}/vlt_compile.log", qr/hash of shape/i);

ok(1);
1;

@ -13,7 +13,12 @@ my $root = "..";

compile(
    # Can't use --coverage and --savable together, so cheat and compile inline
    verilator_flags2 => ['--cc --coverage-toggle --coverage-line --coverage-user --trace --vpi $root/include/verilated_save.cpp'],
    verilator_flags2 => ["--cc",
                         "--coverage-toggle --coverage-line --coverage-user",
                         "--trace --vpi ",
                         ($Self->cfg_with_threaded
                          ? "--threads 2 $root/include/verilated_threads.cpp" : ""),
                         "$root/include/verilated_save.cpp"],
    );

execute(

@ -43,7 +48,8 @@ foreach my $dfile (glob("$Self->{obj_dir}/*.d")) {

foreach my $file (sort keys %hit) {
    if (!$hit{$file}
        && $file !~ /_sc/) {
        && $file !~ /_sc/
        && ($file !~ /_thread/ || $Self->cfg_with_threaded)) {
        error("Include file not covered by t_verilated_all test: ",$file);
    }
}
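
For context, a rough sketch of how a %hit map like the one used above could be built from the generated .d dependency files; the glob comes from the hunk header, but the regexp and parsing below are assumptions, not the test's actual code.

    # Assumed sketch: mark every include/verilated* file named in the make
    # dependency (.d) files as covered by the build.
    my %hit;
    foreach my $dfile (glob("$Self->{obj_dir}/*.d")) {
        my $fh = IO::File->new("<$dfile") or next;
        while (my $line = $fh->getline) {
            foreach my $dep (split(/\s+/, $line)) {
                if ($dep =~ m!include/(verilated[^/]*)$!) { $hit{$1} = 1; }
            }
        }
    }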

@ -7,8 +7,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.

scenarios(simulator => 1);
$Self->cfg_with_threaded or skip("No thread support");
scenarios(vltmt => 1);

top_filename("t/t_verilated_all.v");