#!/usr/bin/env perl
###############################################################################
## ----------------------------------------------------------------------------
## Egrep script (Perl implementation) similar to the egrep binary.
## Look at bin/mce_grep for a wrapper script around the grep binary.
##
## This script supports egrep's options [ceHhiLlmnqRrsv]. The main focus is
## demonstrating Many-core Engine for Perl. Use this script against large
## file(s).
##
## This script was created to show how output order can be preserved even
## though there are only 4 shared socket pairs in MCE no matter the number
## of workers.
##
## Which to choose (examples/egrep.pl or bin/mce_grep).
##
## Examples/egrep.pl is a pure Perl implementation with less options.
## Bin/mce_grep is a wrapper script for the relevant binary.
##
## The wrapper script is good for expensive pattern matching -- especially
## for agrep and tre-agrep. It also supports more options due to being
## passed to the binary. The wrapper supports 2 levels of chunking via the
## --chunk-level={auto|file|list} option. For large files, choose file.
##
## The usage description was largely ripped off from the egrep man page.
##
###############################################################################
use strict;
use warnings;
use Cwd qw(abs_path);
use lib abs_path . "/../lib";
my ($prog_name, $prog_dir);
BEGIN {
$prog_name = $0;
$prog_name =~ s{^.*[\\/]}{}g;
$prog_dir = abs_path($0);
$prog_dir =~ s{[\\/][^\\/]*$}{};
$ENV{PATH} .= ($^O eq 'MSWin32' ? ';' : ':') . $prog_dir;
}
sub INIT {
## Provide file globbing support under Windows similar to Unix.
@ARGV = <@ARGV> if ($^O eq 'MSWin32');
}
use Scalar::Util qw( looks_like_number );
use MCE;
###############################################################################
## ----------------------------------------------------------------------------
## Display usage and exit.
##
###############################################################################
sub usage {
my $exit_status = $_[0] || 0;
print <<"::_USAGE_BLOCK_END_::";
Options for Many-core Engine:
--max-workers=NUM override max workers (default 6)
e.g. auto, auto-2, 4
--chunk-size=NUM[KM] override chunk size (default 4M)
minimum: 200K; maximum: 20M
Usage: $prog_name [OPTION]... PATTERN [FILE] ...
Search for PATTERN in each FILE or standard input.
Example: $prog_name -i 'hello world' menu.h main.c
Regexp selection and interpretation:
-e, --regexp=PATTERN use PATTERN as a regular expression
-i, --ignore-case ignore case distinctions
Miscellaneous:
-s, --no-messages suppress error messages
-v, --invert-match select non-matching lines
--help display this help and exit
Output control:
-m, --max-count=NUM stop after NUM matches
-n, --line-number print line number with output lines
-H, --with-filename print the filename for each match
-h, --no-filename suppress the prefixing filename on output
-q, --quiet, --silent suppress all normal output
-R, -r, --recursive equivalent to --directories=recurse
--include=PATTERN files that match PATTERN will be examined
--exclude=PATTERN files that match PATTERN will be skipped
--exclude-from=FILE files that match PATTERN in FILE will be skipped
--exclude-dir=PATTERN directories that match PATTERN will be skipped
requires a recent egrep binary for --exclude-dir
-L, --files-without-match only print FILE names containing no match
-l, --files-with-matches only print FILE names containing matches
-c, --count only print a count of matching lines per FILE
With no FILE, or when FILE is -, read standard input. If less than
two FILEs given, assume -h. Exit status is 0 if match, 1 if no match,
and 2 if trouble.
::_USAGE_BLOCK_END_::
exit $exit_status;
}
###############################################################################
## ----------------------------------------------------------------------------
## Define defaults and process command-line arguments.
##
###############################################################################
my $flag = sub { 1 };
my $isOk = sub { (@ARGV == 0 or $ARGV[0] =~ /^-/) ? usage(1) : shift @ARGV; };
my ($c_flag, $H_flag, $h_flag, $i_flag, $n_flag, $q_flag, $r_flag, $v_flag);
my (@r_patn, $arg, @files, @patterns, $re, $skip_args, $w_filename);
my ($L_flag, $l_flag, $f_list);
my $max_workers = 6; my $chunk_size = 4194304; ## 4M
my $max_count = 0; my $no_msg = 0;
## Option parsing step 1.
while ( @ARGV ) {
$arg = shift @ARGV; $arg =~ s/ /\\ /g;
if ($skip_args) {
push @files, $arg;
next;
}
if (substr($arg, 0, 2) eq '--') { ## --OPTION
$skip_args = $flag->() and next if ($arg eq '--');
$no_msg = $flag->() and next if ($arg eq '--no-messages');
$c_flag = $flag->() and next if ($arg eq '--count');
$i_flag = $flag->() and next if ($arg eq '--ignore-case');
$L_flag = $flag->() and next if ($arg eq '--files-without-match');
$l_flag = $flag->() and next if ($arg eq '--files-with-match');
$n_flag = $flag->() and next if ($arg eq '--line-number');
$q_flag = $flag->() and next if ($arg eq '--quiet');
$q_flag = $flag->() and next if ($arg eq '--silent');
$r_flag = $flag->() and next if ($arg eq '--recursive');
$v_flag = $flag->() and next if ($arg eq '--invert-match');
if ($arg eq '--help') {
usage(0);
}
if ($arg eq '^--regexp=(.+)') {
push @patterns, $1;
next;
}
if ($arg =~ m/^--include=.+/) {
push @r_patn, $arg;
next;
}
if ($arg =~ m/^--exclude=.+/) {
push @r_patn, $arg;
next;
}
if ($arg =~ m/^--exclude-from=.+/) {
push @r_patn, $arg;
next;
}
if ($arg =~ m/^--exclude-dir=.+/) {
push @r_patn, $arg;
next;
}
if ($arg eq '--with-filename') {
$H_flag = 1; $h_flag = 0;
next;
}
if ($arg eq '--no-filename') {
$H_flag = 0; $h_flag = 1;
next;
}
$max_count = $isOk->() and next if ($arg =~ /^--max-count$/);
$max_workers = $isOk->() and next if ($arg =~ /^--max[-_]workers$/);
$chunk_size = $isOk->() and next if ($arg =~ /^--chunk[-_]size$/);
if ($arg =~ /^--max-count=(.+)/) {
$max_count = $1;
next;
}
if ($arg =~ /^--max[-_]workers=(.+)/) {
$max_workers = $1;
next;
}
if ($arg =~ /^--chunk[-_]size=(.+)/) {
$chunk_size = $1;
next;
}
usage(2);
}
elsif (substr($arg, 0, 1) eq '-') { ## -OPTION
if ($arg eq '-') {
push @files, $arg;
next;
}
if ($arg =~ m/^-([cHhiLlmnqRrsv]+)$/) {
my $t_arg = reverse $1;
while ($t_arg) {
my $a = chop($t_arg);
$no_msg = $flag->() and next if ($a eq 's');
$c_flag = $flag->() and next if ($a eq 'c');
$i_flag = $flag->() and next if ($a eq 'i');
$n_flag = $flag->() and next if ($a eq 'n');
$q_flag = $flag->() and next if ($a eq 'q');
$r_flag = $flag->() and next if ($a eq 'R');
$r_flag = $flag->() and next if ($a eq 'r');
$v_flag = $flag->() and next if ($a eq 'v');
if ($a eq 'H') {
$H_flag = 1; $h_flag = 0;
}
elsif ($a eq 'h') {
$H_flag = 0; $h_flag = 1;
}
elsif ($a eq 'L') {
$L_flag = 1; $l_flag = 0;
}
elsif ($a eq 'l') {
$L_flag = 0; $l_flag = 1;
}
elsif ($a eq 'm') {
if (substr($arg, -1) eq 'm') {
$max_count = shift @ARGV;
}
elsif ($arg =~ /m(\d+)$/) {
$max_count = $1;
}
}
}
next;
}
if ($arg eq '-e') {
my $pattern = shift;
push @patterns, $pattern if (defined $pattern);
next;
}
usage(2);
}
push @files, $arg; ## FILE
}
## Option parsing step 2.
{
if (defined $max_count) {
unless (looks_like_number($max_count) && $max_count >= 0) {
print STDERR "$prog_name: invalid max count\n";
exit 2;
}
}
if ($max_workers !~ /^auto/) {
unless (looks_like_number($max_workers) && $max_workers > 0) {
print STDERR "$prog_name: invalid max workers\n";
exit 2;
}
}
if ($chunk_size =~ /^(\d+)K/i) {
$chunk_size = $1 * 1024;
}
elsif ($chunk_size =~ /^(\d+)M/i) {
$chunk_size = $1 * 1024 * 1024;
}
if (looks_like_number($chunk_size) && $chunk_size > 0) {
$chunk_size = 20_971_520 if $chunk_size > 20_971_520; ## 20M
$chunk_size = 204_800 if $chunk_size < 204_800; ## 200K
}
else {
print STDERR "$prog_name: invalid chunk size\n";
exit 2;
}
}
## Option parsing step 3.
$f_list = ($L_flag || $l_flag);
push @patterns, shift @files if (@patterns == 0 && @files > 0);
$w_filename = 1
if ((!$h_flag && @files > 1) || (!$h_flag && $r_flag) || $H_flag);
usage(2) if (@patterns == 0);
if (@patterns > 1) {
$re = '(?:' . join('|', @patterns) . ')';
}
else {
$re = $patterns[0];
}
###############################################################################
## ----------------------------------------------------------------------------
## MCE callback functions.
##
###############################################################################
my ($file, %result, $abort_all, $abort_job, $found_match);
my $exit_status = 0;
my $total_found = 0;
my $total_lines = 0;
my $order_id = 1;
keys(%result) = 4000;
sub aggregate_count {
my ($wk_count) = @_;
$total_found += $wk_count;
$found_match = 1 if ($total_found);
return;
}
sub display_result {
my ($result, $chunk_id) = @_;
return if ($abort_job);
$result{$chunk_id} = $result;
while (1) {
last unless exists $result{$order_id};
my $r = $result{$order_id};
if (!$abort_job && $r->{found_match}) {
$found_match = 1;
if ($q_flag) {
MCE->abort(); $abort_all = $abort_job = 1;
last;
}
for my $i (0 .. @{ $r->{matches} } - 1) {
$total_found++;
unless ($c_flag) {
printf "%s:", $file if ($w_filename);
printf "%d:", $r->{lines}[$i] + $total_lines if ($n_flag);
print $r->{matches}[$i];
}
if ($max_count && $max_count == $total_found) {
MCE->abort(); $abort_job = 1;
last;
}
}
}
$total_lines += $r->{line_count} if ($n_flag);
delete $result{$order_id};
$order_id++;
}
}
sub report_match {
if (!$abort_job) {
MCE->abort();
$abort_all = 1 if $q_flag;
$abort_job = $total_found = 1;
}
return;
}
###############################################################################
## ----------------------------------------------------------------------------
## MCE user functions.
##
###############################################################################
sub user_begin {
my ($mce) = @_;
if ($c_flag) {
use vars qw($match_re $eol_re $count);
our $match_re = $re . '.*' . $/;
our $eol_re = $/;
our $count = 0;
}
return;
}
sub user_end {
my ($mce) = @_;
if ($c_flag) {
MCE->do('aggregate_count', $count) if ($count);
}
return;
}
sub user_func {
my ($mce, $chunk_ref, $chunk_id) = @_;
my ($found_match, @matches, $line_count, @lines);
## Count and return immediately if -c was specified.
if ($c_flag && !$f_list) {
my $match_count = 0;
if ($i_flag) {
$match_count++ while ( $$chunk_ref =~ /$match_re/img );
} else {
$match_count++ while ( $$chunk_ref =~ /$match_re/mg );
}
if ($v_flag) {
unless ($eol_re eq "\n") {
$line_count = 0; $line_count++ while ( $$chunk_ref =~ /$eol_re/g );
} else {
$line_count = ( $$chunk_ref =~ tr/\n// );
}
$count += $line_count - $match_count;
}
else {
$count += $match_count;
}
return;
}
## Quickly determine if a match is found.
if (!$v_flag || $f_list) {
for (0 .. @patterns - 1) {
if ($i_flag) {
if ($$chunk_ref =~ /$patterns[$_]/im) {
$found_match = 1;
last;
}
}
else {
if ($$chunk_ref =~ /$patterns[$_]/m) {
$found_match = 1;
last;
}
}
}
}
if ($f_list) {
MCE->do('report_match')
if (($l_flag && $found_match) || ($L_flag && !$found_match));
return;
}
## Obtain file handle to slurped data.
## Collect matched data if slurped chunk data contains a match.
open my $_MEM_FH, '<', $chunk_ref;
binmode $_MEM_FH, ':raw';
if (!$v_flag && !$found_match) {
if ($n_flag) {
1 while (<$_MEM_FH>);
}
}
else {
if ($v_flag) {
if ($i_flag) {
while (<$_MEM_FH>) {
if ($_ !~ /$re/i) {
push @matches, $_; push @lines, $. if ($n_flag);
}
}
}
else {
while (<$_MEM_FH>) {
if ($_ !~ /$re/) {
push @matches, $_; push @lines, $. if ($n_flag);
}
}
}
}
else {
if ($i_flag) {
while (<$_MEM_FH>) {
if ($_ =~ /$re/i) {
push @matches, $_; push @lines, $. if ($n_flag);
}
}
}
else {
while (<$_MEM_FH>) {
if ($_ =~ /$re/) {
push @matches, $_; push @lines, $. if ($n_flag);
}
}
}
}
}
$line_count = $.;
close $_MEM_FH;
## Send results to the manager process.
my %result = (
'found_match' => scalar @matches,
'line_count' => $line_count,
'matches' => \@matches,
'lines' => \@lines
);
MCE->do('display_result', \%result, $chunk_id);
return;
}
###############################################################################
## ----------------------------------------------------------------------------
## Process routines.
##
###############################################################################
sub display_matched {
if (!$q_flag && $f_list) {
print "$file\n" if $total_found;
}
elsif (!$q_flag && $c_flag) {
printf "%s:", $file if $w_filename;
print "$total_found\n";
}
$total_found = $total_lines = 0;
$abort_job = undef;
$order_id = 1;
return;
}
sub process_file {
$file = $_[0];
if ($file eq '-') {
open(STDIN, ($^O eq 'MSWin32') ? 'CON' : '/dev/tty') or die $!;
process_stdin();
}
elsif (! -e $file) {
$exit_status = 2;
print STDERR "$prog_name: $file: No such file or directory\n"
unless $no_msg;
}
elsif (-d $file) {
$exit_status = 1;
}
else {
MCE->process($file);
display_matched();
}
return;
}
sub process_stdin {
$file = "(standard input)";
MCE->process(\*STDIN);
display_matched();
return;
}
###############################################################################
## ----------------------------------------------------------------------------
## Run.
##
###############################################################################
MCE->new(
max_workers => $max_workers, chunk_size => $chunk_size, use_slurpio => 1,
user_begin => \&user_begin, user_func => \&user_func,
user_end => \&user_end
);
if ($r_flag && @files > 0) {
my ($list_fh, $list);
MCE->spawn;
unless ($^O eq 'MSWin32') {
open $list_fh, '-|', 'egrep', '-lsr', @r_patn, '^', @files;
}
else {
$list = `egrep -lsr @r_patn ^ @files`;
open $list_fh, '<', \$list;
}
while (<$list_fh>) {
chomp;
process_file($_);
last if $abort_all;
}
close $list_fh;
}
elsif (@files > 0) {
foreach (@files) {
process_file($_);
last if $abort_all;
}
}
else {
process_stdin();
}
###############################################################################
## ----------------------------------------------------------------------------
## Finish.
##
###############################################################################
MCE->shutdown();
if (!$q_flag && $exit_status) {
exit($exit_status);
}
else {
exit($found_match ? 0 : ($exit_status ? $exit_status : 1));
}