The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl
###############################################################################
## ----------------------------------------------------------------------------
## Word count script similar to the wc binary.
##
## The logic below does not support multi-byte characters. The main focus is
## demonstrating Many-core Engine for Perl. Use this script for large file(s).
##
## The usage description was largely ripped off from the wc man page.
##
###############################################################################

use strict;
use warnings;

use Cwd qw(abs_path);
use lib abs_path . "/../lib";

my $prog_name = $0; $prog_name =~ s{^.*[\\/]}{}g;

sub INIT {
   ## Provide file globbing support under Windows similar to Unix.
   @ARGV = <@ARGV> if ($^O eq 'MSWin32');
}

use IPC::Open2;
use MCE;

###############################################################################
## ----------------------------------------------------------------------------
## Display usage and exit.
##
###############################################################################

sub usage {

   print <<"::_USAGE_BLOCK_END_::";

NAME
   $prog_name -- word, line, and character count

SYNOPSIS
   $prog_name [-clw] [file ...]

DESCRIPTION
   The $prog_name utility displays the number of lines, words, and bytes
   contained in each input file, or standard input (if not file is
   specified) to the standard output.  A line is defined as a string
   of characters delimited by a <newline> character.

   The following options are available:

   --max-workers MAX_WORKERS
          Specify number of workers for MCE   -- default: 8

   --chunk-size CHUNK_SIZE
          Specify chunk size for MCE          -- default: 2M

   -c     Display the number of bytes
   -l     Display the number of lines
   -w     Display the number of words

   When an option is specified, $prog_name, only reports the information
   requested by that option. The order of output always takes the form
   of line, word, byte, and file name. The default action is equivalent
   to specifying the -c -l and -w options.

   If no files are specified, the standard input is used and no file name
   is displayed.  The prompt will accept input until receiving EOF, or
   [^D] in most environments.

EXIT STATUS
   The $prog_name utility exits 0 on success, and >0 if an error occurs.

EXAMPLES
   Count the number of bytes, words and lines in each of the files
   report1 and report2 as well as the totals for both:

         $prog_name -c -w -l report1 report2
         $prog_name -cwl report1 report2
         $prog_name report1 report2

   Count the number of lines: (pass -- to treat following args as files)
         $prog_name -l -- -filename_with_dash

::_USAGE_BLOCK_END_::

   exit 1
}

###############################################################################
## ----------------------------------------------------------------------------
## Define defaults and process command-line arguments.
##
###############################################################################

my $flag = sub { 1; };
my $isOk = sub { (@ARGV == 0 or $ARGV[0] =~ /^-/) ? usage() : shift @ARGV; };

my $chunk_size  = 2097152;  ## 2M
my $max_workers = 8;
my $skip_args   = 0;

my $c_flag = 0;
my $l_flag = 0;
my $w_flag = 0;

my @files = ();

while ( my $arg = shift @ARGV ) {
   unless ($skip_args) {
      if ($arg eq '-') {
         push @files, $arg;
         next;
      }
      if ($arg =~ m/^-[clw]+$/) {
         while ($arg) {
            my $a = chop($arg);
            $c_flag = $flag->() and next if ($a eq 'c');
            $l_flag = $flag->() and next if ($a eq 'l');
            $w_flag = $flag->() and next if ($a eq 'w');
         }
         next;
      }

      $skip_args   = $flag->() and next if ($arg eq '--');
      $max_workers = $isOk->() and next if ($arg =~ /^--max[-_]workers$/);
      $chunk_size  = $isOk->() and next if ($arg =~ /^--chunk[-_]size$/);

      if ($arg =~ /^--max[-_]workers=(.+)/) {
         $max_workers = $1;
         next;
      }
      if ($arg =~ /^--chunk[-_]size=(.+)/) {
         $chunk_size = $1;
         next;
      }

      usage() if ($arg =~ /^-/);
   }

   push @files, $arg;
}

if ($c_flag + $l_flag + $w_flag == 0) {
   $c_flag = $l_flag = $w_flag = 1;
}

my ($wc_cmd, $wc_args);

if ($^O ne 'cygwin' && $^O ne 'MSWin32') {
   $wc_cmd = (-x '/usr/bin/wc')
           ? '/usr/bin/wc' : ((-x '/bin/wc') ? '/bin/wc' : undef);
}

if (defined $wc_cmd && ($l_flag || $w_flag)) {
   $wc_args  = '-';
   $wc_args .= 'l' if ($l_flag);
   $wc_args .= 'w' if ($w_flag);
}

###############################################################################
## ----------------------------------------------------------------------------
## Configure Many-core Engine.
##
###############################################################################

## Called once per file (prior to chunking) -- think of awk BEGIN { ... }

sub user_begin {

   my ($mce) = @_;

   $mce->{wk_lines} = 0;
   $mce->{wk_words} = 0;
   $mce->{wk_bytes} = 0;

   use vars qw($wc_pid $wc_out $wc_in);
   our ($wc_pid, $wc_out, $wc_in);

   if (defined $wc_cmd && ($l_flag || $w_flag)) {
      $wc_pid = open2($wc_out, $wc_in, "$wc_cmd $wc_args");
   }

   return;
}

## Called once per chunk of data -- think of forchuck { ... }

sub user_func {

   my ($mce, $chunk_ref, $chunk_id) = @_;
   my $line_count;

   if ($l_flag || $w_flag) {
      if (defined $wc_cmd) {
         syswrite($wc_in, $$chunk_ref);
      }
      else {
         open my $_MEM_FH, '<', $chunk_ref;
         binmode $_MEM_FH;
         1 while <$_MEM_FH>;
         $line_count = $.;
         close $_MEM_FH;

         $mce->{wk_lines} += $line_count;

         if ($w_flag) {
            if (index($$chunk_ref, ' ') >= 0 || index($$chunk_ref, "\t") >= 0) {
               my $words = 0; $words++ while ($$chunk_ref =~ m!\S+!mg);
               $mce->{wk_words} += $words;
            }
            else {
               $mce->{wk_words} += $line_count;
            }
         }
      }
   }

   $mce->{wk_bytes} += length($$chunk_ref) if ($c_flag);

   return;
}

## Called once per file (after chunking) -- think of awk END { ... }

sub user_end {

   my ($mce) = @_;

   if (defined $wc_cmd && ($l_flag || $w_flag)) {
      close $wc_in;

      my $result = <$wc_out>; chomp $result;

      if ($result) {
         if ($l_flag && $w_flag) {
            if ($result =~ m/(\d+)\s+(\d+)/) {
               $mce->{wk_lines} = $1;
               $mce->{wk_words} = $2;
            }
         }
         elsif ($l_flag) {
            $mce->{wk_lines} = $result;
         }
         else {
            $mce->{wk_words} = $result;
         }
      }

      waitpid($wc_pid, 0);
   }

   my %subtotal = (
      'lines' => $mce->{wk_lines},
      'words' => $mce->{wk_words},
      'bytes' => $mce->{wk_bytes}
   );

   $mce->do('main::aggregate_result', \%subtotal);

   return;
}

## Instantiate Many-core Engine and spawn workers.

my $mce = MCE->new(
   user_begin  => \&user_begin,          ## Called prior to chunking
   user_func   => \&user_func,           ## Think of forchunk { ... }
   user_end    => \&user_end,            ## Called after chunking
   chunk_size  => $chunk_size,
   max_workers => $max_workers,
   use_slurpio => 1
);

###############################################################################
## ----------------------------------------------------------------------------
## Word, line, and character count.
##
###############################################################################

my ($f_lines, $f_words, $f_bytes) = (0, 0, 0);
my ($t_lines, $t_words, $t_bytes) = (0, 0, 0);
my $exit_status = 0;

sub aggregate_result {

   my $subtotal_ref = shift;

   $f_lines += $subtotal_ref->{'lines'};
   $f_words += $subtotal_ref->{'words'};
   $f_bytes += $subtotal_ref->{'bytes'};

   $t_lines += $subtotal_ref->{'lines'};
   $t_words += $subtotal_ref->{'words'};
   $t_bytes += $subtotal_ref->{'bytes'};

   return;
}

sub display_result {

   my ($lines, $words, $bytes, $file) = @_;
   my $result = '';

   $result .= sprintf " %7d", $lines if ($l_flag);
   $result .= sprintf " %7d", $words if ($w_flag);
   $result .= sprintf " %7d", $bytes if ($c_flag);
   $result .= sprintf " %s", $file if (defined $file);

   print $result, "\n";
   return;
}

## Process files, otherwise read from standard input.

if (@files > 0) {
   for my $file (@files) {
      if (! -e $file) {
         print STDERR "$prog_name: $file: No such file or directory\n";
         $exit_status = 2;
      }
      elsif (-d $file) {
         print STDERR "$prog_name: $file: Is a directory\n";
         $exit_status = 1;
      }
      else {
         if ($c_flag && ($l_flag + $w_flag == 0)) {
            $f_bytes  = -s $file;
            $t_bytes += $f_bytes;
         }
         else {
            $mce->process($file);
         }
         display_result($f_lines, $f_words, $f_bytes, $file);
         $f_lines = $f_words = $f_bytes = 0;
      }
   }
   if (@files > 1) {
      display_result($t_lines, $t_words, $t_bytes, 'total');
   }
}
else {
   $mce->process(\*STDIN);
   display_result($f_lines, $f_words, $f_bytes);
}

## Shutdown Many-core Engine and exit.

$mce->shutdown();
exit $exit_status;