The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package File::Shuffle;

use strict;
use warnings;
use File::Temp qw(tempdir);
use Sort::External;
use Data::Dump qw(dump);

BEGIN
{
	use Exporter ();
	use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
	$VERSION     = '0.10';
	@ISA         = qw(Exporter);
	@EXPORT      = qw();
	@EXPORT_OK   = qw(fileShuffle);
	%EXPORT_TAGS = ();
}

#01234567890123456789012345678901234567891234
#Randomly shuffle the lines in a file.

=head1 NAME

C<File::Shuffle> - Randomly shuffle the lines in a file.

=head1 SYNOPSIS

  use File::Temp qw(tempfile);
  use File::Shuffle qw(fileShuffle);
  use Data::Dump qw(dump);
  my ($handle, $inputFile) = tempfile();
  print $handle join("\n", 0..9, '');
  close $handle;
  fileShuffle (inputFile => $inputFile);
  open ($handle, '<', $inputFile);
  my @lines = <$handle>;
  close $handle;
  print @lines;

=head1 DESCRIPTION

C<File::Shuffle> provides the routine C<fileShuffle> to randomly shuffle the lines
in a file.

=head1 SUBROUTINES

=head2 C<fileShuffle>

The subroutine C<fileShuffle> randomly shuffles the lines in a file with
the following parameters:

=over

=item C<inputFile>

  inputFile => '...'

C<inputFile> holds the path to the file whose lines are to be shuffled; if it does
not exist or if it is not a file an exception is thrown.

=item C<outputFile>

  outputFile => '...'

C<outputFile> is the file the shuffled lines are to be written to, it may equal
C<inputFile>; the default is C<inputFile>.

=item C<tempDirectory>

  tempDirectory => File::Temp::tempdir()

C<tempDirectory> is a temporary directory that intermediate files are written to if the C<inputFile>
is too large to shuffle using only internal memory; the default
is set using L<File::Temp::tempdir()|File::Temp/FUNCTIONS>.

=item C<encoding>

  encoding => ''

C<encoding> is the encoding to used when openning the input and output files; the default is the
the system default of the Perl C<open> function.

=item C<fileSizeBound>

  fileSizeBound => 1000000

If the input file contains less than C<fileSizeBound> bytes, the file will be shuffled entirely using
internal memory, otherwise L<Sort::External> is used to shuffle the lines in the file.

=back

=cut

sub fileShuffle
{
	my (%Parameters) = @_;

	# make sure the input file was defined.
	unless (exists($Parameters{inputFile}) && defined($Parameters{inputFile}))
	{
		die("error: inputFile parameter is undefined.\n");
	}
	my $InputFile = $Parameters{inputFile};

	# make sure the input files was defined.
	unless (defined $InputFile)
	{
		die("error: input file undefined.\n");
	}

	# make sure the input file exists.
	unless (-e $InputFile)
	{
		die("error: input file '$InputFile' does not exist.\n");
	}

	# make sure the input file is a file.
	unless (-f $InputFile)
	{
		die("error: input file '$InputFile' is not a file.\n");
	}

	# set the default encoding to utf8.
	my $Encoding = '';
	$Encoding = $Parameters{encoding} if (exists($Parameters{encoding}) && defined($Parameters{encoding}));

	# ensure the encoding is prefixed with a colon.
	$Encoding = ':' . $Encoding if (length($Encoding) && (substr($Encoding, 0, 1) ne ':'));

	# set the default file size bound.
	my $FileSizeBound = 1000000;
	$FileSizeBound = int abs $Parameters{fileSizeBound} if (exists($Parameters{fileSizeBound}) && defined($Parameters{fileSizeBound}));

	# set the temp directory if defined.
	my $TempDirectory;
	$TempDirectory = $Parameters{tempDirectory} if (exists($Parameters{tempDirectory}) && defined($Parameters{tempDirectory}));

	# set the temp directory if defined.
	my $OutputFile = $InputFile;
	$OutputFile = $Parameters{outputFile} if (exists($Parameters{outputFile}) && defined($Parameters{outputFile}));

	# open the input file for reading only.
	my $inputFileHandle;
	unless (open($inputFileHandle, "<$Encoding", $InputFile))
	{
		die("could not open file '$InputFile' for reading: $!\n");
	}

	# get the size of the file.
	my $inputFileBytes = -s $InputFile;

	# small files are shuffled like an internal array.
	if ($inputFileBytes <= $FileSizeBound)
	{

		# the file is small enough to read into memory and shuffle.
		shuffleInternal($inputFileHandle, $OutputFile, $Encoding);
	}
	else
	{
		# the file is too large to read in, so shuffle via a random prefix and sort.

		# create and set the temporary directory if needed.
		$TempDirectory = tempdir(CLEANUP => 1) unless defined $TempDirectory;

		# shuffle the file via sorting.
		shuffleExternal($inputFileHandle, $OutputFile, $Encoding, $TempDirectory, $inputFileBytes);
	}

	return undef;
}

sub shuffleInternal
{
	my ($InputHandle, $OutputFile, $Encoding) = @_;

	# read in all the lines of the file.
	my @linesInFile = <$InputHandle>;

	# close the input file.
	close $InputHandle;

	# shuffle the lines.
	my $totalLines = @linesInFile;
	for (my $i = 0 ; $i < $totalLines ; $i++)
	{

		# select a random line to swap $i with.
		my $j = int rand $totalLines;

		# save the line at $j.
		my $lineAtj = $linesInFile[$j];

		# replace line at $j with $i.
		$linesInFile[$j] = $linesInFile[$i];

		# replace line at $i with $j.
		$linesInFile[$i] = $lineAtj;
	}

	# open the output file for writing only.
	my $outputFileHandle;
	unless (open($outputFileHandle, ">$Encoding", $OutputFile))
	{
		die("could not open file '$OutputFile' for writing: $!\n");
	}

	# output the shuffled lines.
	for (my $i = 0 ; $i < $totalLines ; $i++)
	{
		print $outputFileHandle $linesInFile[$i];
		$linesInFile[$i] = undef;
	}

	# close the output file.
	close $outputFileHandle;

	return undef;
}

sub shuffleExternal
{
	use bytes;

	my ($InputHandle, $OutputFile, $Encoding, $TempDirectory, $InputFileBytes) = @_;

	# set the prefix size.
	my $prefixByteSize = getPrefixByteSize($InputFileBytes);

	# create the sorter.
	my $sorter = Sort::External->new(mem_threshold => 64 * 1024 * 1024, working_dir => $TempDirectory);

	# add each line to the sorter prefixed with a random string of $prefixByteSize bytes.
	while (defined(my $line = <$InputHandle>))
	{

		# feed the prefix and line to the sorter.
		$sorter->feed(getRandomString($prefixByteSize) . $line);
	}

	# close the input file.
	close $InputHandle;

	# finish the sorting.
	$sorter->finish();

	# open the output file for writing only.
	my $outputFileHandle;
	unless (open($outputFileHandle, ">$Encoding", $OutputFile))
	{
		die("could not open file '$OutputFile' for writing: $!\n");
	}

	while (defined(my $prefixedLine = $sorter->fetch))
	{

		# write the original line to the output file.
		print $outputFileHandle substr($prefixedLine, $prefixByteSize);
	}
	$sorter = undef;

	# close the output file.
	close $outputFileHandle;

	return undef;
}

sub getPrefixByteSize
{
	my ($BytesInFile) = @_;

	# make sure the total bytes is a non-negative integer.
	$BytesInFile = int abs $BytesInFile;

	# if less than two, return zero.
	return 0 if $BytesInFile < 2;

	# assume each line in the file is at least two bytes;
	# compute the number of bits needed to represent the
	# maximum possible number of lines.
	my $maxPrefixBits = log(abs($BytesInFile) + 1) / log(2) - 1;
	$maxPrefixBits = 1 if $maxPrefixBits < 1;
	$maxPrefixBits = int $maxPrefixBits;

	# compute the number of bytes needed for the prefixes.
	my $bytesInPrefix = int(($maxPrefixBits + 7) / 8);

	return $bytesInPrefix;
}

sub getRandomString
{

	# get the number of bytes in the string.
	my $totalBytes = $_[0];

	# get the number of shorts in the string.
	my $totalShorts = $totalBytes >> 1;

	# generate the shorts.
	my @listOfShorts;
	while ($totalShorts > 0)
	{
		push @listOfShorts, pack('S', int rand(1 << 16));
		--$totalShorts;
	}

	# if totalBytes is odd, add one more random byte.
	push @listOfShorts, pack('C', int rand(1 << 8)) if $totalBytes & 1;

	# return the string.
	return join('', @listOfShorts);
}

=head1 INSTALLATION

Use L<CPAN> to install the module and all its prerequisites:

  perl -MCPAN -e shell
  cpan[1]> install File::Shuffle

=head1 BUGS

Please email bugs reports or feature requests to C<bug-file-shuffle@rt.cpan.org>, or through
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=File-Shuffle>.  The author
will be notified and you can be automatically notified of progress on the bug fix or feature request.

=head1 AUTHOR

 Jeff Kubina<jeff.kubina@gmail.com>

=head1 COPYRIGHT

Copyright (c) 2009 Jeff Kubina. All rights reserved.
This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the
LICENSE file included with this module.

=head1 KEYWORDS

file, permute, randomize, shuffle

=head1 SEE ALSO

L<Sort::External>

=cut

1;

# The preceding line will help the module return a true value