The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl

use strict;
use warnings;

use File::Temp;
use Path::Class;
use List::Util qw(sum);
use Getopt::Long::Descriptive;

#-----------------------------------------------------------------------------

# Copyright 2013 Jeffrey Ryan Thalhammer <jeff@stratopan.com>

#-----------------------------------------------------------------------------

# To diagnose leaks:

# use Pinto::Util;
# *Pinto::Util::throw = sub { warn ">>> Called my throw"; die "@_" };

# use Devel::Leak::Object qw{ GLOBAL_bless };

#-----------------------------------------------------------------------------

my ($opt, $usage) = describe_options(
	"$0 %o TARGETS",
    [ 'root|r=s',          "Root of repository",                            ],
    [ 'cpan|c=s',          "Path to a local cpan",                          ],
    [ 'batch-size|s=i',    "Distributions per batch",    { default => 100 } ],
    [ 'batch-offset|o=i',  "Batch number to start at",   { default => 0   } ],
    [ 'batch-count|n=i',   "Number of batches to run",   { default => 0   } ],
);

my $batch_size   = $opt->batch_size;
my $batch_offset = $opt->batch_offset;
my $batch_count  = $opt->batch_count;
my $root  = $opt->root || File::Temp->newdir;
my $cpan  = $opt->cpan or die "Must specify --cpan";

#-----------------------------------------------------------------------------

unless (-e $root) {

	require Pinto::Initializer;

	my %init_args = (sources => "file://$cpan", root => $root, no_history => 1);
	Pinto::Initializer->init(%init_args);
}

#-----------------------------------------------------------------------------

chomp (my @dists = ($ARGV[0] || '') eq '-' ? <STDIN> : search_for_dists($cpan));
my @batches = map { [splice @dists, 0, $batch_size, ()] } 1..int (@dists/$batch_size+1);

# Lop off the leading batches, if offset given
# TODO: make sure batch offset is in range
splice @batches, 0, $batch_offset, () if $batch_offset;

# Lop off trailing batches, if count is given
# TODO: make sure count is in range
@batches = @batches[0..($batch_count-1)] if $batch_count;

# Count the total number of dists
my $count = (@batches - 1) * $batch_size + @{ $batches[-1] };

#-----------------------------------------------------------------------------
printf "Loading %i distributions (in batches of %i) into repository at %s\n", $count, $batch_size, $root;

require Pinto;
my $pinto = Pinto->new(root => "$root");

my $n = $batch_offset;
while (@batches) {

	local $ENV{PINTO_NO_CHECKSUMS} = 1;
	local $ENV{PINTO_ALLOW_DUPLICATES} = 1;

	my $batch = shift @batches;
	s|^.*/authors/id/./../|| for @{ $batch };

	my $start = time;
	my $result = $pinto->run(Pull => (targets => $batch, no_recurse => 1, no_fail => 1, message => ''));
	printf "Batch %i loaded in %i seconds at %s\n", $n, time - $start, scalar localtime;

	if ($n % 100 == 0) {
		print "Vacuuming database\n";
		$pinto->repo->db->schema->storage->dbh->do('VACUUM;');
		$pinto->repo->db->schema->storage->dbh->do('ANALYZE;');
	}
	$n++;
}


#-----------------------------------------------------------------------------

sub search_for_dists {
	my ($cpan) = @_;

	printf "Searching for distributions beneath %s\n", $cpan;

	require File::Find;

	my @dists;
	my $cb = sub {push @dists, $_ if /[.](gz|tgz|zip|bz2)$/i and not /perl/ and not /BadExample/};
	File::Find::find( {no_chdir => 1, wanted => $cb}, dir($cpan)->subdir( qw(authors id) ));
	@dists = map {$_->[0]} sort {$a->[1] <=> $b->[1]} map { [$_ => (stat $_)[9]] } @dists;

	printf "Found %d distributions in total\n", scalar @dists;

	return @dists;
}