The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package MyCPAN::App::BackPAN::Indexer;

use strict;
no warnings qw(uninitialized redefine);

use vars qw($VERSION $Starting_dir $logger);

use Carp;
use Cwd qw(cwd);
use File::Basename;
use File::Path qw(mkpath);
use File::Spec::Functions qw(catfile file_name_is_absolute rel2abs);
use File::Temp qw(tempdir);
use Getopt::Std;
use List::Util qw(max);
use Log::Log4perl;

$VERSION = '1.28_12';

=head1 NAME

MyCPAN::App::BackPAN::Indexer - The BackPAN indexer application

=head1 SYNOPSIS

	use MyCPAN::Indexer;

=head1 DESCRIPTION

=cut

$|++;

__PACKAGE__->activate( @ARGV ) unless caller;

BEGIN {
my $cwd = cwd();

my $report_dir = catfile( $cwd, 'indexer_reports' );

my %Defaults = (
	alarm                 => 15,
#	backpan_dir           => cwd(),
	copy_bad_dists        => 0,
	collator_class        => 'MyCPAN::Indexer::Collator::Null',
	dispatcher_class      => 'MyCPAN::Indexer::Dispatcher::Parallel',
	error_report_subdir   => catfile( $report_dir, 'errors'  ),
	indexer_class         => 'MyCPAN::Indexer',
	indexer_id            => 'Joe Example <joe@example.com>',
	interface_class       => 'MyCPAN::Indexer::Interface::Text',
	log_file_watch_time   => 30,
#	merge_dirs            => undef,
	organize_dists        => 0,
	parallel_jobs         => 1,
	pause_id              => 'MYCPAN',
	pause_full_name       => "MyCPAN user <CENSORED>",
	prefer_bin            => 0,
	queue_class           => 'MyCPAN::Indexer::Queue',
	report_dir            => $report_dir,
	reporter_class        => 'MyCPAN::Indexer::Reporter::AsYAML',
	retry_errors          => 1,
	success_report_subdir => catfile( $report_dir, 'success' ),
	system_id             => 'an unnamed system',
	worker_class          => 'MyCPAN::Indexer::Worker',
	perl                  => remember_perl( $^X ),
	);

=over 4

=item remember_perl

We need to remember the C<perl> that started our program. We want to use the
same binary to fire off other processes. WE have to do this very early because
we are going to discard most of the environment. After we do that, we can't
search the PATH to find the C<perl> binary.

=cut

sub remember_perl {
	require File::Which;

	my $perl = do {
		   if( file_name_is_absolute( $^X )      )  { $^X }
		elsif( my $f = File::Which::which( $^X ) )  { $f  }
		elsif( my $g = rel2abs( $^X )            )  { $g  }
		else                                        { undef }
		};

=pod

# All of this takes place before we have an object. :(

	my $sub = sub {
		my $perl = $self->get_config->perl;

		   if( not defined $perl ) {
			$logger->warn( "I couldn't find a perl! This may cause problems later." );
			}
		elsif( -x $perl ) {
			$logger->debug( "$perl is executable" );
			}
		else {
			$logger->warn( "$perl is not executable. This may cause problems later." );
			}
		};

	$self->push_onto_note( 'pre_logging_items', $sub );

=cut

	return $perl;
	}

=item default_keys

Return a list of the default keys.

=cut

sub default_keys { keys %Defaults }

=item default( KEY )

Return the default value for KEY.

=cut

sub default { $Defaults{$_[1]} }


=item config_class

Return the name of the configuration class to use. The default is
C<ConfigReader::Simple>. Any configuration class should respond to
the same interface.

=cut

sub config_class { 'ConfigReader::Simple' }

=item init_config

Load the configuration class, create the new object, and set the defaults.

=cut

sub init_config {
	my( $self, $file ) = @_;

	eval "require " . $self->config_class . "; 1";

	my $config = $self->config_class->new( defined $file ? $file : () );

	foreach my $key ( $self->default_keys ) {
		next if $config->exists( $key );
		$config->set( $key, $self->default( $key ) );
		}

	$config;
	}
}

=item adjust_config

After we setup everything, adjust the config for things that we discovered.
Set some defaults.

=cut

sub adjust_config {
	my( $application ) = @_;

	my $coordinator = $application->get_coordinator;
	my $config      = $coordinator->get_config;

	my( $backpan_dir, @merge_dirs ) = @{ $application->{args} };

	$config->set( 'backpan_dir', $backpan_dir ) if defined $backpan_dir;
	$config->set( 'merge_dirs', join "\x00", @merge_dirs ) if @merge_dirs;

	# set the directories to index, either set in:
		# first argument on the command line
		# config file
		# current working directory
	unless( $config->get( 'backpan_dir' ) ) {
		$config->set( 'backpan_dir', cwd() );
		}

	# in the config file, it's all a single line
	if( $config->get( 'merge_dirs' ) ) {
		my @dirs =
			grep { length }
			split /(?<!\\) /,
				$config->get( 'merge_dirs' ) || '';

		$config->set( 'merge_dirs', join "\x00", @dirs );
		}

	if( $config->exists( 'report_dir' ) ) {
		foreach my $subdir ( qw(success error) ) {
			$config->set(
				"${subdir}_report_subdir",
				catfile( $config->get( 'report_dir' ), $subdir ),
				);
			}
		}

	# Adjust for some environment variables
	my $log4perl_file =
		$ENV{'MYCPAN_LOG4PERL_FILE'}
			||
		$coordinator->get_note( 'log4perl_file' )
			;

	# Adjust for some environment variables
	$ENV{'PREFER_BIN'} = 1 if $config->get( 'prefer_bin' );

	$config->set( 'log4perl_file', $log4perl_file ) if $log4perl_file;

	return 1;
	}

=item new

=cut

sub new {
	my( $class, @args ) = @_;

	bless { args => [ @args ] }, $class;
	}

=item get_coordinator

=item set_coordinator

Convenience methods to deal with the coordinator

=cut

sub get_coordinator { $_[0]->{coordinator}         }
sub set_coordinator { $_[0]->{coordinator} = $_[1] }

=item process_options

Handle the configuration directives from the command line and set default
values:

	-f  config_file     Default is $script.conf
	-l  log4perl_file   Default is $script.log4perl
	-c                  Print the config and exit

=cut

sub process_options {
	my( $application ) = @_;

	my $run_dir = dirname( $0 );
	( my $script  = basename( $0 ) ) =~ s/\.\w+$//;

	local @ARGV = @{ $application->{args} };
	getopts( 'cl:f:', \ my %Options );

	# other things might want to use things from @ARGV, and
	# we just removed the bits that we wanted.
	$application->{args} = [ @ARGV ]; # XXX: yuck

	$Options{f} ||= catfile( $run_dir, "$script.conf" );

	#$Options{l} ||= catfile( $run_dir, "$script.log4perl" );

	$application->{options} = \%Options;
	}

sub get_option { $_[0]->{options}{$_[1]} }

=item setup_coordinator

Set up the coordinator object and set its initial values.

=cut

sub setup_coordinator {
	my( $application ) = @_;

	require MyCPAN::Indexer::Coordinator;
	my $coordinator = MyCPAN::Indexer::Coordinator->new;

	$coordinator->set_application( $application );
	$application->set_coordinator( $coordinator );

	$coordinator->set_note( 'UUID',          $application->get_uuid() );
	$coordinator->set_note( 'tempdirs',      [] );
	$coordinator->set_note( 'log4perl_file', $application->get_option( 'l' ) );

	$coordinator;
	}

=item handle_config

Load and set the configuration file.

You can set the configuration filename with the C<-f> option on the command
line.

You can print the configuration and exit with the C<-c> option.

=cut

sub handle_config {
	my( $application ) = @_;

	# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
	# Adjust config based on run parameters
	my $config = $application->init_config( $application->get_option('f') );
	$application->get_coordinator->set_config( $config );

	$application->adjust_config;

	# If this is a dry run, just print the directives and exit
	if( $application->get_option( 'c' ) ) {
		my @directives = $config->directives;
		my $longest = max( map { length } @directives );
		foreach my $directive ( sort @directives ) {
			printf "%${longest}s   %-10s\n",
				$directive,
				$config->get( $directive );
			}

		exit;
		}
	}

=item activate_steps

Returns a list of the steps to run in C<activate>.

=cut

sub activate_steps {
	qw(
	process_options
	setup_coordinator
	setup_environment
	handle_config
	setup_logging
	post_setup_logging_tasks
	adjust_config
	disable_the_missiles
	setup_dirs
	run_components
	activate_end
	);
	}

=item activate

Start the process.

=cut

sub activate {
	my( $class, @argv ) = @_;
	use vars qw( %Options $Starting_dir);
	$Starting_dir = cwd(); # remember this so we can change out of temp dirs in abnormal cleanup
	local %ENV = %ENV;

	my $application = $class->new( @argv );

	foreach my $step ( $application->activate_steps ) {
		$application->$step();
		}

	$application;
	}

=item run_components

Do the work.

=cut

sub run_components {
	my( $application ) = @_;

	# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
	# Load classes and check that they do the right thing
	my @components = $application->components;

	my $coordinator = $application->get_coordinator;

	my $config     = $coordinator->get_config;

	foreach my $tuple ( @components ) {
		my( $component_type, $default_class, $method ) = @$tuple;

		my $class = $config->get( "${component_type}_class" ) || $default_class;

		eval "require $class; 1" or die "$@\n";
		die "$component_type [$class] does not implement $method()"
			unless $class->can( $method );

		$logger->debug( "Calling $class->$method()" );

		my $component = $class->new;
		$component->set_coordinator( $coordinator );
		$component->$method();

		my $set_method = "set_${component_type}";
		$coordinator->$set_method( $component );
		}
	}

=item activate_end

Do stuff before we quit.

=cut

sub activate_end {
	my( $application ) = @_;

	$application->cleanup;

	$application->_exit;
	}

=item setup_environment

Delete what we don't want and set what we need.

We don't want most of the environment, just the minimal to make things not
break. We especially want to cleanse PATH. We keep these:

	DISPLAY
	USER
	HOME
	PWD
	TERM

Some of the things we need are:

	AUTOMATED_TESTING
	PERL_MM_USE_DEFAULT
	PERL_EXTUTILS_AUTOINSTALL

=cut

sub setup_environment {
	my %pass_through = map { $_, 1 } qw(
		DISPLAY USER HOME PWD TERM
		), grep { /\A(?:D|MYC)PAN_/ } keys %ENV;

	foreach my $key ( keys %ENV ) {
		delete $ENV{$key} unless exists $pass_through{$key}
		}

	# Testers conventions
	$ENV{AUTOMATED_TESTING}++;

	# Makemaker
	$ENV{PERL_MM_USE_DEFAULT}++;

	# Module::Install
	$ENV{PERL_EXTUTILS_AUTOINSTALL} = '--skipdeps';
	}

=item setup_logging

Initialize C<Log4perl>.

In the configuration, you can set

	log4perl_file
	log_file_watch_time

You can also use the environment to set the values:

	MYCPAN_LOG4PERL_FILE
	MYCPAN_LOGLEVEL (defaults to ERROR)

The environment takes precedence.

=cut

sub setup_logging {
	my( $self ) = @_;

	my $config   = $self->get_coordinator->get_config;

	my $log_config = do {
		no warnings 'uninitialized';
		if( -e $ENV{MYCPAN_LOG4PERL_FILE} ) {
			$ENV{MYCPAN_LOG4PERL_FILE};
			}
		elsif( -e $config->get( 'log4perl_file' ) ) {
			$config->get( 'log4perl_file' );
			}
		};

	if( defined $log_config ) {
		Log::Log4perl->init_and_watch(
			$log_config,
			$self->get_coordinator->get_config->get( 'log_file_watch_time' )
			);
		}
	else {
		my %hash = (
			DEBUG => $Log::Log4perl::DEBUG,
			ERROR => $Log::Log4perl::ERROR,
			WARN  => $Log::Log4perl::WARN,
			FATAL => $Log::Log4perl::FATAL,
			);

		my $level = defined $ENV{MYCPAN_LOGLEVEL} ?
			$ENV{MYCPAN_LOGLEVEL} : 'ERROR';

		Log::Log4perl->easy_init( $hash{$level} );
		}

	$logger = Log::Log4perl->get_logger( 'backpan_indexer' );
	}

=item post_setup_logging_tasks

Logging has to happen after we read the config, but there are some
things I'd like to check and log, so I must wait to log. Anyone who
wants to log something before logging has been set up should push a
sub reference onto the C<pre_logging_items> note.

=cut

sub post_setup_logging_tasks {
	my $application = shift;

	# this stuff happened too early to set a pre_logging_items
	$application->_log_perl;

	my $coordinator = $application->get_coordinator;

	my @items = $coordinator->get_note( 'pre_logging_items' );

	foreach my $item ( @items ) {
		next unless ref $item eq ref sub {};
		$item->();
		}

	1;
	}

sub _log_perl {
	my( $application ) = @_;

	my $coordinator = $application->get_coordinator;
	my $config      = $coordinator->get_config;

	my $perl = $config->perl;

	   if( not defined $perl ) {
		$logger->warn( "I couldn't find a perl! This may cause problems later." );
		}
	elsif( -x $perl ) {
		$logger->debug( "$perl is executable" );
		}
	else {
		$logger->warn( "$perl is not executable. This may cause problems later." );
		}
	}

=item disable_the_missiles

Catch INT signals and set up error handlers to direct things toward Log4perl.
Some of this stuff is a bit dangerous, maybe.

=cut

sub disable_the_missiles {
	my( $self ) = @_;

	$self->install_int_handler;
	$self->install_warn_handler;
	}

=item install_int_handler

Catch INT signals so we can log it, clean up, and exit nicely.

=cut

sub install_int_handler {
	#$SIG{__DIE__} = \&Carp::confess;

	# If we catch an INT we're probably in one of the temporary directories
	# and have some files open. To clean up the temp dirs, we have to move
	# above them, so change back to the original directory.
	$SIG{INT} = sub {
		$logger->error("Caught SIGINT in $$" );
		chdir $Starting_dir;
		exit()
		};
	}

=item install_warn_handler

Make C<warn> go to C<Log4perl>.

=cut

sub install_warn_handler {
	$SIG{__WARN__} = sub {
		$logger->warn( @_ );
		};
	}

=item components

An array of arrays that list the components to load and the method each
component needs to implement. You can override the implementing class through
the configuration.

=cut

sub components {
	(
	[ qw( reporter   MyCPAN::Indexer::Reporter::AsYAML     get_reporter   ) ],
	[ qw( queue      MyCPAN::Indexer::Queue                get_queue      ) ],
	[ qw( dispatcher MyCPAN::Indexer::Dispatcher::Parallel get_dispatcher ) ],
	[ qw( indexer    MyCPAN::Indexer                       get_indexer    ) ],
	[ qw( worker     MyCPAN::Indexer::Worker               get_task       ) ],
	[ qw( collator   MyCPAN::Indexer::Collator::Null       get_collator   ) ],
	[ qw( interface  MyCPAN::Indexer::Interface::Curses    do_interface   ) ],
	)
	}

=item cleanup

Clean up on the way out. We're already done with the run.

=cut

sub cleanup {
	my( $self ) = @_;

	require File::Path;

	my @dirs =
		@{ $self->get_coordinator->get_note('tempdirs') },
		$self->get_coordinator->get_config->temp_dir;
	$logger->debug( "Dirs to remove are @dirs" );

	eval {
		no warnings;
		File::Path::rmtree [@dirs];
		};

	$logger->error( "Couldn't cleanup: $@" ) if $@;
	}

# I don't remember why I made an explicit exit. Was it to get
# out of a Tk app or something?
sub _exit {
	my( $self ) = @_;

	$logger->info( "Exiting from ", __PACKAGE__ );

	exit 0;
	}

=item setup_dirs

Setup the temporary directories, report directories, and so on, etc.

=cut

sub setup_dirs { # XXX big ugly mess to clean up
	my( $self ) = @_;

	my $config = $self->get_coordinator->get_config;

# Okay, I've gone back and forth on this a couple of times. There is
# no default for temp_dir. I create it here so it's only set when I
# need it. It either comes from the user or on-demand creation. I then
# set it's value in the configuration.

	my $temp_dir = $config->temp_dir || tempdir( DIR => cwd(), CLEANUP => 1 );
	$logger->debug( "temp_dir is [$temp_dir] [" . $config->temp_dir . "]" );
	$config->set( 'temp_dir', $temp_dir );


	my $tempdirs = $self->get_coordinator->get_note( 'tempdirs' );
	push @$tempdirs, $temp_dir;
	$self->get_coordinator->set_note( 'tempdirs', $tempdirs );

	mkpath( $temp_dir ) unless -d $temp_dir;
	$logger->logdie( "temp_dir [$temp_dir] does not exist!" ) unless -d $temp_dir;

	foreach my $key ( qw(report_dir success_report_subdir error_report_subdir) ) {
		my $dir = $config->get( $key );

		mkpath( $dir ) unless -d $dir;
		$logger->logdie( "$key [$dir] does not exist!" ) unless -d $dir;
		}

	if( $config->retry_errors ) {
		$logger->warn( 'retry_errors no longer deletes error reports, but the worker should skip them if the setting is false' );
		}
	}

=item get_uuid

Generate a unique identifier for this indexer run.

=cut

sub get_uuid {
	require Data::UUID;
	my $ug = Data::UUID->new;
	my $uuid = $ug->create;
	$ug->to_string( $uuid );
	}

=back

=head1 TO DO

=over 4

=item Count the lines in the files

=item Code stats? Lines of code, lines of pod, lines of comments

=back

=head1 SOURCE AVAILABILITY

This code is in Github:

	git://github.com/briandfoy/mycpan-indexer.git

=head1 AUTHOR

brian d foy, C<< <bdfoy@cpan.org> >>

=head1 COPYRIGHT AND LICENSE

Copyright (c) 2008-2013, brian d foy, All Rights Reserved.

You may redistribute this under the same terms as Perl itself.

=cut

1;