lib/CPAN/Mini/Indexed.pm


package CPAN::Mini::Indexed ;

use strict ;
use warnings ;
use Carp ;

BEGIN 
{
use Sub::Exporter -setup => 
	{
	exports => [ qw(search check_index show_database_information) ],
	groups  => 
		{
		all  => [ qw() ],
		}
	};
	
use vars qw ($VERSION);
$VERSION     = '0.03_01';
}

#-------------------------------------------------------------------------------

use Time::HiRes 'time' ;
use File::Temp ;
use Text::Pluralize;
use File::Find::Rule ;
use IO::Zlib ;
use Archive::Tar ;
use File::Copy ;

use Search::Indexer::Incremental::MD5 qw() ;
use Search::Indexer::Incremental::MD5::Indexer qw() ;
use Search::Indexer::Incremental::MD5::Searcher qw() ;
use Search::Indexer::Incremental::MD5::Language::Perl qw() ;

use English qw( -no_match_vars ) ;

use Readonly ;
Readonly my $EMPTY_STRING => q{} ;

#-------------------------------------------------------------------------------

=head1 NAME

CPAN::Mini::Indexed - Index the content of your CPAN mini repository

=head1 SYNOPSIS


=head1 DESCRIPTION

This module implements ...

=head1 DOCUMENTATION

=head1 SUBROUTINES/METHODS

=cut

#----------------------------------------------------------------------------------------------------------

sub show_database_information
{

=head2 ( )

  some code

I<Arguments>

=over 2 

=item * $ - 

=back

I<Returns> - Nothing

I<Exceptions> - None

=cut

my ($options) = @_ ;
my $information = Search::Indexer::Incremental::MD5::show_database_information($options->{index_directory}) ;

# make sizes more readable
1 while $information->{entries} =~ s/^([-+]?\d+)(\d{3})/$1_$2/ ;
1 while $information->{size} =~ s/^([-+]?\d+)(\d{3})/$1_$2/ ;

print {*STDOUT} <<"EOI" ;
Location: $options->{index_directory}
Last updated on: $information->{update_date}
Number of indexed documents: $information->{entries}
Database size: $information->{size} bytes
EOI
}

#----------------------------------------------------------------------------------------------------------

sub search
{

=head2 ( )

  some code

I<Arguments>

=over 2 

=item * $ - 

=back

I<Returns> - Nothing

I<Exceptions> - None

=cut

my ($options) = @_ ;

my $searcher 
	= eval 
		{
		Search::Indexer::Incremental::MD5::Searcher->new
			(
			INDEX_DIRECTORY => $options->{index_directory}, 
			USE_POSITIONS => 0, 
			WORD_REGEX => qr/\w+/,
			);
		} or croak "No full text index found! $EVAL_ERROR\n" ;

my $results  = $searcher->search(SEARCH_STRING => $options->{search}) ;

my @indexes = map { $_->[0] }
				reverse
					sort { $a->[1] <=> $b->[1] }
						map { [$_, $results->[$_]{SCORE}] }
							0 .. $#$results ;

my ($displayed_matches, %displayed_module) = (0) ;

for my $index (@indexes)
	{
	last if $displayed_matches++ == $options->{lines} ;
	
	my $matching_file = $results->[$index]{PATH} ;
	
	unless($matching_file)
		{
		carp "matched id:'$results->[$index]{ID}' which was removed!\n" ;
		next ;
		}
	
	(my $matching_file_short = $matching_file) =~ s{^/tmp/[^/]+/}{} ;
	
	if($options->{modules_only})
		{
		(my $matching_module = $matching_file_short) =~ s{^([^/]+).*}{$1} ;
		$matching_module =~ s/(.*)-.*/$1/g ;
		$matching_module =~ s/-/::/g ;
		
		print {*STDOUT} "$matching_module\n" unless exists $displayed_module{$matching_module} ;
		
		$displayed_module{$matching_module} += $results->[$index]{SCORE} ;
		}
	else
		{
		if($options->{verbose})
			{
			print {*STDOUT} "'$matching_file_short' [id:'$results->[$index]{ID}', score: '$results->[$index]{SCORE}]'\n" ;
			}
		else
			{
			print {*STDOUT} "$matching_file_short\n" ;
			}
		}
	}
}

#----------------------------------------------------------------------------------------------------------

sub check_index
{

=head2 check_index($indexer, $options)

 brings the cpan mini index database up to date

I<Arguments> - 

$indexer, $options

I<Returns> -  Nothing

I<Exceptions> - 

=cut

my ($options) = @_ ;

my $cpan_mini = $options->{cpan_mini} || $ENV{CPAN_MINI} || '/devel/cpan' ;
$cpan_mini =~ s{^\./}[] ;
$cpan_mini =~ s{/$}[] ;

croak "Invalid cpan mini repository!\n" if $cpan_mini eq $EMPTY_STRING;

printf "[CPAN mini repository in '$cpan_mini']\n" if ($options->{verbose}) ;

my $modules_details_file = "$cpan_mini/modules/02packages.details.txt.gz" ;
my $cache_details_file = "$options->{index_directory}/02packages.details.txt.gz" ;

if(index_needs_update($modules_details_file, $cache_details_file))
	{
	my @stopwords = (STOPWORDS => $options->{stopwords_file}) if $options->{stopwords_file} ;

	my $indexer = Search::Indexer::Incremental::MD5::Indexer->new
					(
					INDEX_DIRECTORY => $options->{index_directory}, 
					USE_POSITIONS => 0, 
					Search::Indexer::Incremental::MD5::Language::Perl::get_perl_word_regex_and_stopwords(),
					@stopwords,
					) ;

	my ($modules_in_repository, $modules_up_to_date, $modules_out_of_date) = scan_index($cpan_mini, $indexer) ;

	remove_out_of_date_modules($indexer, $modules_out_of_date, $options) ;

	my %new_modules = grep { ! exists $modules_up_to_date->{$_} } keys %{$modules_in_repository};

	add_new_modules($indexer, $cpan_mini, \%new_modules, $options) ;

	if(-e $modules_details_file)
		{
		copy($modules_details_file, $cache_details_file) or carp "Warning: '$cache_details_file' creation failed: $!" ;
		}
	}

return ;
}

#----------------------------------------------------------------------------------------------------------

sub index_needs_update
{

=head2 ( )

  some code

I<Arguments>

=over 2 

=item * $ - 

=back

I<Returns> - Nothing

I<Exceptions> - None

=cut

my ($modules_details_file, $cache_details_file) = @_ ;

my $index_need_update = 1 ;

if(-e $modules_details_file && -e $cache_details_file)
	{
	if
		(
		Search::Indexer::Incremental::MD5::get_file_MD5($modules_details_file) 
			eq Search::Indexer::Incremental::MD5::get_file_MD5($cache_details_file)
		)
		{
		$index_need_update = 0 ; 
		}
	}
	
return $index_need_update ;
}

#----------------------------------------------------------------------------------------------------------

sub scan_index
{

=head2 ( )

  some code

I<Arguments>

=over 2 

=item * $ - 

=back

I<Returns> - Nothing

I<Exceptions> - None

=cut

my ($cpan_mini, $indexer) = @_ ;

my %modules_in_repository
	= map {chomp($_) ; $_ => 1} 
		File::Find::Rule
			->file()
			->name('*.tar.gz')
			->in($cpan_mini);


my (%modules_up_to_date, %indexed_modules_to_remove) ;

$indexer->check_indexed_files
		(
		DONE_ONE_FILE_CALLBACK =>
			sub 
			{
			my ($file, $description, $file_info) = @_ ;			
			
			if(exists $modules_in_repository{"$cpan_mini/$description"})
				{
				# we can't delete $modules_in_repository{$cpan_mini . $description} as
				# it may contain multiple indexed files
				$modules_up_to_date{"$cpan_mini/$description"}++ ;
				}
			else
				{
				$indexed_modules_to_remove{$description}{$file} = $file_info->{ID} ;
				}
			},
		) ;
		
return (\%modules_in_repository, \%modules_up_to_date, \%indexed_modules_to_remove) ;
}

#----------------------------------------------------------------------------------------------------------

sub remove_out_of_date_modules
{

=head2 ( )

  some code

I<Arguments>

=over 2 

=item * $ - 

=back

I<Returns> - Nothing

I<Exceptions> - None

=cut

my ($indexer, $modules_out_of_date, $options) = @_ ;

my $t0_remove = time ;

my $number_of_modules = scalar(keys %{$modules_out_of_date}) ;
my $module_index = 0 ;
my $total_number_of_files = 0 ;

for my $module_to_remove(sort keys %{$modules_out_of_date})
	{
	my $t0_remove_module = time ;
	
	$module_index++ ;
	print "-$module_to_remove\n" ;
	
	my $number_of_files_in_module = 0 ;
	
	for my $module_element (sort keys %{$modules_out_of_date->{$module_to_remove}})
		{
		(my $module_element_short = $module_element) =~ s{^/tmp/[^/]+/}[] ;
		
		$total_number_of_files++ ;
		$number_of_files_in_module++ ;
		
		print "\t-$module_element_short\n" if $options->{verbose} ;
		
		$indexer->remove_document_with_id($modules_out_of_date->{$module_to_remove}{$module_element})   ;
		}
		
	if ($options->{verbose})
		{
		printf
			"\t[$module_index/$number_of_modules ($number_of_files_in_module) in %.3f s.]\n",
			(time - $t0_remove_module)  ;
		}
	}

if ($options->{verbose})
	{
	printf "[Removed $total_number_of_files files in $number_of_modules modules in %.3f s.]\n", (time - $t0_remove) ; 
	}
}

#----------------------------------------------------------------------------------------------------------

sub add_new_modules
{

=head2 ( )

  some code

I<Arguments>

=over 2 

=item * $ - 

=back

I<Returns> - Nothing

I<Exceptions> - None

=cut

my ($indexer, $cpan_mini, $new_modules, $options) = @_ ;

my $module_index = 0 ;
my $total_number_of_files = 0 ;
my $number_of_modules = scalar(keys %{$new_modules}) ;

my $one_warning = 0 ;
local $SIG{__WARN__} = get_sig_warn_sub(\$one_warning) ;

my $t0_add_modules = time;

for my $module (sort keys %{$new_modules})
	{
	my $t0_module = time ;
	
	$one_warning = 0 ;
	$module_index++ ;
	
	(my $module_to_add_short = $module) =~ s{^$cpan_mini/}[] ;
	print "+$module_to_add_short\n" ;

	my $directory = File::Temp->newdir() ;
	my $extraction_directory = $directory->dirname;

	my $next_archive_item = Archive::Tar->iter($module, 1);

	while(my $item = $next_archive_item->()) 
		{
		my $item_name = $item->name() ;
		$item->extract("$extraction_directory/$item_name") 
			or carp "Error: failed Extracting '$item_name' from '$module'!\n";
		}	
	
	my @files = File::Find::Rule->file()->name( '*.pod', '*.pl', '*.pm')->in($extraction_directory);

	my $number_of_files_in_module = scalar(@files) ;
	$total_number_of_files += $number_of_files_in_module ;
	
	my $t0_index = time ;
	
	for my $file (@files)
		{
		(my $file_short = $file) =~ s{^/tmp/[^/]+/}{} ;
		print "\t+$file_short\n" if ($options->{verbose}) ;
			
		$indexer->add_files
			(
			FILES => [map { {NAME => $_, DESCRIPTION => $module_to_add_short} } $file],
			MAXIMUM_DOCUMENT_SIZE => $options->{maximum_document_size},
			) ;
		}
		
	if ($options->{verbose})
		{
		printf
			"\t[$module_index/$number_of_modules ($number_of_files_in_module) in "
			. "%.3f s. (indexing: %.3f s.)]\n",
			(time - $t0_module), (time - $t0_index)  ;
		}
	}
	
if ($options->{verbose})
	{
	print {*STDOUT} 
		pluralize("[Re-indexed $total_number_of_files file(s) in ", $total_number_of_files),
		pluralize("$number_of_modules module(s) in ", $number_of_modules),
		sprintf("%.3f s.]\n", (time - $t0_add_modules)) ;
	}
}

#----------------------------------------------------------------------------------------------------------

sub get_sig_warn_sub
{

=head2 ( )

  some code

I<Arguments>

=over 2 

=item * $ - 

=back

I<Returns> - Nothing

I<Exceptions> - None

=cut

my ($one_warning) = @_ ;

return 
	sub
		{
		my ($warning) = @_ ;
		
		if
			(
			$warning =~ /^Invalid header block at offset unknown/
			|| $warning =~ /^Couldn't read chunk/
			|| $warning =~ /checksum error/
			)
			{
			if(! $$one_warning)
				{
				print "\tInvalid Archive!\n" ;
				$$one_warning++ ;
				}
			else
				{
				# ignore
				}
			}
		else
			{
			if($warning =~ m~'/tmp/.+?/(.+)' is bigger than .+ bytes, skipping!~)
				{
				print "\tSkipping '$1', too big!\n" ;
				}
			else
				{
				warn $warning ;
				}
			}
		} ;
}

#-------------------------------------------------------------------------------

1 ;

=head1 BUGS AND LIMITATIONS

None so far.

=head1 AUTHOR

	Nadim ibn hamouda el Khemir
	CPAN ID: NKH
	mailto: nadim@cpan.org

=head1 LICENSE AND COPYRIGHT

This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc CPAN::Mini::Indexed

You can also look for information at:

=over 4

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/CPAN-Mini-Indexed>

=item * RT: CPAN's request tracker

Please report any bugs or feature requests to  L <bug-cpan-mini-indexed@rt.cpan.org>.

We will be notified, and then you'll automatically be notified of progress on
your bug as we make changes.

=item * Search CPAN

L<http://search.cpan.org/dist/CPAN-Mini-Indexed>

=back

=head1 SEE ALSO


=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)