package CPAN::Digger::Index;
use 5.008008;
use Moose;

our $VERSION = '0.08';

extends 'CPAN::Digger';

use autodie;
use Cwd qw(abs_path cwd);
use Capture::Tiny qw(capture);
use Data::Dumper qw(Dumper);
use File::Basename qw(basename dirname);
use File::Copy qw(copy move);
use File::Copy::Recursive qw(fcopy rcopy);
use File::Path qw(mkpath);
use File::Spec ();
use File::Temp qw(tempdir);
use File::Find::Rule ();
use File::ShareDir   ();
use JSON qw(to_json from_json);
use List::Util qw(max);
use Parse::CPAN::Whois ();

#use Parse::CPAN::Authors  ();
use POSIX                 ();
use Parse::CPAN::Packages ();
use YAML                  ();
use PPIx::EditorTools::Outline;
use Perl::Critic;
use Archive::Any;

#require Archive::Any::Plugin::Tar;
#require Archive::Any::Plugin::Zip;

use CPAN::Digger::PPI;
use CPAN::Digger::Pod;
use CPAN::Digger::DB;
use CPAN::Digger::Tools;

#has 'counter'    => (is => 'rw', isa => 'HASH');
has 'counter_distro' => ( is => 'rw', isa => 'Int', default => 0 );
has 'dir'            => ( is => 'ro', isa => 'Str' );
has 'prefix'         => ( is => 'ro', isa => 'Str' );
has 'authors'        => ( is => 'rw', isa => 'Parse::CPAN::Authors' );

has 'cpan'   => ( is => 'ro', isa => 'Str' );
has 'output' => ( is => 'ro', isa => 'Str' );
has 'filter' => ( is => 'ro', isa => 'Str' );

has 'prepare' => ( is => 'ro', isa => 'Str' );
has 'pod'     => ( is => 'ro', isa => 'Str' );
has 'syn'     => ( is => 'ro', isa => 'Str' );
has 'outline' => ( is => 'ro', isa => 'Str' );
has 'critic'  => ( is => 'ro', isa => 'Str' );

my $dbx;

sub db {
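	# Lazily construct and cache a single CPAN::Digger::DB handle so every
	# call site below (e.g. db->get_all_distros, db->dbh) shares one
	# database connection.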
	if ( not $dbx ) {
		$dbx = CPAN::Digger::DB->new;
		$dbx->setup;
	}
	return $dbx;
}

sub process_all_distros {
	my ($self) = @_;

	my $distros = db->get_all_distros;
	LOG('start processing distros');

	#LOG(Dumper $distros);
	my $filter = $self->filter;
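	# The filter attribute, when set, is treated as a regular expression;
	# only distribution names matching it are processed.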
	foreach my $name ( sort keys %$distros ) {

		next if $filter and $name !~ qr{$filter};

		LOG("Work on $name");
		my $d       = $distros->{$name};
		my $details = db->get_distro_details_by_id( $d->{id} );
		next if $details;
		$self->process_distro( $d->{path} );
	}
	LOG('done processing all distros');

	return;
}


# process a single distribution given the (relative) path to it
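# For example (a hypothetical call, using the same relative path format that
# collect_distributions stores in the database):
#   $self->process_distro('F/FA/FAKE1/My-Package-1.02.tar.gz');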
sub process_distro {
	my ( $self, $path ) = @_;

	#$self->counter_distro($self->counter_distro +1);
	LOG("Working on $path");

	my $d = db->get_distro_by_path($path);
	die "Could not find distro by path '$path'" if not $d;

	my $src_dir  = File::Spec->catdir( $self->output, 'src',  uc $d->{author} );
	my $dist_dir = File::Spec->catdir( $self->output, 'dist', $d->{name} );
	my $syn_dir  = File::Spec->catdir( $self->output, 'syn',  $d->{name} );

	mkpath $_ for ( $dist_dir, $src_dir, $syn_dir );

	if ( $self->prepare ) {
		return if $d->{unzip_error};
		$self->prepare_src( $d, $src_dir, $path ) or return;
	}

	chdir File::Spec->catdir( $src_dir, $d->{distvname} ); # source dir; use the full path so this also works when prepare_src did not chdir

	my @files = File::Find::Rule->file->relative->in('.');

	my $pods = $self->generate_html_from_pod( $dist_dir, $d );

	my %data;

	$data{modules} = $pods->{modules};
	if ( @{ $pods->{pods} } ) {
		$data{pods} = $pods->{pods};
	}

	my ( $outlines, $min_versions, $pc_violations, $version_markers ) =
		$self->generate_outline( $dist_dir, $data{modules} );

	$self->generate_syn( $syn_dir, $data{modules} );

	$self->collect_meta_data( \%data );
	$data{distvname} = $d->{distvname};

	LOG( "update_distro_details for $path by " . Dumper \%data );

	my $dist = db->get_distro_by_path($path);

	#LOG("Update DB for id $dist->{id}");

	my $min_perl_version = 1;
	db->dbh->begin_work;


	# add files to database
	foreach my $f (@files) {
		db->add_file( $f, $dist->{id} );
	}

	foreach my $t ( @{ $data{modules} } ) {
		db->update_module( $t, $min_versions->{ $t->{name} }, 1, $dist->{id} );
		$min_perl_version = max( $min_versions->{ $t->{name} }, $min_perl_version );
	}
	foreach my $t ( @{ $data{pods} } ) {
		db->update_module( $t, $min_versions->{ $t->{name} }, 0, $dist->{id} );
		$min_perl_version = max( $min_versions->{ $t->{name} }, $min_perl_version );
	}

	foreach my $o (@$outlines) {

		#CPAN::Digger::Index::LOG("add subs $o->{name} " . Dumper $o);
		db->add_subs( $o->{name}, $o->{methods} );
	}
	$data{min_perl} = $min_perl_version;
	db->update_distro_details( \%data, $dist->{id} );
	{
		my $policies = db->get_all_policies;
		if (%$pc_violations) {
			my $id_of_file = db->get_file_ids_of_dist( $dist->{id} );

			#die Dumper $id_of_file;
			foreach my $file ( keys %$pc_violations ) {

				#die $file;

				if ( not $id_of_file->{$file} ) {
					warn("id of file '$file' is missing");
					next;
				}
				foreach my $v ( @{ $pc_violations->{$file} } ) {
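					# 'Perl::Critic::Policy::' is 22 characters, so this keeps
					# only the short policy name, e.g. the violation class
					# Perl::Critic::Policy::Subroutines::ProhibitExplicitReturnUndef
					# becomes Subroutines::ProhibitExplicitReturnUndef.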
					my $policy = substr( $v->policy, 22 );

					#print "$policy\n";
					if ( not $policies->{$policy} ) {
						$policies->{$policy} = db->add_policy($policy);
					}

					#print STDERR $id_of_file->{$file}, "\n";
					db->add_violation( $v, $id_of_file->{$file}{id}, $policies->{$policy} );
				}
			}
		}
	}
	{
		open my $out, '>', "$dist_dir/version.txt";
		print $out "<pre>\n";
		print $out "Overall min perl version: $min_perl_version\n\n";
		print $out "Markers:\n\n";
		print $out $version_markers;
		print $out "\n</pre>\n";
	}

	db->dbh->commit;

	return;
}

# assume we are in the project directory
sub collect_meta_data {
	my ( $self, $data ) = @_;

	$data->{has_meta_yml} = -e 'META.yml';

	# TODO we need to make sure the data we read from META.yml is correct and
	# someone does not try to fill it with garbage or too much data.
	if ( $data->{has_meta_yml} ) {
		eval {
			my $meta = YAML::LoadFile('META.yml');

			#print Dumper $meta;
			my @fields = qw(license abstract author name requires version);
			foreach my $field (@fields) {
				$data->{meta}{$field} = $meta->{$field};
			}
			if ( $meta->{resources} ) {
				foreach my $field (qw(repository homepage bugtracker license)) {
					$data->{meta}{resources}{$field} = $meta->{resources}{$field};
				}
			}
		};
		if ($@) {
			WARN("Exception while reading YAML file: $@");

			#$counter{exception_in_yaml}++;
			$data->{exception_in_yaml} = $@;
		}
	}
	$data->{has_meta_json} = -e 'META.json';

	if ( -d 'xt' ) {
		$data->{has_xt} = 1;
	}
	if ( -d 't' ) {
		$data->{has_t} = 1;
	}
	if ( -f 'test.pl' ) {
		$data->{test_file} = 1;
	}
	my @example_dirs = qw(eg examples);
	foreach my $dir (@example_dirs) {
		if ( -d $dir ) {
			$data->{examples} = $dir;
		}
	}
	my @changes_files = qw(Changes CHANGES ChangeLog);


	my @readme_files = qw(README);

	my @special_files =
		sort grep { -e $_ } ( qw(META.yml MANIFEST INSTALL Makefile.PL Build.PL), @changes_files, @readme_files );

	if ( $data->{meta}{resources}{repository} ) {
		my $repo = delete $data->{meta}{resources}{repository};
		$data->{meta}{resources}{repository}{display} = $repo;
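		# Turn a git:// GitHub URL into a browsable http:// link; an
		# illustrative rewrite: git://github.com/USER/Some-Dist.git
		# becomes http://github.com/USER/Some-Dist.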
		$repo =~ s{git://(github\.com/.*)\.git}{http://$1};
		$data->{meta}{resources}{repository}{link} = $repo;
	}

	$data->{special_files} = \@special_files;

	foreach my $t ( @{ $data->{modules} }, @{ $data->{pods} } ) {
		$t->{path} =~ s{\\}{/}g;
	}

	return;
}

# unzip if needed or copy files if we were supplied with a directory structure (e.g. an svn checkout)
sub prepare_src {
	my ( $self, $d, $src_dir, $path ) = @_;

	my $full_path = File::Spec->catfile( $self->cpan, 'authors', 'id', $path );

	chdir $src_dir;
	my $distv_dir = File::Spec->catdir( $src_dir, $d->{distvname} );
	if ( not -e $distv_dir ) {
		my $unzip = $self->unzip( $path, $full_path, $d->{distvname} );
		return if not $unzip;
	}

	if ( not -e $d->{distvname} ) {
		WARN("No directory for '$d->{distvname}'");

		#$counter{no_directory}++;
		db->unzip_error( $path, 'no_directory', $d->{distvname} );
		return;
	}
	return 1;
}

# starting from current directory
sub generate_html_from_pod {
	my ( $self, $dir, $d ) = @_;

	my %ret;
	$ret{modules} = $self->_generate_html( $dir, '.pm',  'lib', $d );
	$ret{pods}    = $self->_generate_html( $dir, '.pod', 'lib', $d );

	return \%ret;
}

sub generate_syn {
	my ( $self, $dir, $files ) = @_;

	return if not $self->syn;

	foreach my $file (@$files) {
		my $outfile = File::Spec->catfile( $dir, $file->{path} );
		mkpath dirname $outfile;
		my $html;
		eval {
			my $ppi = CPAN::Digger::PPI->new( infile => $file->{path} );
			$html = $ppi->get_syntax;
		};
		if ($@) {
			ERROR("Exception while generating syn in PPI for $file->{path}  $@");
			next;
		}

		LOG("Save syn in $outfile");

		#my %data = (
		#filename => $opt{infile},
		#	code => $html,
		#);
		#my $tt = $self->get_tt;
		#$tt->process('syntax.tt', \%data, $outfile) or die $tt->error;
		open my $out, '>', $outfile;
		print $out qq{<div class="code">$html</div>};

	}

	return;
}

sub generate_outline {
	my ( $self, $dir, $files ) = @_;

	return if not $self->outline;

	my $pc = $self->critic ? Perl::Critic->new( -profile => $self->critic ) : undef;

	my @all_outlines;
	my %all_versions;
	my $all_version_markers = '';
	my %all_violations;
	foreach my $file (@$files) {

		my $min_perl;
		my $version_markers;
		my $outline;
		my @violations;
		eval {
			my $ppi = CPAN::Digger::PPI->new( infile => $file->{path} );

			$outline = PPIx::EditorTools::Outline->new->find( ppi => $ppi->get_ppi );

			( $min_perl, $version_markers ) = $ppi->min_perl;

			if ( $self->critic ) {
				@violations = $pc->critique( $ppi->get_ppi );
			}
		};
		if ($@) {
			ERROR("Exception in PPI while generating outline for $file->{path} $@");
			next;
		}

		my $outfile = File::Spec->catfile( $dir, "$file->{path}.json" );

		#my $vm_file = File::Spec->catfile($dir, "$file->{path}.vm.txt");
		#my $pc_file = File::Spec->catfile($dir, "$file->{path}.pc.txt");
		mkpath dirname $outfile;

		LOG( "Save outline in $outfile " . Dumper $outline);
		{
			open my $out, '>', $outfile;
			print $out to_json( $outline, { pretty => 1 } );
		}
		push @all_outlines, @$outline;

		my $module = $file->{path};
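		# Turn the file path into a package name; an illustrative path such as
		# lib/My/Package.pm becomes My::Package.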
		$module =~ s{^lib/}{};
		$module =~ s{\.pm$}{};
		$module =~ s{/}{::}g;
		$all_versions{$module} = "$min_perl"; # forced stringification
		$all_version_markers .= "$module\n" . Dumper($version_markers) . "\n";

		if (@violations) {
			$all_violations{ $file->{path} } = \@violations;
		}
	}

	return ( \@all_outlines, \%all_versions, \%all_violations, $all_version_markers );
}

sub _generate_html {
	my ( $self, $dir, $ext, $path, $d ) = @_;

	my @files = eval {
		sort map { _untaint_path($_) }
			File::Find::Rule->file->name("*$ext")->extras( { untaint => 1 } )->relative->in($path);
	};

	# id/K/KA/KAWASAKI/WSST-0.1.1.tar.gz
	# directory (lib/WSST/Templates/perl/lib/WebService/) {company_name} is still tainted at /usr/share/perl/5.10/File/Find.pm line 869.
	if ($@) {
		WARN("Exception in File::Find::Rule: $@");
		return [];
	}
	my @data;
	my $tt     = $self->get_tt;
	my $author = uc $d->{author};

	my $dist = $d->{name};
	foreach my $file (@files) {
		my $module = substr( $file, 0, -1 * length($ext) );
		$module =~ s{/}{::}g;
		my $infile = File::Spec->catfile( $path, $file );
		my $outfile = File::Spec->catfile( $dir, $infile );
		mkpath dirname $outfile;

		my %info = (
			path => $infile,
			name => $module,
		);
		if ( $self->pod ) {
			LOG("POD: $infile -> $outfile");
			my $pod = CPAN::Digger::Pod->new();

			#$pod->batch_mode(1);
			# description?
			# keywords?
			my ( $header_top, $header_bottom, $footer );

			# We now only generate the "inside" of the pod and put it together
			# with the header and footer on the fly.

			#$tt->process('incl/header_top.tt', {}, \$header_top) or die $tt->error;
			#$tt->process('incl/header_bottom.tt', {}, \$header_bottom) or die $tt->error;
			#$tt->process('incl/footer.tt', {}, \$footer) or die $tt->error;
			#$pod->html_header_before_title( $header_top );
			#$header_bottom .= qq((<a href="/src/$author/$d->{distvname}/$path/$file">source</a>));
			#$header_bottom .= qq((<a href="/syn/$dist/$path/$file">syn</a>));
			#$pod->html_header_after_title( $header_bottom );
			#$pod->html_footer( $footer );

			$pod->html_header_before_title('');
			$pod->html_header_after_title('');
			$pod->html_footer('');

			eval { $info{html} = $pod->process( $infile, $outfile ); };
			if ($@) {
				ERROR("Exception when processing pod '$infile' of $path to '$outfile'  $@");
				next;
			}
			$info{abstract} = delete $pod->{__abstract};
		}
		push @data, \%info;
	}
	return \@data;
}


sub generate_central_files {
	my $self = shift;

	# copy static files from public/ to --outdir
	my $outdir = _untaint_path( $self->output );

	#	mkpath $outdir;

	rcopy( File::ShareDir::dist_dir('CPAN-Digger'), $outdir );

	return;
}


sub unzip {
	my ( $self, $path, $full_path, $distvname ) = @_;

	if ( $full_path !~ m/\.(tar\.bz2|tar\.gz|tgz|zip)$/ ) {
		WARN("Does not know how to unzip $full_path");
		db->unzip_error( $path, 'invalid_extension', '' );
		return;
	}

	LOG("Unzipping '$full_path'");
	my $archive;
	eval {
		local $SIG{__WARN__} = sub { die shift };
		$archive = Archive::Any->new($full_path);
		die 'Could not unzip' if not $archive;
	};
	if ($@) {
		WARN("Exception in unzip: $@");
		db->unzip_error( $path, 'exception', $@ );
		return;
	}

	my $is_naughty;
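	# Archive::Any flags an archive as 'naughty' when it would extract files
	# outside of its own directory (for example via absolute paths or '..').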
	eval { $is_naughty = $archive->is_naughty; };

	# TODO!

	if ($is_naughty) {
		WARN("Archive is naughty");
		db->unzip_error( $path, 'naughty_archive', '' );
		return;
	}
	my $dir = $distvname;
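	# An 'impolite' archive, in Archive::Any terms, extracts straight into the
	# current directory instead of creating its own top-level directory, so
	# give it one named after the distribution.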
	eval {
		if ( $archive->is_impolite )
		{
			mkdir $dir;
			$archive->extract($dir);
		} else {
			$archive->extract();
		}
	};
	if ($@) {
		WARN("Exception in unzip extract: $@");
		db->unzip_error( $path, 'exception', $@ );
		return;
	}

	# my $cwd = eval { _untaint_path(cwd()) };
	# if ($@) {
	# WARN("Could not untaint cwd: '" . cwd() . "'  $@");
	# return;
	# }
	# my $temp = tempdir( CLEANUP => 1 );
	# chdir $temp;
	# my ($out, $err) = eval { capture { system($cmd) } };
	# if ($@) {
	# die "$cmd $@";
	# }
	# if ($err) {
	# WARN("Command ($cmd) failed: $err");
	# chdir $cwd;
	# return;
	# }

	# TODO check if this was really successful?
	# TODO check what were the permission bits
	_chmod('.');

	opendir my ($dh), '.';
	my @content = eval {
		map { _untaint_path($_) } grep { $_ ne '.' and $_ ne '..' } readdir $dh;
	};
	if ($@) {
		WARN("Could not untaint content of directory: $@");

		#chdir $cwd;
		db->unzip_error( $path, 'tainted_directory', $@ );
		return;
	}

	#print "CON: @content\n";
	# if (@content == 1 and $content[0] eq $d->distvname) {
	# # using external mv as File::Copy::move cannot move directory...
	# my $cmd_move = "mv " . $d->distvname . " $cwd";
	# #LOG("Moving " . $d->distvname . " to $cwd");
	# LOG($cmd_move);
	# #move $d->distvname, File::Spec->catdir( $cwd, $d->distvname );
	# system($cmd_move);
	# # TODO: some files open with only read permissions on the main directory.
	# # this needs to be reported and I need to correct it on the local unzip setting
	# # xw on the directories and w on the files
	# #chdir $cwd;
	# return 1;
	# } else {
	# my $target_dir = eval { _untaint_path(File::Spec->catdir( $cwd, $d->distvname )) };
	# if ($@) {
	# WARN("Could not untaint target_directory: $@");
	# chdir $cwd;
	# return;
	# }
	# LOG("Need to create $target_dir");
	# mkdir $target_dir;
	# foreach my $thing (@content) {
	# system "mv $thing $target_dir";
	# }
	# chdir $cwd;
	# return 2;
	# }

	return 1;
}

sub _chmod {
	my $dir = shift;
	opendir my ($dh), $dir;
	my @content = eval {
		map { _untaint_path($_) } grep { $_ ne '.' and $_ ne '..' } readdir $dh;
	};
	if ($@) {
		WARN("Could not untaint: $@");
	}
	foreach my $thing (@content) {
		my $path = File::Spec->catfile( $dir, $thing );
		if ( -l $path ) {
			WARN("Symlink found '$path'");
			unlink $path;
		} elsif ( -d $path ) {
			chmod 0755, $path;
			_chmod($path);
		} elsif ( -f $path ) {
			chmod 0644, $path;
		} else {
			WARN("Unknown thing '$path'");
		}
	}
	return;
}

sub _untaint_path {
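	# Accept only paths made of word characters, slashes, colons, backslashes,
	# dots and dashes, and refuse anything containing '..'; e.g. the
	# (illustrative) path F/FA/FAKE1/My-Package-1.02.tar.gz passes, while
	# ../etc/passwd makes us confess.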
	my $p = shift;
	if ( $p =~ m{^([\w/:\\.-]+)$}x ) {
		$p = $1;
	} else {
		Carp::confess("Untaint failed for '$p'\n");
	}
	if ( index( $p, '..' ) > -1 ) {
		Carp::confess("Found .. in '$p'\n");
	}
	return $p;
}

sub collect_distributions {
	my ($self) = @_;

	LOG('collecting list of distributions');

	return if not $self->cpan;

	db->dbh->begin_work;

	my $files = File::Find::Rule->file()->relative

		#   ->name( '*.tar.gz' )
		->start( $self->cpan . '/authors/id' );

	while ( my $file = $files->match ) {
		next if $file =~ m{\.meta$};
		next if $file =~ m{\.readme$};
		next if $file =~ m{\.pl$};
		next if $file =~ m{\.pm$};
		next if $file =~ m{\.txt$};
		next if $file =~ m{\.png$};
		next if $file =~ m{\.html$};
		next if $file =~ m{CHECKSUMS$};
		next if $file =~ m{/\w+$};

		# limit processing when profiling
		#$main::count++;
		#last if $main::count > 200;

		# Sample files:
		# F/FA/FAKE1/My-Package-1.02.tar.gz
		# Z/ZI/ZIGOROU/Module-Install-TestVars-0.01_02.tar.gz
		# G/GR/GREENBEAN/Asterisk-AMI-v0.2.0.tar.gz
		# Z/ZA/ZAG/Objects-Collection-029targz/Objects-Collection-0.29.tar.gz
		my $PREFIX           = qr{\w/\w\w/(\w+)/};
		my $SUBDIRS          = qr{(?:[\w/-]+/)};
		my $PACKAGE          = qr{([\w-]*?)};
		my $VERSION_NO       = qr{[\d._]+};
		my $CRAZY_VERSION_NO = qr{[\w.]+};
		my $EXTENSION        = qr{(?:\.tar\.gz|\.tgz|\.zip|\.tar\.bz2)};
		my $full_path        = $self->cpan . '/authors/id/' . $file;
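		# The captures below are $1 = PAUSE id (from $PREFIX), $2 = distribution
		# name and $3 = version; they are passed to insert_distro in that order.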
		if ($file =~ m{^$PREFIX           # P/PA/PAUSEID
			   $SUBDIRS?          # optional garbage
			   $PACKAGE
			   -v?($VERSION_NO)      # version
			   $EXTENSION
			   $}x
			)
		{

			#print "$1  - $2 - $3\n";
			my @args = ( $1, $2, $3, $file, ( stat $full_path )[9], time );
			LOG("insert_distro @args");
			db->insert_distro(@args);

			# K/KR/KRAKEN/Net-Telnet-Cisco-IOS-0.4beta.tar.gz
		} elsif (
			$file =~ m{^$PREFIX           # P/PA/PAUSEID
			   $SUBDIRS?          # optional garbage
			   $PACKAGE
			   -v?($CRAZY_VERSION_NO)      # version
			   $EXTENSION
			   $}x
			)
		{
			my @args = ( $1, $2, $3, $file, ( stat $full_path )[9], time );
			LOG("insert_distro @args");
			db->insert_distro(@args);
		} else {
			WARN("could not parse filename '$file'");
		}
	}

	db->dbh->commit;

	LOG('done collecting distributions');

	return;
}

sub update_from_whois {
	my ($self) = @_;

	LOG('start whois');
	my $file = $self->cpan . '/authors/00whois.xml';
	if ( not -e $file ) {
		die "Could not find whois file $file";

		# TODO: minicpan does not mirror this file;
		# either ask RJBS to include it, mirror it ourselves, or use the
		# other file (01mailrc.txt.gz, I think).
		# Create ~/.minicpanrc with the following line in it:
		#  also_mirror: authors/00whois.xml
	}

	db->dbh->begin_work;

	my $whois = Parse::CPAN::Whois->new($file);
	foreach my $who ( $whois->authors ) {
		my $pauseid = $who->pauseid;
		my $have    = db->get_author($pauseid);

		#print Dumper $have;
		my %new_data;
		my $changed;
		foreach my $field (qw(email name asciiname homepage)) {
			$new_data{$field} = $who->$field;
			if ($have) {
				no warnings;
				$changed = 1 if $new_data{$field} ne $have->{$field};
			}
		}
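		# Build the author's home directory under authors/id/; e.g. the sample
		# PAUSE id FAKE1 used in collect_distributions maps to
		# .../authors/id/F/FA/FAKE1.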
		my $homedir =
			sprintf( '%s/authors/id/%s/%s/%s', $self->cpan, substr( $pauseid, 0, 1 ), substr( $pauseid, 0, 2 ),
			$pauseid );
		$new_data{homedir} = -d $homedir ? 1 : 0;

		# has author.json ?
		my %author_profile;
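		# Pick the lexicographically last author-*.json file, assuming that is
		# the most recent one the author uploaded.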
		my ($author_file) = reverse sort glob "$homedir/author-*.json";
		my $author_json;
		if ($author_file) {
			eval { $author_json = from_json slurp($author_file) };
			if ($@) {
				ERROR("Failed to load '$author_file': $@");
			} else {
				$new_data{author_json} = basename $author_file;
			}
		}

		if ( not $have ) {
			LOG( "add_author $pauseid " . Dumper \%new_data );
			db->add_author( \%new_data, $pauseid );
		} elsif ($changed) {
			LOG( "update_author $pauseid " . Dumper \%new_data );
			db->update_author( \%new_data, $pauseid );
		}
		if ($author_json) {
			LOG( "updating author_json for $pauseid from $author_file by " . Dumper $author_json);
			db->update_author_json( $author_json, $pauseid );
		}
	}

	db->dbh->commit;

	LOG('whois finished');

	return;
}


1;