package Dist::Surveyor;
{
$Dist::Surveyor::VERSION = '0.001';
}
=head1 NAME
Dist::Surveyor - Survey installed modules and determine the specific distribution versions they came from
=head1 VERSION
version 0.001
=head1 SYNOPSIS
See L<dist_surveyor> for documentation.
=cut
use strict;
use warnings;
use version;
use autodie;
use Carp;
use Compress::Zlib;
use Config;
use CPAN::DistnameInfo;
use Data::Dumper::Concise;
use DBI qw(looks_like_number);
use Digest::SHA qw(sha1_base64);
use Fcntl qw(:DEFAULT :flock);
use File::Fetch;
use File::Basename;
use File::Find;
use File::Path;
use File::Slurp;
use File::Spec;
use File::Spec::Unix;
use Getopt::Long;
use List::Util qw(max sum);
use LWP::Simple;
use Memoize;
use MetaCPAN::API 0.32;
use DB_File;
use MLDBM qw(DB_File Storable);
use Module::CoreList;
use Module::Metadata;
use Storable qw(nfreeze);
use Try::Tiny;
use URI;
use constant PROGNAME => 'dist_surveyor';
use constant ON_WIN32 => $^O eq 'MSWin32';
use constant ON_VMS => $^O eq 'VMS';
GetOptions(
'match=s' => \my $opt_match,
'v|verbose!' => \my $opt_verbose,
'd|debug!' => \my $opt_debug,
# target perl version, re core modules
'perlver=s' => \my $opt_perlver,
# include old dists that have remnant/orphaned modules installed
'remnants!' => \my $opt_remnants,
# don't use a persistent cache
'uncached!' => \my $opt_uncached,
'makecpan=s' => \my $opt_makecpan,
# e.g., 'download_url author'
'output=s' => \(my $opt_output ||= 'url'),
# e.g., 'some-command --foo --file %s --authorid %s'
'format=s' => \my $opt_format,
) or exit 1;
$opt_verbose++ if $opt_debug;
$opt_perlver = version->parse($opt_perlver || $])->numify;
my $major_error_count = 0; # exit status
my $metacpan_size = 999; # don't make too large, hurts the server
my $metacpan_calls = 0;
my $metacpan_api ||= MetaCPAN::API->new(
ua_args => [ agent => $0 ],
);
# caching via persistent memoize
my $db_generation = 1; # XXX increment on incompatible change
my $memoize_file = PROGNAME."-$db_generation.db";
my %memoize_cache;
if (not $opt_uncached) {
# XXX no need for MLDBM now? Could just use DB_File
my $db = tie %memoize_cache => 'MLDBM', $memoize_file, O_CREAT|O_RDWR, 0640
or die "Unable to use persistent cache: $!";
# XXX this locking is flawed but good enough for my needs
# http://search.cpan.org/~pmqs/DB_File-1.824/DB_File.pm#HINTS_AND_TIPS
my $fd = $db->fd;
open(DB_FH, "+<&=$fd") || die "dup $!";
flock (DB_FH, LOCK_EX) || die "flock: $!";
}
my %memoize_subs = (
get_candidate_cpan_dist_releases => { generation => 1 },
get_module_versions_in_release => { generation => 1 },
);
for my $subname (keys %memoize_subs) {
my %memoize_args = %{$memoize_subs{$subname}};
my $generation = delete $memoize_args{generation} || 1;
$memoize_args{SCALAR_CACHE} = [ HASH => \%memoize_cache ];
$memoize_args{LIST_CACHE} = 'FAULT';
# TODO use faster normalizer for subs that don't get refs
# not needed because we don't pass refs
#$memoize_args{NORMALIZER} = sub { $Storable::canonical = 1; sha1_base64(nfreeze([ $subname, $generation, wantarray, @_ ])) }
memoize($subname, %memoize_args);
}
# for distros with names that don't match the principle module name
# yet the principle module version always matches the distro
# Used for perllocal.pod lookups
# # XXX should be automated lookup rather than hardcoded
my %distro_key_mod_names = (
'PathTools' => 'File::Spec',
'Template-Toolkit' => 'Template',
'TermReadKey' => 'Term::ReadKey',
'libwww-perl' => 'LWP',
'ack' => 'App::Ack',
);
sub main {
# give only top-level lib dir, the archlib will be added automatically
my @libdir = shift @ARGV or die "No perl lib directory specified\n";
die "$libdir[0] isn't a directory\n" unless -d $libdir[0];
my $archdir = "$libdir[0]/$Config{archname}";
if (-d $archdir) {
unshift @libdir, $archdir;
}
else {
warn "No $Config{archname} directory in $libdir[0].\n";
warn "This probably means you've given the wrong directory\n";
warn "(or that you're using the wrong perl build).\n";
}
my @installed_releases = determine_installed_releases(@libdir);
write_fields(\@installed_releases, $opt_format, [split ' ', $opt_output], \*STDOUT);
warn sprintf "Completed survey in %.1f minutes using %d metacpan calls.\n",
(time-$^T)/60, $metacpan_calls;
do_makecpan(@installed_releases)
if $opt_makecpan;
exit $major_error_count;
}
sub do_makecpan {
my (@installed_releases) = @_;
warn "Updating $opt_makecpan for ".@installed_releases." releases...\n";
mkpath("$opt_makecpan/modules");
my %pkg_ver_rel; # for 02packages
for my $ri (@installed_releases) {
# --- get the file
my $main_url = URI->new($ri->{download_url});
my $di = distname_info_from_url($main_url);
my $pathfile = "authors/id/".$di->pathname;
my $destfile = "$opt_makecpan/$pathfile";
mkpath(dirname($destfile));
my @urls = ($main_url);
for my $mirror ('http://backpan.perl.org') {
push @urls, "$mirror/$pathfile";
}
my $mirror_status;
for my $url (@urls) {
$mirror_status = eval { mirror($url, $destfile) };
last if not is_error($mirror_status||500);
}
if ($@ || is_error($mirror_status)) {
my $err = ($@ and chomp $@) ? $@ : $mirror_status;
my $msg = "Error $err mirroring $main_url";
if (-f $destfile) {
warn "$msg - using existing file\n";
}
else {
# better to keep going and add the packages to the index
# than abort at this stage due to network/mirror problems
# the user can drop the files in later
warn "$msg - continuing, ADD FILE MANUALLY!\n";
++$major_error_count;
}
}
else {
warn "$mirror_status $main_url\n" if $opt_verbose;
}
my $mods_in_rel = get_module_versions_in_release($ri->{author}, $ri->{name});
if (!keys %$mods_in_rel) { # XXX hack for common::sense
(my $dist_as_pkg = $ri->{distribution}) =~ s/-/::/g;
warn "$ri->{author}/$ri->{name} has no modules! Adding fake module $dist_as_pkg ".$di->version."\n";
$mods_in_rel->{$dist_as_pkg} = {
name => $dist_as_pkg,
version => $di->version,
version_obj => version->parse($di->version),
};
}
# --- accumulate package info for 02packages file
for my $pkg (sort keys %$mods_in_rel ) {
# pi => { name=>, version=>, version_obj=> }
my $pi = $mods_in_rel->{$pkg};
# for selecting which dist a package belongs to
# XXX should factor in authorization status
my $p_r_match_score = p_r_match_score($pkg, $ri);
if (my $pvr = $pkg_ver_rel{$pkg}) {
# already seen same package name in different distribution
if ($p_r_match_score < $pvr->{p_r_match_score}) {
warn "$pkg seen in $pvr->{ri}{name} so ignoring one in $ri->{name}\n";
next;
}
warn "$pkg seen in $pvr->{ri}{name} - now overridden by $ri->{name}\n";
}
my $line = _fmtmodule($pkg, $di->pathname, $pi->{version});
$pkg_ver_rel{$pkg} = { line => $line, pi => $pi, ri => $ri, p_r_match_score => $p_r_match_score };
}
}
# --- write 02packages file
my $pkg_lines = _readpkgs($opt_makecpan);
my %packages;
for my $line (@$pkg_lines, map { $_->{line} } values %pkg_ver_rel) {
my ($pkg) = split(/\s+/, $line, 2);
if ($packages{$pkg} and $packages{$pkg} ne $line) {
warn "Old $packages{$pkg}\nNew $line\n" if $opt_verbose;
}
$packages{$pkg} = $line;
};
_writepkgs($opt_makecpan, [ sort values %packages ] );
# --- write extra data files that may be useful XXX may change
# XXX these don't all (yet?) merge with existing data
my $survey_datadump_dir = "$opt_makecpan/".PROGNAME;
mkpath($survey_datadump_dir);
# Write list of token packages - each should match only one release.
# This makes it _much_ faster to do installs via cpanm because it
# can skip the modules it knows are installed (whereas using a list of
# distros it has to reinstall _all_ of them every time).
# XXX maybe add as a separate option: "--mainpkgs mainpkgs.lst"
my %dist_packages;
while ( my ($pkg, $line) = each %packages) {
my $distpath = (split /\s+/, $line)[2];
$dist_packages{$distpath}{$pkg}++;
}
my %token_package;
my %token_package_pri = ( # alter install order for some modules
'Module::Build' => 100, # should be near first
Moose => 50,
# install distros that use Module::Install late so their dependencies
# have already been resolved (else they try to fetch them directly,
# bypassing our cpanm --mirror-only goal)
'Olson::Abbreviations' => -90,
# distros with special needs
'Term::ReadKey' => -100, # tests hang if run in background
);
for my $distpath (sort keys %dist_packages) {
my $dp = $dist_packages{$distpath};
my $di = CPAN::DistnameInfo->new($distpath);
#warn Dumper([ $distpath, $di->dist, $di]);
(my $token_pkg = $di->dist) =~ s/-/::/g;
if (!$dp->{$token_pkg}) {
if (my $keypkg = $distro_key_mod_names{$di->dist}) {
$token_pkg = $keypkg;
}
else {
# XXX not good - may pick a dummy test package
$token_pkg = (grep { $_ } keys %$dp)[0] || $token_pkg;
warn "Picked $token_pkg as token package for ".$di->distvname."\n";
}
}
$token_package{$token_pkg} = $token_package_pri{$token_pkg} || 0;
}
my @main_pkgs = sort { $token_package{$b} <=> $token_package{$a} or $a cmp $b } keys %token_package;
open my $key_pkg_fh, ">", "$survey_datadump_dir/token_packages.txt";
print $key_pkg_fh "$_\n" for @main_pkgs;
close $key_pkg_fh;
# Write list of releases, like default stdout
open my $rel_fh, ">", "$survey_datadump_dir/releases.txt";
write_fields(\@installed_releases, undef, [qw(url)], $rel_fh);
close $rel_fh;
# dump the primary result data for additional info and debugging
my $gzwrite = gzopen("$survey_datadump_dir/_data_dump.perl.gz", 'wb')
or croak "Cannot open $survey_datadump_dir/_data_dump.perl.gz for writing: $gzerrno";
$gzwrite->gzwrite("[\n");
for my $ri (@installed_releases) {
$gzwrite->gzwrite(Dumper($ri));
$gzwrite->gzwrite(",");
}
$gzwrite->gzwrite("]\n");
$gzwrite->gzclose;
warn "$opt_makecpan updated.\n"
}
sub p_r_match_score {
my ($pkg_name, $ri) = @_;
my @p = split /\W/, $pkg_name;
my @r = split /\W/, $ri->{name};
for my $i (0..max(scalar @p, scalar @r)) {
return $i if not defined $p[$i]
or not defined $r[$i]
or $p[$i] ne $r[$i]
}
die; # unreached
}
sub write_fields {
my ($releases, $format, $fields, $fh) = @_;
$format ||= join("\t", ('%s') x @$fields);
$format .= "\n";
for my $release_data (@$releases) {
printf $fh $format, map {
exists $release_data->{$_} ? $release_data->{$_} : "?$_"
} @$fields;
}
}
sub determine_installed_releases {
my (@search_dirs) = @_;
warn "Searching @search_dirs\n" if $opt_verbose;
my %installed_mod_info;
warn "Finding modules in @search_dirs\n";
my ($installed_mod_files, $installed_meta) = find_installed_modules(@search_dirs);
# get the installed version of each installed module and related info
warn "Finding candidate releases for the ".keys(%$installed_mod_files)." installed modules\n";
foreach my $module ( sort keys %$installed_mod_files ) {
my $mod_file = $installed_mod_files->{$module};
if ($opt_match) {
if ($module !~ m/$opt_match/o) {
delete $installed_mod_files->{$module};
next;
}
}
module_progress_indicator($module) unless $opt_verbose;
my $mod_version = do {
# silence warnings about duplicate VERSION declarations
# eg Catalyst::Controller::DBIC::API* 2.002001
local $SIG{__WARN__} = sub { warn @_ if $_[0] !~ /already declared with version/ };
my $mm = Module::Metadata->new_from_file($mod_file);
$mm->version; # only one version for one package in file
};
$mod_version ||= 0; # XXX
my $mod_file_size = -s $mod_file;
# Eliminate modules that will be supplied by the target perl version
if ( my $cv = $Module::CoreList::version{ $opt_perlver }->{$module} ) {
$cv =~ s/ //g;
if (version->parse($cv) >= version->parse($mod_version)) {
warn "$module $mod_version is core in perl $opt_perlver (as v$cv) - skipped\n";
next;
}
}
my $mi = $installed_mod_info{$module} = {
file => $mod_file,
module => $module,
version => $mod_version,
version_obj => version->parse($mod_version),
size => $mod_file_size,
};
# ignore modules we know aren't indexed
next if $module =~ /^Moose::Meta::Method::Accessor::Native::/;
# XXX could also consider file mtime: releases newer than the mtime
# of the module file can't be the origin of that module file.
# (assuming clocks and file times haven't been messed with)
try {
my $ccdr = get_candidate_cpan_dist_releases($module, $mod_version, $mod_file_size);
if (not %$ccdr) {
$ccdr = get_candidate_cpan_dist_releases($module, $mod_version, 0);
if (%$ccdr) {
# probably either a local change/patch or installed direct from repo
# but with a version number that matches a release
warn "$module $mod_version on CPAN but with different file size (not $mod_file_size)\n"
if $mod_version or $opt_verbose;
$mi->{file_size_mismatch}++;
}
elsif ($ccdr = get_candidate_cpan_dist_releases_fallback($module, $mod_version) and %$ccdr) {
warn "$module $mod_version not on CPAN but assumed to be from @{[ sort keys %$ccdr ]}\n"
if $mod_version or $opt_verbose;
$mi->{cpan_dist_fallback}++;
}
else {
$mi->{version_not_on_cpan}++;
# Possibly:
# - a local change/patch or installed direct from repo
# with a version number that was never released.
# - a private module never released on cpan.
# - a build-time create module eg common/sense.pm.PL
warn "$module $mod_version not found on CPAN\n"
if $mi->{version} # no version implies uninteresting
or $opt_verbose;
# XXX could try finding the module with *any* version on cpan
# to help with later advice. ie could select as candidates
# the version above and the version below the number we have,
# and set a flag to inform later logic.
}
}
$mi->{candidate_cpan_dist_releases} = $ccdr if %$ccdr;
}
catch {
warn "Failed get_candidate_cpan_dist_releases($module, $mod_version, $mod_file_size): $_";
}
}
# Map modules to dists using the accumulated %installed_mod_info info
warn "*** Mapping modules to releases\n";
my %best_dist;
foreach my $mod ( sort keys %installed_mod_info ) {
my $mi = $installed_mod_info{$mod};
module_progress_indicator($mod) unless $opt_verbose;
# find best match among the cpan releases that included this module
my $ccdr = $installed_mod_info{$mod}{candidate_cpan_dist_releases}
or next; # no candidates, warned about above (for mods with a version)
my $best_dist_cache_key = join " ", sort keys %$ccdr;
our %best_dist_cache;
my $best = $best_dist_cache{$best_dist_cache_key}
||= pick_best_cpan_dist_release($ccdr, \%installed_mod_info);
my $note = "";
if (@$best > 1) { # try using perllocal.pod to narrow the options
# XXX TODO move this logic into the per-candidate-distro loop below
# it doesn't make much sense to be here at the per-module level
my @in_perllocal = grep {
my $distname = $_->{distribution};
my ($v, $dist_mod_name) = perllocal_distro_mod_version($distname, $installed_meta->{perllocalpod});
warn "$dist_mod_name in perllocal.pod: ".($v ? "YES" : "NO")."\n"
if $opt_debug;
$v;
} @$best;
if (@in_perllocal && @in_perllocal < @$best) {
$note = sprintf "narrowed from %d via perllocal", scalar @$best;
$best = \@in_perllocal;
}
}
if (@$best > 1 or $note) { # note the poor match for this module
# but not if there's no version (as that's common)
my $best_desc = join " or ", map { $_->{release} } @$best;
my $pct = sprintf "%.2f%%", $best->[0]{fraction_installed} * 100;
warn "$mod $mi->{version} odd best match: $best_desc $note ($best->[0]{fraction_installed})\n"
if $note or $opt_verbose or ($mi->{version} and $best->[0]{fraction_installed} < 0.3);
# if the module has no version and multiple best matches
# then it's unlikely make a useful contribution, so ignore it
# XXX there's a risk that we'd ignore all the modules of a release
# where none of the modules has a version, but that seems unlikely.
next if not $mi->{version};
}
for my $dist (@$best) {
# two level hash to make it easier to handle versions
my $di = $best_dist{ $dist->{distribution} }{ $dist->{release} } ||= { dist => $dist };
push @{ $di->{modules} }, $mi;
$di->{or}{$_->{release}}++ for grep { $_ != $dist } @$best;
}
}
warn "*** Refining releases\n";
# $best_dist{ Foo }{ Foo-1.23 }{ dist=>$dist_struct, modules=>, or=>{ Foo-1.22 => $dist_struct } }
my @installed_releases; # Dist-Name => { ... }
for my $distname ( sort keys %best_dist ) {
my $releases = $best_dist{$distname};
my @dist_by_version = sort {
$a->{dist}{version_obj} <=> $b->{dist}{version_obj} or
$a->{dist}{fraction_installed} <=> $b->{dist}{fraction_installed}
} values %$releases;
my @dist_by_fraction = sort {
$a->{dist}{fraction_installed} <=> $b->{dist}{fraction_installed} or
$a->{dist}{version_obj} <=> $b->{dist}{version_obj}
} values %$releases;
my @remnant_dists = @dist_by_version;
my $installed_dist = pop @remnant_dists;
# is the most recent candidate dist version also the one with the
# highest fraction_installed?
if ($dist_by_version[-1] == $dist_by_fraction[-1]) {
# this is the common case: we'll assume that's installed and the
# rest are remnants of earlier versions
}
elsif ($dist_by_fraction[-1]{dist}{fraction_installed} == 100) {
warn "Unsure which $distname is installed from among @{[ keys %$releases ]}\n";
@remnant_dists = @dist_by_fraction;
$installed_dist = pop @remnant_dists;
warn "Selecting the one that apprears to be 100% installed\n";
}
else {
# else grumble so the user knows to ponder the possibilities
warn "Can't determine which $distname is installed from among @{[ keys %$releases ]}\n";
warn Dumper([\@dist_by_version, \@dist_by_fraction]);
warn "\tSelecting based on latest version\n";
}
if (@remnant_dists or $opt_debug) {
warn "Distributions with remnants (chosen release is first):\n"
unless our $dist_with_remnants_warning++;
warn "@{[ map { $_->{dist}{release} } reverse @dist_by_fraction ]}\n";
for ($installed_dist, @remnant_dists) {
my $fi = $_->{dist}{fraction_installed};
my $modules = $_->{modules};
my $mv_desc = join(", ", map { "$_->{module} $_->{version}" } @$modules);
warn sprintf "\t%s\t%s%% installed: %s\n",
$_->{dist}{release},
$_->{dist}{percent_installed},
(@$modules > 4 ? "(".@$modules." modules)" : $mv_desc),
}
}
# note ordering: remnants first
for (($opt_remnants ? @remnant_dists : ()), $installed_dist) {
my ($author, $distribution, $release)
= @{$_->{dist}}{qw(author distribution release)};
$metacpan_calls++;
my $release_data = $metacpan_api->release( author => $author, release => $release );
if (!$release_data) {
warn "Can't find release details for $author/$release - SKIPPED!\n";
next; # XXX could fake some of $release_data instead
}
# shortcuts
(my $url = $release_data->{download_url}) =~ s{ .*? \b authors/ }{authors/}x;
push @installed_releases, {
#
%$release_data,
# extra items mushed inhandy shortcuts
url => $url,
# raw data structures
dist_data => $_->{dist},
};
}
#die Dumper(\@installed_releases);
}
# sorting into dependency order could be added later, maybe
return @installed_releases;
}
# pick_best_cpan_dist_release - memoized
# for each %$ccdr adds a fraction_installed based on %$installed_mod_info
# returns ref to array of %$ccdr values that have the max fraction_installed
sub pick_best_cpan_dist_release {
my ($ccdr, $installed_mod_info) = @_;
for my $release (sort keys %$ccdr) {
my $release_info = $ccdr->{$release};
$release_info->{fraction_installed}
= dist_fraction_installed($release_info->{author}, $release, $installed_mod_info);
$release_info->{percent_installed} # for informal use
= sprintf "%.2f", $release_info->{fraction_installed} * 100;
}
my $max_fraction_installed = max( map { $_->{fraction_installed} } values %$ccdr );
my @best = grep { $_->{fraction_installed} == $max_fraction_installed } values %$ccdr;
return \@best;
}
# returns a number from 0 to 1 representing the fraction of the modules
# in a particular release match the coresponding modules in %$installed_mod_info
sub dist_fraction_installed {
my ($author, $release, $installed_mod_info) = @_;
my $tag = "$author/$release";
my $mods_in_rel = get_module_versions_in_release($author, $release);
my $mods_in_rel_count = keys %$mods_in_rel;
my $mods_inst_count = sum( map {
my $mi = $installed_mod_info->{ $_->{name} };
# XXX we stash the version_obj into the mods_in_rel hash
# (though with little/no caching effect with current setup)
$_->{version_obj} ||= eval { version->parse($_->{version}) };
my $hit = ($mi && $mi->{version_obj} == $_->{version_obj}) ? 1 : 0;
# demote to a low-scoring partial match if the file size differs
# XXX this isn't good as the effect varies with the number of modules
$hit = 0.1 if $mi && $mi->{size} != $_->{size};
warn sprintf "%s %s %s %s: %s\n", $tag, $_->{name}, $_->{version_obj}, $_->{size},
($hit == 1) ? "matches"
: ($mi) ? "differs ($mi->{version_obj}, $mi->{size})"
: "not installed",
if $opt_debug;
$hit;
} values %$mods_in_rel) || 0;
my $fraction_installed = ($mods_in_rel_count) ? $mods_inst_count/$mods_in_rel_count : 0;
warn "$author/$release:\tfraction_installed $fraction_installed ($mods_inst_count/$mods_in_rel_count)\n"
if $opt_verbose or !$mods_in_rel_count;
return $fraction_installed;
}
sub get_candidate_cpan_dist_releases {
my ($module, $version, $file_size) = @_;
$version = 0 if not defined $version; # XXX
# timbunce: So, the current situation is that: version_numified is a float
# holding version->parse($raw_version)->numify, and version is a string
# also holding version->parse($raw_version)->numify at the moment, and
# that'll change to ->stringify at some point. Is that right now?
# mo: yes, I already patched the indexer, so new releases are already
# indexed ok, but for older ones I need to reindex cpan
my $v = (ref $version && $version->isa('version')) ? $version : version->parse($version);
my %v = map { $_ => 1 } "$version", $v->stringify, $v->numify;
my @version_qual;
push @version_qual, { term => { "file.module.version" => $_ } }
for keys %v;
push @version_qual, { term => { "file.module.version_numified" => $_ }}
for grep { looks_like_number($_) } keys %v;
my @and_quals = (
{"term" => {"file.module.name" => $module }},
(@version_qual > 1 ? { "or" => \@version_qual } : $version_qual[0]),
);
push @and_quals, {"term" => {"file.stat.size" => $file_size }}
if $file_size;
# XXX doesn't cope with odd cases like
# http://explorer.metacpan.org/?url=/module/MLEHMANN/common-sense-3.4/sense.pm.PL
$metacpan_calls++;
my $results = $metacpan_api->post("file", {
"size" => $metacpan_size,
"query" => { "filtered" => {
"filter" => {"and" => \@and_quals },
"query" => {"match_all" => {}},
}},
"fields" => [qw(release _parent author version version_numified file.module.version file.module.version_numified date stat.mtime distribution)]
});
my $hits = $results->{hits}{hits};
die "get_candidate_cpan_dist_releases($module, $version, $file_size): too many results (>$metacpan_size)"
if @$hits >= $metacpan_size;
warn "get_candidate_cpan_dist_releases($module, $version, $file_size): ".Dumper($results)
if grep { not $_->{fields}{release} } @$hits; # XXX temp, seen once but not since
# filter out perl-like releases
@$hits = grep {
$_->{fields}{release} !~ /^(perl|ponie|parrot|kurila|SiePerl-5.6.1-)/;
} @$hits;
for my $hit (@$hits) {
$hit->{release_id} = delete $hit->{_parent};
# add version_obj for convenience (will fail and be undef for releases like "0.08124-TRIAL")
$hit->{fields}{version_obj} = eval { version->parse($hit->{fields}{version}) };
}
# we'll return { "Dist-Name-Version" => { details }, ... }
my %dists = map { $_->{fields}{release} => $_->{fields} } @$hits;
warn "get_candidate_cpan_dist_releases($module, $version, $file_size): @{[ sort keys %dists ]}\n"
if $opt_verbose;
return \%dists;
}
sub get_candidate_cpan_dist_releases_fallback {
my ($module, $version) = @_;
# fallback to look for distro of the same name as the module
# for odd cases like
# http://explorer.metacpan.org/?url=/module/MLEHMANN/common-sense-3.4/sense.pm.PL
(my $distname = $module) =~ s/::/-/g;
# timbunce: So, the current situation is that: version_numified is a float
# holding version->parse($raw_version)->numify, and version is a string
# also holding version->parse($raw_version)->numify at the moment, and
# that'll change to ->stringify at some point. Is that right now?
# mo: yes, I already patched the indexer, so new releases are already
# indexed ok, but for older ones I need to reindex cpan
my $v = (ref $version && $version->isa('version')) ? $version : version->parse($version);
my %v = map { $_ => 1 } "$version", $v->stringify, $v->numify;
my @version_qual;
push @version_qual, { term => { "version" => $_ } }
for keys %v;
push @version_qual, { term => { "version_numified" => $_ }}
for grep { looks_like_number($_) } keys %v;
my @and_quals = (
{"term" => {"distribution" => $distname }},
(@version_qual > 1 ? { "or" => \@version_qual } : $version_qual[0]),
);
# XXX doesn't cope with odd cases like
$metacpan_calls++;
my $results = $metacpan_api->post("file", {
"size" => $metacpan_size,
"query" => { "filtered" => {
"filter" => {"and" => \@and_quals },
"query" => {"match_all" => {}},
}},
"fields" => [qw(release _parent author version version_numified file.module.version file.module.version_numified date stat.mtime distribution)]
});
my $hits = $results->{hits}{hits};
die "get_candidate_cpan_dist_releases_fallback($module, $version): too many results (>$metacpan_size)"
if @$hits >= $metacpan_size;
warn "get_candidate_cpan_dist_releases_fallback($module, $version): ".Dumper($results)
if grep { not $_->{fields}{release} } @$hits; # XXX temp, seen once but not since
# filter out perl-like releases
@$hits = grep {
$_->{fields}{release} !~ /^(perl|ponie|parrot|kurila|SiePerl-5.6.1-)/;
} @$hits;
for my $hit (@$hits) {
$hit->{release_id} = delete $hit->{_parent};
# add version_obj for convenience (will fail and be undef for releases like "0.08124-TRIAL")
$hit->{fields}{version_obj} = eval { version->parse($hit->{fields}{version}) };
}
# we'll return { "Dist-Name-Version" => { details }, ... }
my %dists = map { $_->{fields}{release} => $_->{fields} } @$hits;
warn "get_candidate_cpan_dist_releases_fallback($module, $version): @{[ sort keys %dists ]}\n"
if $opt_verbose;
return \%dists;
}
# this can be called for all sorts of releases that are only vague possibilities
# and aren't actually installed, so generally it's quiet
sub get_module_versions_in_release {
my ($author, $release) = @_;
$metacpan_calls++;
my $results = eval { $metacpan_api->post("file", {
"size" => $metacpan_size,
"query" => { "filtered" => {
"filter" => {"and" => [
{"term" => {"release" => $release }},
{"term" => {"author" => $author }},
{"term" => {"mime" => "text/x-script.perl-module"}},
]},
"query" => {"match_all" => {}},
}},
"fields" => ["path","name","_source.module", "_source.stat.size"],
}) };
if (not $results) {
warn "Failed get_module_versions_in_release for $author/$release: $@";
return {};
}
my $hits = $results->{hits}{hits};
die "get_module_versions_in_release($author, $release): too many results"
if @$hits >= $metacpan_size;
my %modules_in_release;
for my $hit (@$hits) {
my $path = $hit->{fields}{path};
# XXX try to ignore files that won't get installed
# XXX should use META noindex!
if ($path =~ m!^(?:t|xt|tests?|inc|samples?|ex|examples?|bak)\b!) {
warn "$author/$release: ignored non-installed module $path\n"
if $opt_debug;
next;
}
my $size = $hit->{fields}{"_source.stat.size"};
# files can contain more than one package ('module')
my $rel_mods = $hit->{fields}{"_source.module"} || [];
for my $mod (@$rel_mods) { # actually packages in the file
# Some files may contain multiple packages. We want to ignore
# all except the one that matches the name of the file.
# We use a fairly loose (but still very effective) test because we
# can't rely on $path including the full package name.
(my $filebasename = $hit->{fields}{name}) =~ s/\.pm$//;
if ($mod->{name} !~ m/\b$filebasename$/) {
warn "$author/$release: ignored $mod->{name} in $path\n"
if $opt_debug;
next;
}
# warn if package previously seen in this release
# with a different version or file size
if (my $prev = $modules_in_release{$mod->{name}}) {
my $version_obj = eval { version->parse($mod->{version}) };
die "$author/$release: $mod $mod->{version}: $@" if $@;
if ($opt_verbose) {
# XXX could add a show-only-once cache here
my $msg = "$mod->{name} $mod->{version} ($size) seen in $path after $prev->{path} $prev->{version} ($prev->{size})";
warn "$release: $msg\n"
if ($version_obj != version->parse($prev->{version}) or $size != $prev->{size});
}
}
# keep result small as Storable thawing this is major runtime cost
# (specifically we avoid storing a version_obj here)
$modules_in_release{$mod->{name}} = {
name => $mod->{name},
path => $path,
version => $mod->{version},
size => $size,
};
}
}
warn "\n$author/$release contains: @{[ map { qq($_->{name} $_->{version}) } values %modules_in_release ]}\n"
if $opt_debug;
return \%modules_in_release;
}
sub get_file_mtime {
my ($file) = @_;
# try to find the time the file was 'installed'
# by looking for the commit date in svn or git
# else fallback to the file modification time
return (stat($file))[9];
}
sub find_installed_modules {
my (@dirs) = @_;
### File::Find uses follow_skip => 1 by default, which doesn't die
### on duplicates, unless they are directories or symlinks.
### Ticket #29796 shows this code dying on Alien::WxWidgets,
### which uses symlinks.
### File::Find doc says to use follow_skip => 2 to ignore duplicates
### so this will stop it from dying.
my %find_args = ( follow_skip => 2 );
### File::Find uses lstat, which quietly becomes stat on win32
### it then uses -l _ which is not allowed by the statbuffer because
### you did a stat, not an lstat (duh!). so don't tell win32 to
### follow symlinks, as that will break badly
# XXX disabled because we want the postprocess hook to work
#$find_args{'follow_fast'} = 1 unless ON_WIN32;
### never use the @INC hooks to find installed versions of
### modules -- they're just there in case they're not on the
### perl install, but the user shouldn't trust them for *other*
### modules!
### XXX CPANPLUS::inc is now obsolete, remove the calls
#local @INC = CPANPLUS::inc->original_inc;
# sort @dirs to put longest first to make it easy to handle
# elements that are within other elements (e.g., an archdir)
my @dirs_ordered = sort { length $b <=> length $a } @dirs;
my %seen_mod;
my %dir_done;
my %meta; # return metadata about the search
for my $dir (@dirs_ordered) {
next if $dir eq '.';
### not a directory after all
### may be coderef or some such
next unless -d $dir;
### make sure to clean up the directories just in case,
### as we're making assumptions about the length
### This solves rt.cpan issue #19738
### John M. notes: On VMS cannonpath can not currently handle
### the $dir values that are in UNIX format.
$dir = File::Spec->canonpath($dir) unless ON_VMS;
### have to use F::S::Unix on VMS, or things will break
my $file_spec = ON_VMS ? 'File::Spec::Unix' : 'File::Spec';
### XXX in some cases File::Find can actually die!
### so be safe and wrap it in an eval.
eval {
File::Find::find(
{ %find_args,
postprocess => sub {
$dir_done{$File::Find::dir}++;
},
wanted => sub {
unless (/\.pm$/i) {
# skip all dot-dirs (eg .git .svn)
$File::Find::prune = 1
if -d $File::Find::name and /^\.\w/;
# don't reenter a dir we've already done
$File::Find::prune = 1
if $dir_done{$File::Find::name};
# remember perllocal.pod if we see it
push @{$meta{perllocalpod}}, $File::Find::name
if $_ eq 'perllocal.pod';
return;
}
my $mod = $File::Find::name;
### make sure it's in Unix format, as it
### may be in VMS format on VMS;
$mod = VMS::Filespec::unixify($mod) if ON_VMS;
$mod = substr( $mod, length($dir) + 1, -3 );
$mod = join '::', $file_spec->splitdir($mod);
return if $seen_mod{$mod};
$seen_mod{$mod} = $File::Find::name;
### ignore files that don't contain a matching package declaration
### warn about those that do contain some kind of package declaration
#my $content = read_file($File::Find::name);
#unless ( $content =~ m/^ \s* package \s+ (\#.*\n\s*)? $mod \b/xm ) {
#warn "No 'package $mod' seen in $File::Find::name\n"
#if $opt_verbose && $content =~ /\b package \b/x;
#return;
#}
},
},
$dir
);
1;
}
or die "File::Find died: $@";
}
return (\%seen_mod, \%meta);
}
sub perllocal_distro_mod_version {
my ($distname, $perllocalpod) = @_;
( my $dist_mod_name = $distname ) =~ s/-/::/g;
my $key_mod_name = $distro_key_mod_names{$distname} || $dist_mod_name;
our $perllocal_distro_mod_version;
if (not $perllocal_distro_mod_version) { # initial setup
warn "Only first perllocal.pod file will be processed: @$perllocalpod\n"
if @$perllocalpod > 1;
$perllocal_distro_mod_version = {};
# extract data from perllocal.pod
if (my $plp = shift @$perllocalpod) {
# The VERSION isn't always the same as that in the distro file
if (eval { require ExtUtils::Perllocal::Parser }) {
my $p = ExtUtils::Perllocal::Parser->new;
$perllocal_distro_mod_version = { map {
$_->name => $_->{data}{VERSION}
} $p->parse_from_file($plp) };
warn "Details of ".keys(%$perllocal_distro_mod_version)." distributions found in $plp\n";
}
else {
warn "Wanted to use perllocal.pod but can't because ExtUtils::Perllocal::Parser isn't available\n";
}
}
else {
warn "No perllocal.pod found to aid disambiguation\n";
}
}
return $perllocal_distro_mod_version->{$key_mod_name};
}
sub module_progress_indicator {
my ($module) = @_;
my $crnt = (split /::/, $module)[0];
our $last ||= '';
if ($last ne $crnt) {
warn "\t$crnt...\n";
$last = $crnt;
}
}
# copied from CPAN::Mini::Inject and hacked
sub _readpkgs {
my ($cpandir) = @_;
my $packages_file = $cpandir.'/modules/02packages.details.txt.gz';
return [] if not -f $packages_file;
my $gzread = gzopen($packages_file, 'rb')
or croak "Cannot open $packages_file: $gzerrno\n";
my $inheader = 1;
my @packages;
my $package;
while ( $gzread->gzreadline( $package ) ) {
if ( $inheader ) {
$inheader = 0 unless $package =~ /\S/;
next;
}
chomp $package;
push @packages, $package;
}
$gzread->gzclose;
return \@packages;
}
sub _writepkgs {
my ($cpandir, $pkgs) = @_;
my $packages_file = $cpandir.'/modules/02packages.details.txt.gz';
my $gzwrite = gzopen($packages_file, 'wb')
or croak "Cannot open $packages_file for writing: $gzerrno";
$gzwrite->gzwrite( "File: 02packages.details.txt\n" );
$gzwrite->gzwrite(
"URL: http://www.perl.com/CPAN/modules/02packages.details.txt\n"
);
$gzwrite->gzwrite(
'Description: Package names found in directory $CPAN/authors/id/'
. "\n" );
$gzwrite->gzwrite( "Columns: package name, version, path\n" );
$gzwrite->gzwrite(
"Intended-For: Automated fetch routines, namespace documentation.\n"
);
$gzwrite->gzwrite( "Written-By: $0 0.001\n" ); # XXX TODO
$gzwrite->gzwrite( "Line-Count: " . scalar( @$pkgs ) . "\n" );
# Last-Updated: Sat, 19 Mar 2005 19:49:10 GMT
my @date = split( /\s+/, scalar( gmtime ) );
$gzwrite->gzwrite( "Last-Updated: $date[0], $date[2] $date[1] $date[4] $date[3] GMT\n\n" );
$gzwrite->gzwrite( "$_\n" ) for ( @$pkgs );
$gzwrite->gzclose;
}
sub _fmtmodule {
my ( $module, $file, $version ) = @_;
$version = "undef" if not defined $version;
my $fw = 38 - length $version;
$fw = length $module if $fw < length $module;
return sprintf "%-${fw}s %s %s", $module, $version, $file;
}
sub first_word {
my $string = shift;
return ($string =~ m/^(\w+)/) ? $1 : $string;
}
sub distname_info_from_url {
my ($url) = @_;
$url =~ s{.* \b authors/id/ }{}x
or warn "No authors/ in '$url'\n";
my $di = CPAN::DistnameInfo->new($url);
return $di;
}