The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Dist::Surveyor;
$Dist::Surveyor::VERSION = '0.016';
=head1 NAME

Dist::Surveyor - Survey installed modules and determine the specific distribution versions they came from

=head1 VERSION

version 0.016

=head1 SYNOPSIS

    my $options = {
        opt_match => $opt_match,
        opt_perlver => $opt_perlver,
        opt_remnants => $opt_remnants,
        distro_key_mod_names => $distro_key_mod_names,
    };
    my @installed_releases = determine_installed_releases($options, \@libdirs);

=head1 DESCRIPTION

Surveys your huge ball of Perl modules, jammed together inside a directory,
and tells you exactly which module is installed there.

For quick start, and a fine example of this module usage, see L<dist_surveyor>.

This module have one exported function - determine_installed_releases

=cut

use strict;
use warnings;

use version;
use Carp; # core
use Data::Dumper; # core
use File::Find;  # core
use File::Spec; # core
use List::Util qw(max sum); # core
use Dist::Surveyor::Inquiry; # internal
use Module::CoreList;
use Module::Metadata;

use constant ON_WIN32 => $^O eq 'MSWin32';
use constant ON_VMS   => $^O eq 'VMS';

if (ON_VMS) {
    require File::Spec::Unix;
}

our ($DEBUG, $VERBOSE);
*DEBUG = \$::DEBUG;
*VERBOSE = \$::VERBOSE;

require Exporter;
our @ISA = qw{Exporter};
our @EXPORT = qw{determine_installed_releases};

=head1 determine_installed_releases($options, $search_dirs)

$options includes:

=over

=item opt_match

A regex qr//. If exists, will ignore modules that doesn't match this regex

=item opt_perlver

Skip modules that are included as core in this Perl version

=item opt_remnants

If true, output will include old distribution versions that have left old modules behind

=item distro_key_mod_names

A hash-ref, with a list of irregular named releases. i.e. 'libwww-perl' => 'LWP'.

=back

$search_dirs is an array-ref containing the list of directories to survey.

Returns a list, where each element is a hashref representing one installed distibution.
This hashref is what MetaCPAN returns for http://api.metacpan.org/v0/release/$author/$release,
with two additional keys: 

=over

=item *

'url' - that same as 'download_url', but without the hostname. can be used to
download the file for your favorite mirror

=item *

'dist_data' - Hashref containing info about the release, i.e. percent_installed.
(fully installed releases will have '100.00')

=back

=cut

sub determine_installed_releases {
    my ($options, $search_dirs) = @_;
    $options->{opt_perlver} ||= version->parse( $] )->numify;

    my %installed_mod_info;

    warn "Finding modules in @$search_dirs\n";
    my ($installed_mod_files, $installed_meta) = find_installed_modules(@$search_dirs);

    # get the installed version of each installed module and related info
    warn "Finding candidate releases for the ".keys(%$installed_mod_files)." installed modules\n";
    foreach my $module ( sort keys %$installed_mod_files ) {
        my $mod_file = $installed_mod_files->{$module};

        if (my $opt_match = $options->{opt_match}) {
            if ($module !~ m/$opt_match/o) {
                delete $installed_mod_files->{$module};
                next;
            }
        }

        module_progress_indicator($module) unless $VERBOSE;
        my $mi = get_installed_mod_info($options, $module, $mod_file);
        $installed_mod_info{$module} = $mi if $mi;
    }


    # Map modules to dists using the accumulated %installed_mod_info info

    warn "*** Mapping modules to releases\n";

    my %best_dist;
    foreach my $mod ( sort keys %installed_mod_info ) {
        my $mi = $installed_mod_info{$mod};

        module_progress_indicator($mod) unless $VERBOSE;

        # find best match among the cpan releases that included this module
        my $ccdr = $installed_mod_info{$mod}{candidate_cpan_dist_releases}
            or next; # no candidates, warned about above (for mods with a version)

        my $best_dist_cache_key = join " ", sort keys %$ccdr;
        our %best_dist_cache;
        my $best = $best_dist_cache{$best_dist_cache_key}
            ||= pick_best_cpan_dist_release($ccdr, \%installed_mod_info);

        my $note = "";
        if ((@$best > 1) and $installed_meta->{perllocalpod}) { 
            # try using perllocal.pod to narrow the options, if there is one
            # XXX TODO move this logic into the per-candidate-distro loop below
            # it doesn't make much sense to be here at the per-module level
            my @in_perllocal = grep {
                my $distname = $_->{distribution};
                my ($v, $dist_mod_name) = perllocal_distro_mod_version(
                    $options->{distro_key_mod_names}, $distname, $installed_meta->{perllocalpod});
                warn "$dist_mod_name in perllocal.pod: ".($v ? "YES" : "NO")."\n"
                    if $DEBUG;
                $v;
            } @$best;
            if (@in_perllocal && @in_perllocal < @$best) {
                $note = sprintf "narrowed from %d via perllocal", scalar @$best;
                $best = \@in_perllocal;
            }
        }

        if (@$best > 1 or $note) { # note the poor match for this module
            # but not if there's no version (as that's common)
            my $best_desc = join " or ", map { $_->{release} } @$best;
            my $pct = sprintf "%.2f%%", $best->[0]{fraction_installed} * 100;
            warn "$mod $mi->{version} odd best match: $best_desc $note ($best->[0]{fraction_installed})\n"
                if $note or $VERBOSE or ($mi->{version} and $best->[0]{fraction_installed} < 0.3);
            # if the module has no version and multiple best matches
            # then it's unlikely make a useful contribution, so ignore it
            # XXX there's a risk that we'd ignore all the modules of a release
            # where none of the modules has a version, but that seems unlikely.
            next if not $mi->{version};
        }

        for my $dist (@$best) {
            # two level hash to make it easier to handle versions
            my $di = $best_dist{ $dist->{distribution} }{ $dist->{release} } ||= { dist => $dist };
            push @{ $di->{modules} }, $mi;
            $di->{or}{$_->{release}}++ for grep { $_ != $dist } @$best;
        }

    }

    warn "*** Refining releases\n";

    # $best_dist{ Foo }{ Foo-1.23 }{ dist=>$dist_struct, modules=>, or=>{ Foo-1.22 => $dist_struct } }

    my @installed_releases;    # Dist-Name => { ... }

    for my $distname ( sort keys %best_dist ) {
        my $releases = $best_dist{$distname};
        push @installed_releases, refine_releases($options, $distname, $releases);
    }

    # sorting into dependency order could be added later, maybe

    return @installed_releases;
}

sub refine_releases {
    my ($options, $distname, $releases) = @_;

    my @dist_by_version  = sort {
        $a->{dist}{version_obj}        <=> $b->{dist}{version_obj} or
        $a->{dist}{fraction_installed} <=> $b->{dist}{fraction_installed}
    } values %$releases;
    my @dist_by_fraction = sort {
        $a->{dist}{fraction_installed} <=> $b->{dist}{fraction_installed} or
        $a->{dist}{version_obj}        <=> $b->{dist}{version_obj}
    } values %$releases;
    
    my @remnant_dists  = @dist_by_version;
    my $installed_dist = pop @remnant_dists;

    # is the most recent candidate dist version also the one with the
    # highest fraction_installed?
    if ($dist_by_version[-1] == $dist_by_fraction[-1]) {
        # this is the common case: we'll assume that's installed and the
        # rest are remnants of earlier versions
    }
    elsif ($dist_by_fraction[-1]{dist}{fraction_installed} == 100) {
        warn "Unsure which $distname is installed from among @{[ keys %$releases ]}\n";
        @remnant_dists  = @dist_by_fraction;
        $installed_dist = pop @remnant_dists;
        warn "Selecting the one that apprears to be 100% installed\n";
    }
    else {
        # else grumble so the user knows to ponder the possibilities
        warn "Can't determine which $distname is installed from among @{[ keys %$releases ]}\n";
        warn Dumper([\@dist_by_version, \@dist_by_fraction]);
        warn "\tSelecting based on latest version\n";
    }

    if (@remnant_dists or $DEBUG) {
        warn "Distributions with remnants (chosen release is first):\n"
            unless our $dist_with_remnants_warning++;
        warn "@{[ map { $_->{dist}{release} } reverse @dist_by_fraction ]}\n"; 
        for ($installed_dist, @remnant_dists) {
            my $fi = $_->{dist}{fraction_installed};
            my $modules = $_->{modules};
            my $mv_desc = join(", ", map { "$_->{module} $_->{version}" } @$modules);
            warn sprintf "\t%s\t%s%% installed: %s\n",
                $_->{dist}{release},
                $_->{dist}{percent_installed},
                (@$modules > 4 ? "(".@$modules." modules)" : $mv_desc),
        }
    }

    my @installed_releases;
    # note ordering: remnants first
    for (($options->{opt_remnants} ? @remnant_dists : ()), $installed_dist) {
        my ($author, $release)
            = @{$_->{dist}}{qw(author release)};

        my $release_data = get_release_info($author, $release);
        next unless $release_data;
        
        # shortcuts
        (my $url = $release_data->{download_url}) =~ s{ .*? \b authors/ }{authors/}x;

        push @installed_releases, {
            # 
            %$release_data,
            # extra items mushed inhandy shortcuts
            url => $url,
            # raw data structures
            dist_data => $_->{dist},
        };
    }
    #die Dumper(\@installed_releases);
    return @installed_releases;
}

# for each installed module, get the list of releases that it exists in
# Parameters:
#   $options - uses only opt_perlver
#   $module - module name (i.e. 'Dist::Surveyor')
#   $mod_file - the location of this module on the filesystem
# Return:
#   undef if this module should be skipped
#   otherwise, a hashref containing:
#       file => $mod_file,
#       module => $module,
#       version => $mod_version,
#       version_obj => same as version, but as an object,
#       size => $mod_file_size,
#       # optional flags:
#       file_size_mismatch => 1,
#       cpan_dist_fallback => 1, # could not find this module/version on cpan,
#           # but found a release with that version, containing such module
#       version_not_on_cpan> 1, # can not find this file on CPAN.
#       # releases info
#       candidate_cpan_dist_releases => hashref,
#
#   candidate_cpan_dist_releases hashref contain a map of all the releases
#   that this module exists in. see get_candidate_cpan_dist_releases for more
#   info.
sub get_installed_mod_info {
    my ($options, $module, $mod_file) = @_;

    my $mod_version = do {
        # silence warnings about duplicate VERSION declarations
        # eg Catalyst::Controller::DBIC::API* 2.002001
        local $SIG{__WARN__} = sub { warn @_ if $_[0] !~ /already declared with version/ };
        my $mm = Module::Metadata->new_from_file($mod_file);
        $mm->version; # only one version for one package in file
    };
    $mod_version ||= 0; # XXX
    my $mod_file_size = -s $mod_file;

    # Eliminate modules that will be supplied by the target perl version
    if ( my $cv = $Module::CoreList::version{ $options->{opt_perlver} }->{$module} ) {
        $cv =~ s/ //g;
        if (version->parse($cv) >= version->parse($mod_version)) {
            warn "$module is core in perl $options->{opt_perlver} (lib: $mod_version, core: $cv) - skipped\n";
            return;
        }
    }

    my $mi = {
        file => $mod_file,
        module => $module,
        version => $mod_version,
        version_obj => version->parse($mod_version),
        size => $mod_file_size,
    };

    # ignore modules we know aren't indexed
    return $mi if $module =~ /^Moose::Meta::Method::Accessor::Native::/;

    # XXX could also consider file mtime: releases newer than the mtime
    # of the module file can't be the origin of that module file.
    # (assuming clocks and file times haven't been messed with)

    eval {
        my $ccdr = get_candidate_cpan_dist_releases($module, $mod_version, $mod_file_size);
        if (not %$ccdr) {
            $ccdr = get_candidate_cpan_dist_releases($module, $mod_version, 0);
            if (%$ccdr) {
                # probably either a local change/patch or installed direct from repo
                # but with a version number that matches a release
                warn "$module $mod_version on CPAN but with different file size (not $mod_file_size)\n"
                    if $mod_version or $VERBOSE;
                $mi->{file_size_mismatch}++;
            }
            elsif ($ccdr = get_candidate_cpan_dist_releases_fallback($module, $mod_version) and %$ccdr) {
                warn "$module $mod_version not on CPAN but assumed to be from @{[ sort keys %$ccdr ]}\n"
                    if $mod_version or $VERBOSE;
                $mi->{cpan_dist_fallback}++;
            }
            else {
                $mi->{version_not_on_cpan}++;
                # Possibly:
                # - a local change/patch or installed direct from repo
                #   with a version number that was never released.
                # - a private module never released on cpan.
                # - a build-time create module eg common/sense.pm.PL
                warn "$module $mod_version not found on CPAN\n"
                    if $mi->{version} # no version implies uninteresting
                    or $VERBOSE;
                # XXX could try finding the module with *any* version on cpan
                # to help with later advice. ie could select as candidates
                # the version above and the version below the number we have,
                # and set a flag to inform later logic.
            }
        }
        $mi->{candidate_cpan_dist_releases} = $ccdr if %$ccdr;
    };
    if ($@) {
        warn "Failed get_candidate_cpan_dist_releases($module, $mod_version, $mod_file_size): $@";
    }
    return $mi;
}

# pick_best_cpan_dist_release - memoized
# for each %$ccdr adds a fraction_installed based on %$installed_mod_info
# returns ref to array of %$ccdr values that have the max fraction_installed

sub pick_best_cpan_dist_release {
    my ($ccdr, $installed_mod_info) = @_;

    for my $release (sort keys %$ccdr) {
        my $release_info = $ccdr->{$release};
        $release_info->{fraction_installed}
            = dist_fraction_installed($release_info->{author}, $release, $installed_mod_info);
        $release_info->{percent_installed} # for informal use
            = sprintf "%.2f", $release_info->{fraction_installed} * 100;
    }

    my $max_fraction_installed = max( map { $_->{fraction_installed} } values %$ccdr );
    my @best = grep { $_->{fraction_installed} == $max_fraction_installed } values %$ccdr;

    return \@best;
}


# returns a number from 0 to 1 representing the fraction of the modules
# in a particular release match the coresponding modules in %$installed_mod_info
sub dist_fraction_installed {
    my ($author, $release, $installed_mod_info) = @_;

    my $tag = "$author/$release";
    my $mods_in_rel = get_module_versions_in_release($author, $release);
    my $mods_in_rel_count = keys %$mods_in_rel;
    my $mods_inst_count = sum( map {
        my $mi = $installed_mod_info->{ $_->{name} };
        # XXX we stash the version_obj into the mods_in_rel hash
        # (though with little/no caching effect with current setup)
        $_->{version_obj} ||= eval { version->parse($_->{version}) };
        my $hit = ($mi && $mi->{version_obj} == $_->{version_obj}) ? 1 : 0;
        # demote to a low-scoring partial match if the file size differs
        # XXX this isn't good as the effect varies with the number of modules
        $hit = 0.1 if $mi && $mi->{size} != $_->{size};
        warn sprintf "%s %s %s %s: %s\n", $tag, $_->{name}, $_->{version_obj}, $_->{size},
                ($hit == 1) ? "matches"
                    : ($mi) ? "differs ($mi->{version_obj}, $mi->{size})"
                    : "not installed",
            if $DEBUG;
        $hit;
    } values %$mods_in_rel) || 0;

    my $fraction_installed = ($mods_in_rel_count) ? $mods_inst_count/$mods_in_rel_count : 0;
    warn "$author/$release:\tfraction_installed $fraction_installed ($mods_inst_count/$mods_in_rel_count)\n"
        if $VERBOSE or !$mods_in_rel_count;

    return $fraction_installed;
}

sub get_file_mtime {
    my ($file) = @_;
    # try to find the time the file was 'installed'
    # by looking for the commit date in svn or git
    # else fallback to the file modification time
    return (stat($file))[9];
}


sub find_installed_modules {
    my (@dirs) = @_;

    ### File::Find uses follow_skip => 1 by default, which doesn't die
    ### on duplicates, unless they are directories or symlinks.
    ### Ticket #29796 shows this code dying on Alien::WxWidgets,
    ### which uses symlinks.
    ### File::Find doc says to use follow_skip => 2 to ignore duplicates
    ### so this will stop it from dying.
    my %find_args = ( follow_skip => 2 );

    ### File::Find uses lstat, which quietly becomes stat on win32
    ### it then uses -l _ which is not allowed by the statbuffer because
    ### you did a stat, not an lstat (duh!). so don't tell win32 to
    ### follow symlinks, as that will break badly
    # XXX disabled because we want the postprocess hook to work
    #$find_args{'follow_fast'} = 1 unless ON_WIN32;

    ### never use the @INC hooks to find installed versions of
    ### modules -- they're just there in case they're not on the
    ### perl install, but the user shouldn't trust them for *other*
    ### modules!
    ### XXX CPANPLUS::inc is now obsolete, remove the calls
    #local @INC = CPANPLUS::inc->original_inc;

    # sort @dirs to put longest first to make it easy to handle
    # elements that are within other elements (e.g., an archdir)
    my @dirs_ordered = sort { length $b <=> length $a } @dirs;

    my %seen_mod;
    my %dir_done;
    my %meta; # return metadata about the search
    for my $dir (@dirs_ordered) {
        next if $dir eq '.';

        ### not a directory after all
        ### may be coderef or some such
        next unless -d $dir;

        ### make sure to clean up the directories just in case,
        ### as we're making assumptions about the length
        ### This solves rt.cpan issue #19738

        ### John M. notes: On VMS cannonpath can not currently handle
        ### the $dir values that are in UNIX format.
        $dir = File::Spec->canonpath($dir) unless ON_VMS;

        ### have to use F::S::Unix on VMS, or things will break
        my $file_spec = ON_VMS ? 'File::Spec::Unix' : 'File::Spec';

        ### XXX in some cases File::Find can actually die!
        ### so be safe and wrap it in an eval.
        eval {
            File::Find::find(
                {   %find_args,
                    postprocess => sub {
                        $dir_done{$File::Find::dir}++;
                    },
                    wanted => sub {

                        unless (/\.pm$/i) {
                            # skip all dot-dirs (eg .git .svn)
                            $File::Find::prune = 1
                                if -d $File::Find::name and /^\.\w/;
                            # don't reenter a dir we've already done
                            $File::Find::prune = 1
                                if $dir_done{$File::Find::name};
                            # remember perllocal.pod if we see it
                            push @{$meta{perllocalpod}}, $File::Find::name
                                if $_ eq 'perllocal.pod';
                            return;
                        }
                        my $mod = $File::Find::name;

                        ### make sure it's in Unix format, as it
                        ### may be in VMS format on VMS;
                        $mod = VMS::Filespec::unixify($mod) if ON_VMS;

                        $mod = substr( $mod, length($dir) + 1, -3 );
                        $mod = join '::', $file_spec->splitdir($mod);

                        return if $seen_mod{$mod};
                        $seen_mod{$mod} = $File::Find::name;

                        ### ignore files that don't contain a matching package declaration
                        ### warn about those that do contain some kind of package declaration
                        #use File::Slurp;
                        #my $content = read_file($File::Find::name);
                        #unless ( $content =~ m/^ \s* package \s+ (\#.*\n\s*)? $mod \b/xm ) {
                        #warn "No 'package $mod' seen in $File::Find::name\n"
                        #if $VERBOSE && $content =~ /\b package \b/x;
                        #return;
                        #}

                    },
                },
                $dir
            );
            1;
        }
            or die "File::Find died: $@";

    }

    return (\%seen_mod, \%meta);
}


sub perllocal_distro_mod_version {
    my ($distro_key_mod_names, $distname, $perllocalpod) = @_;

    ( my $dist_mod_name = $distname ) =~ s/-/::/g;
    my $key_mod_name = $distro_key_mod_names->{$distname} || $dist_mod_name;

    our $perllocal_distro_mod_version;
    if (not $perllocal_distro_mod_version) { # initial setup
        warn "Only first perllocal.pod file will be processed: @$perllocalpod\n"
            if ref $perllocalpod eq 'ARRAY' and @$perllocalpod > 1;

        $perllocal_distro_mod_version = {};
        # extract data from perllocal.pod
        if (my $plp = shift @$perllocalpod) {
            # The VERSION isn't always the same as that in the distro file
            if (eval { require ExtUtils::Perllocal::Parser }) {
                my $p = ExtUtils::Perllocal::Parser->new;
                $perllocal_distro_mod_version = { map {
                    $_->name => $_->{data}{VERSION}
                } $p->parse_from_file($plp) };
                warn "Details of ".keys(%$perllocal_distro_mod_version)." distributions found in $plp\n";
            }
            else {
                warn "Wanted to use perllocal.pod but can't because ExtUtils::Perllocal::Parser isn't available\n";
            }
        }
        else {
            warn "No perllocal.pod found to aid disambiguation\n";
        }
    }

    return $perllocal_distro_mod_version->{$key_mod_name};
}


sub module_progress_indicator {
    my ($module) = @_;
    my $crnt = (split /::/, $module)[0];
    our $last ||= '';
    if ($last ne $crnt) {
        warn "\t$crnt...\n";
        $last = $crnt;
    }
}

=head1 OTHERS

This module checks $::DEBUG and $::VERBOSE for obvious proposes.

This module uses L<Dist::Surveyor::Inquiry> to communicate with MetaCPAN. 
Check that module's documentation for options and caching. 

You can use L<Dist::Surveyor::MakeCpan> to take the list of releases
and create a mini-cpan containing them.

=head1 AUTHOR

Written by Tim Bunce E<lt>Tim.Bunce@pobox.comE<gt> 

Maintained by Fomberg Shmuel E<lt>shmuelfomberg@gmail.comE<gt>, Dan Book E<lt>dbook@cpan.orgE<gt>
 
=head1 COPYRIGHT AND LICENSE
 
Copyright 2011-2013 by Tim Bunce.
 
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
 
=cut

1;