The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package CPAN2RT;

=head1 NAME

CPAN2RT - CPAN to RT converter for rt.cpan.org service

=head1 DESCRIPTION

An utility and module with functions to import and update metadata
about CPAN distributions into RT DB using files available from each
CPAN mirror.

Comes with `cpan2rt` script.

=cut

use v5.8.3;
use strict;
use warnings;

our $VERSION = '0.03';

use Email::Address;
use List::Compare;
use CPAN::DistnameInfo;
use List::MoreUtils qw(uniq);

our $DEBUG = 0;
sub debug(&);

=head1 METHODS

=head2 new

Simple constructor that creates a hash based object and stores all
passed arguments inside it. Then L</init> is called.

=head3 options

=over 8

=item home - RT home dir, RTHOME is checked if empty and defaults to
"/opt/rt3".

=item debug - turn on ddebug output to STDERR.

=item mirror - CPAN mirror to fetch files from.

=back

=cut

sub new {
    my $proto = shift;
    my $self = bless { @_ }, ref($proto) || $proto;
    $self->init();
    return $self;
}

=head2 init

Called right after constructor, changes @INC, loads RT and initilize it.

See options in description of L</new>.

=cut

sub init {
    my $self = shift;

    my $home = ($self->{'home'} ||= $ENV{'RTHOME'} || '/opt/rt3');
    unshift @INC, File::Spec->catdir( $home, 'lib' );
    unshift @INC, File::Spec->catdir( $home, 'local', 'lib' );

    require RT;
    RT::LoadConfig();
    RT::Init();

    $DEBUG = $self->{'debug'};
}

sub sync_files {
    my $self = shift;
    my $mirror = shift || $self->{'mirror'} || 'ftp://ftp.funet.fi/pub/CPAN';

    debug { "Syncing files from '$mirror'\n" };

    my @files = qw(
        indices/find-ls.gz
        authors/00whois.xml
        modules/06perms.txt.gz
        modules/02packages.details.txt.gz
    );

    require LWP::UserAgent;
    my $ua = new LWP::UserAgent;
    $ua->timeout( 10 );

    foreach my $file ( @files ) {
        debug { "Fetching '$file'\n" };
        my $store = $self->file_path( $file );
        $self->backup_file( $store ) if -e $store;
        my $response = $ua->get( "$mirror/$file", ':content_file' => $store );
        unless ( $response->is_success ) {
            print STDERR $response->status_line, "\n";
            next;
        }
        my $mtime = $response->header('Last-Modified');

        debug { "Fetched '$file' -> '$store'\n" };

        if ( $store =~ /(.*)\.gz$/ ) {
            $self->backup_file( $1 );
            `gunzip -f $store`;
            $store =~ s/\.gz$//;
            debug { "Unzipped '$store'\n" };
        }

        if ( $mtime ) {
            require HTTP::Date;
            $mtime = HTTP::Date::str2time( $mtime );
            utime $mtime, $mtime, $store if $mtime;
            debug { "Last modified: $mtime\n" };
        }
    }
}

{ my $cache;
sub authors {
    my $self = shift;
    $cache = $self->_authors unless $cache;
    return $cache;
} }

sub _authors {
    my $self = shift;
    my $file = '00whois.xml';
    debug { "Parsing $file...\n" };
    my $path = $self->file_path( $file );

    use XML::SAX::ParserFactory;
    my $handler = CPAN2RT::UsersSAXParser->new();
    my $p = XML::SAX::ParserFactory->parser(Handler => $handler);

    open my $fh, "<:raw", $path or die "Couldn't open '$path': $!";
    my $res = $p->parse_file( $fh );
    close $fh;

    return $res;
}

{ my $cache;
sub permissions {
    my $self = shift;
    $cache = $self->_permissions unless $cache;
    return $cache;
} }

sub _permissions {
    my $self = shift;
    my $file = '06perms.txt';
    debug { "Parsing $file...\n" };
    my $path = $self->file_path( $file );
    open my $fh, "<:utf8", $path or die "Couldn't open '$path': $!";

    $self->skip_header( $fh );

    my %res;
    while ( my $str = <$fh> ) {
        chomp $str;

        my ($module, $cpanid, $permission) = (split /\s*,\s*/, $str);
        unless ( $module && $cpanid ) {
            debug { "couldn't parse '$str' from '$file'\n" };
            next;
        }
        $res{ $module } ||= [];
        push @{ $res{ $module } }, $cpanid;
    }
    close $fh;

    return \%res;
}

{ my $cache;
sub module2file {
    my $self = shift;
    $cache = $self->_module2file() unless $cache;
    return $cache;
} }

sub _module2file {
    my $self = shift;
    my $file = '02packages.details.txt';
    debug { "Parsing $file...\n" };
    my $path = $self->file_path( $file );
    open my $fh, "<:utf8", $path or die "Couldn't open '$path': $!";

    $self->skip_header( $fh );

    my %res;
    while ( my $str = <$fh> ) {
        chomp $str;

        my ($module, $mver, $file) = split /\s+/, $str;
        unless ( $module && $file ) {
            debug { "couldn't parse '$str'\n" };
            next;
        }
        $res{ $module } = $file;
    }
    close $fh;

    return \%res;
}


{ my $cache;
sub all_distributions {
    my $self = shift;
    $cache = $self->_all_distributions() unless $cache;
    return $cache;
} }

sub _all_distributions {
    my $self = shift;
    my $file = 'find-ls';
    debug { "Parsing $file...\n" };
    my $path = $self->file_path( $file );
    open my $fh, "<:utf8", $path or die "Couldn't open '$path': $!";

    my %res;
    while ( my $str = <$fh> ) {
        next if $str =~ /^\d+\s+0\s+l\s+1/; # skip symbolic links
        chomp $str;

        my ($mode, $file) = (split /\s+/, $str)[2, -1];
        next if index($mode, 'x') >= 0; # skip executables (dirs)
        # we're only interested in files in authors/id/ dir
        next unless index($file, "authors/id/") == 0;
        next unless $file =~ /\.(bz2|zip|tgz|tar\.gz)$/i;

        my $info = CPAN::DistnameInfo->new( $file );
        my $dist = $info->dist;
        unless ( $dist ) {
            debug { "Couldn't parse distribution name from '$file'\n" };
            next;
        }
        push @{ $res{ $dist }{'versions'} ||= [] }, $info->version;
        push @{ $res{ $dist }{'uploaders'} ||= [] }, $info->cpanid;
    }
    close $fh;

    return \%res;
}

sub sync_authors {
    my $self = shift;
    my $force = shift;
    if ( !$force && !$self->is_new_file( '01mailrc.txt' ) ) {
        debug { "Skip syncing, file's not changed\n" };
        return (1);
    }

    my @errors;
    my $authors = $self->authors;
    while ( my ($cpanid, $meta) = each %$authors ) {
        my ($user, @msg) = $self->load_or_create_user( $cpanid, @{ $meta }{qw(fullname email)} );
        push @errors, @msg unless $user;
    }
    return (undef, @errors) if @errors;
    return (1);
}

sub sync_distributions {
    my $self = shift;
    my $force = shift;
    if ( !$force && !$self->is_new_file( '02packages.details.txt' ) ) {
        debug { "Skip syncing, file's not changed\n" };
        return (1);
    }

    my @files = uniq values %{ $self->module2file };
    my $all_dists = $self->all_distributions;

    my %tmp;
    foreach my $file ( @files ) {
        my $info = CPAN::DistnameInfo->new( "authors/id/$file" );
        my $dist = $info->dist;
        unless ( $dist ) {
            debug { "Couldn't parse distribution name from '$file'\n" };
            next;
        }
        if ( $dist =~ /^(parrot|perl)$/i ) {
            debug { "Skipping $dist as it's hard coded to be skipped." };
            next;
        }

        $tmp{ $dist } ||= [];
        if ( my $v = $info->version ) {
            push @{ $tmp{ $dist } }, $v;
        }
        push @{ $tmp{ $dist } }, @{ $all_dists->{ $dist }{'versions'} || [] };
    }

    my @errors;
    while ( my ($dist, $versions) = each %tmp ) {
        my ($queue, @msg) = $self->load_or_create_queue( $dist );
        unless ( $queue ) {
            push @errors, @msg;
            next;
        }
        if ( $versions && @$versions ) {
            my ($status, @msg) = $self->add_versions( $queue, @$versions );
            push @errors, @msg unless $status;
        }
    }

    %tmp = ();

    return (undef, @errors) if @errors;
    return (1);
}

sub sync_maintainers {
    my $self = shift;
    my $force = shift;
    if ( !$force && !$self->is_new_file( '06perms.txt' ) ) {
        debug { "Skip syncing, file's not changed\n" };
        return (1);
    }

    my $m2f = $self->module2file;
    my $perm = $self->permissions;

    my %res;
    while ( my ($module, $maint) = each %$perm ) {
        my $file = $m2f->{ $module };
        next unless $file;

        my $dist = CPAN::DistnameInfo->new( "authors/id/$file" )->dist;
        unless ( $dist ) {
            debug { "Couldn't parse distribution name from '$file'\n" };
            next;
        }
        push @{ $res{ $dist } ||= [] }, @$maint;
    }

    my @errors = ();
    while ( my ($dist, $maint) = each %res ) {
        my ($queue, @msg) = $self->load_or_create_queue( $dist );
        unless ( $queue ) {
            push @errors, @msg;
            next;
        }

        my $status;
        ($status, @msg) = $self->set_maintainers( $queue, @$maint );
        push @errors, @msg unless $status;
    }
    %res = ();
    return (undef, @errors) if @errors;
    return (1);
}

sub current_maintainers {
    my $self = shift;
    my $queue = shift;

    my $users = $queue->AdminCc->UserMembersObj;
    $users->OrderByCols;
    return map uc $_->Name, @{ $users->ItemsArrayRef };
}

sub filter_maintainers {
    my $self = shift;
    my $authors = $self->authors;
    return grep { ($authors->{$_}{'type'}||'') eq 'author' } @_;
}

sub set_maintainers {
    my $self = shift;
    my $queue   = shift;

    my @maints  = $self->filter_maintainers( @_ );
    my @current = $self->current_maintainers( $queue );

    my @errors;

    my $set = List::Compare->new( '--unsorted', \@current, \@maints );
    foreach ( $set->get_unique ) {
        debug { "Going to delete $_ from maintainers of ". $queue->Name };
        my ($status, @msg) = $self->del_maintainer( $queue, $_, 'force' );
        push @errors, @msg unless $status;
    }
    foreach ( $set->get_complement ) {
        debug { "Going to add $_ as maintainer of ". $queue->Name };
        my ($status, @msg) = $self->add_maintainer( $queue, $_, 'force' );
        push @errors, @msg unless $status;
    }

    return (undef, @errors) if @errors;
    return (1);
}

sub add_maintainer {
    my $self = shift;
    my $queue = shift;
    my $user  = shift;
    my $force = shift || 0;

    unless ( ref $user ) {
        my $tmp = RT::User->new( $RT::SystemUser );
        $tmp->LoadByCols( Name => $user );
        return (undef, "Couldn't load user '$user'")
            unless $tmp->id;

        $user = $tmp;
    }
    unless ( $user->id ) {
        return (undef, "Empty user object");
    }

    if ( !$force && $queue->IsAdminCc( $user->PrincipalId ) ) {
        debug {  $user->Name ." is already maintainer of '". $queue->Name ."'\n"  };
        return (1);
    }

    my ($status, $msg) = $queue->AddWatcher(
        Type        => 'AdminCc',
        PrincipalId => $user->PrincipalId,
    );
    unless ( $status ) {
        $msg = "Couldn't add ". $user->Name ." as AdminCc for ". $queue->Name .": $msg\n";
        return (undef, $msg);
    } else {
        debug { "Added ". $user->Name ." as maintainer of '". $queue->Name ."'\n" };
    }
    return (1);
}

sub del_maintainer {
    my $self = shift;
    my $queue = shift;
    my $user  = shift;
    my $force = shift;

    unless ( ref $user ) {
        my $tmp = RT::User->new( $RT::SystemUser );
        $tmp->LoadByCols( Name => $user );
        return (undef, "Couldn't load user '$user'")
            unless $tmp->id;

        $user = $tmp;
    }
    unless ( $user->id ) {
        return (undef, "Empty user object");
    }

    if ( !$force && !$queue->IsAdminCc( $user->PrincipalId ) ) {
        debug {  $user->Name ." is not maintainer of '". $queue->Name ."'. Skipping...\n"  };
        return (1);
    }

    my ($status, $msg) = $queue->DeleteWatcher(
        Type        => 'AdminCc',
        PrincipalId => $user->PrincipalId,
    );
    unless ( $status ) {
        $msg = "Couldn't delete ". $user->Name
            ." from AdminCc list of '". $queue->Name ."': $msg\n";
        return (undef, $msg);
    } else {
        debug { "Deleted ". $user->Name ." from maintainers of '". $queue->Name ."'\n" };
    }
    return (1);
}

sub add_versions {
    my $self = shift;
    my ($queue, @versions) = @_;
    @versions = uniq @versions;

    my @errors;
    foreach my $name ( "Broken in", "Fixed in" ) {
        my ($cf, $msg) = $self->load_or_create_version_cf( $queue, $name );
        unless ( $cf ) {
            push @errors, $msg;
            next;
        }

        # Unless it's a new value, don't add it
        my %old = map { $_->Name => 1 } @{ $cf->Values->ItemsArrayRef };
        foreach my $version ( grep defined && length, @versions ) {
            if ( exists $old{$version} ) {
                debug { "Version '$version' exists (not adding)\n" };
                next;
            }

            my ($val, $msg) = $cf->AddValue(
                Name          => $version,
                Description   => "Version $version",
                SortOrder     => 0,
            );
            unless ( $val ) {
                $msg = "Failed to add value '$version' to CF $name"
                    ." for queue ". $queue->Name .": $msg";
                debug {  $msg  };
                push @errors, $msg;
            }
            else {
                debug { "Added version '$version' into '$name' list for queue '". $queue->Name ."'\n" };
            }
        }
    }
    return (undef, @errors) if @errors;
    return (1);
}

sub load_or_create_user {
    my $self = shift;
    my ($cpanid, $realname, $email) = @_;

    my $bycpanid = RT::User->new($RT::SystemUser);
    $bycpanid->LoadByCol( Name => $cpanid );

    # WARNING: when MergeUser extension is used then the same user records
    # will be loaded even when there are multiple records in the DB
    $email = $self->parse_email_address( $email ) || "$cpanid\@cpan.org";
    my $byemail = RT::User->new( $RT::SystemUser );
    $byemail->LoadByEmail( $email );

    if ( $bycpanid->id && (($byemail->id && $bycpanid->id == $byemail->id) || !$byemail->id) ) {
        # the same users, both cpanid and email...
        # or email change, so no user with new email...
        #
        # XXX: as we have no way to detect email changes on PAUSE
        # then we set email to the public version from PAUSE only when
        # user in RT has no email. The same applies to name.
        $bycpanid->SetEmailAddress( $email )
            unless $bycpanid->EmailAddress;
        $bycpanid->SetRealName( $realname )
            unless $bycpanid->RealName;
        return $bycpanid;
    }
    elsif ( $bycpanid->id && $byemail->id ) {
        # both exist, but different
        # XXX: merge them
        debug { "WARNING: Two different users\n" };
        return $bycpanid;
    }
    elsif ( $byemail->id ) {
        # there is already user with that address, but different CPANID
        my ($new, $msg) = $self->create_user( $cpanid, $realname );
        return ($new, $msg) unless $new;

        if ( $new->can('MergeInto') ) {
            debug { "Merging user @{[$new->Name]} into @{[$byemail->Name]}...\n" };
            $new->MergeInto( $byemail );
        } else {
            debug {
                "WARNING: Couldn't merge user @{[$new->Name]} into @{[$byemail->Name]}."
                ." Extension is not installed.\n" };
        }
        return ($new);
    }

    return $self->create_user($cpanid, $realname, $email);
}

sub create_user {
    my $self = shift;
    my ($username, $realname, $email) = @_;

    my $user = RT::User->new( $RT::SystemUser );
    my ($val, $msg) = $user->Create(
        Name          => $username,
        RealName      => $realname,
        EmailAddress  => $email,
        Privileged    => 1
    );

    unless ( $val ) {
        $msg = "Failed to create user $username: $msg";
        debug { "FAILED! $msg\n" };
        return (undef, $msg);
    }
    else {
        debug { "Created user $username... " };
    }

    return ($user)
}

sub load_or_create_queue {
    my $self = shift;
    my $dist = shift;

    my $queue = RT::Queue->new( $RT::SystemUser );
    # Try to load up the current queue by name.  Avoids duplication.
    $queue->Load( $dist );
    unless ( $queue->id ) {
        my ($status, $msg) = $queue->Create(
            Name               => $dist,
            Description        => "Bugs in $dist",
            CorrespondAddress  => "bug-$dist\@rt.cpan.org",
            CommentAddress     => "comment-$dist\@rt.cpan.org",
        );
        unless ( $status ) {
            return (undef, "Couldn't create queue '$dist': $msg\n");
        }
		debug { "Created queue #". $queue->id ." for dist ". $queue->Name ."\n" };
    } else {
		debug { "Found queue #". $queue->id ." for dist ". $queue->Name ."\n" };
    }
    return $queue;
}

sub load_or_create_version_cf {
    my $self = shift;
    my ($queue, $name) = @_;

    my $cfs = RT::CustomFields->new( $RT::SystemUser );
    $cfs->Limit( FIELD => 'Name', VALUE => $name );
    $cfs->LimitToQueue( $queue->id );
    $cfs->{'find_disabled_rows'} = 0;   # This is why we don't simply do a LoadByName
    $cfs->OrderByCols; # don't sort things
    $cfs->RowsPerPage( 1 );

    my $cf = $cfs->First;
    unless ( $cf && $cf->id ) {
        return $self->create_version_cf( $queue, $name );
    }

    return ($cf);
}

sub create_version_cf {
    my $self = shift;
    my ($queue, $name) = @_;

    my $cf = RT::CustomField->new( $RT::SystemUser );
    debug { "creating custom field $name..." };
    my ($val, $msg) = $cf->Create(
        Name            => $name,
        TypeComposite   => 'Select-0',    
        # This is a much clearer way of associating a CF
        # with a queue, except that it's not as efficient
        # as the method below...
        # 
        #Queue           => $queue->Id,
        #
        # So instead we're going to set the lookup type here...
        #
        LookupType   => 'RT::Queue-RT::Ticket',
    );
    unless ( $val ) {
        debug { "FAILED!  $msg\n" };
        return (undef, "Failed to create CF $name for queue "
                        . $queue->Name
                        . ": $msg");
    }
    else {
        debug { "ok\n" };
    }

    #
    # ... and associate with the queue down here.
    #
    # This is the other way of associating a CF with a queue.  Unlike
    # the much more clear method above, it doesn't have to fetch the
    # queue object again.  And since this is an import, we do kinda
    # care about that stuff...
    #
    ($val, $msg) = $cf->AddToObject( $queue );
    unless ( $val ) {
        $msg = "Failed to link CF $name with queue " . $queue->Name . ": $msg";
        debug { $msg };
        $cf->Delete;
        return (undef, $msg);
    }
    return ($cf);
}

sub parse_email_address {
    my $self = shift;
    my $string = shift;
    return undef unless defined $string && length $string;
    return undef if uc($string) eq 'CENSORED';

    my $address = (grep defined, Email::Address->parse( $string || '' ))[0];
    return undef unless defined $address;
    return $address->address;
}

sub file_path {
    my $self = shift;
    my $file = shift;
    my $res = $file;
    $res =~ s/.*\///; # strip leading dirs
    if ( my $dir = $self->{'datadir'} ) {
        require File::Spec;
        $res = File::Spec->catfile( $dir, $res );
    }
    return $res;
}

sub is_new_file {
    my $self = shift;
    my $new = $self->file_path( shift );
    my $old = $new .'.old';
    return 1 unless -e $old; 
    return (stat $new)[9] > (stat $old)[9]? 1 : 0;
}

sub backup_file {
    my $self = shift;
    my $old = shift;
    my $new = $old .'.old';
    rename $old, $new;
}

sub skip_header {
    my $self = shift;
    my $fh = shift;
    while ( my $str = <$fh> ) {
        return if $str =~ /^\s*$/;
    }
}

sub debug(&) {
    return unless $DEBUG;
    print STDERR map { /\n$/? $_ : $_."\n" } $_[0]->();
}

1;

package CPAN2RT::UsersSAXParser;
use base qw(XML::SAX::Base);

sub start_document {
    my ($self, $doc) = @_;
    $self->{'res'} = {};
}

sub start_element {
    my ($self, $el) = @_;
    my $name = $el->{LocalName};
    return if $name ne 'cpanid' && !$self->{inside};

    if ( $name eq 'cpanid' ) {
        $self->{inside} = 1;
        $self->{tmp} = [];
        return;
    } else {
        $self->{inside_prop} = 1;
    }

    push @{ $self->{'tmp'} }, $name, '';
}

sub characters {
    my ($self, $el) = @_;
    $self->{'tmp'}[-1] .= $el->{Data} if $self->{inside_prop};
}

sub end_element {
    my ($self, $el) = @_;
    $self->{inside_prop} = 0;

    my $name = $el->{LocalName};

    if ( $name eq 'cpanid' ) {
        $self->{inside} = 0;
        my %rec = map Encode::decode_utf8($_), @{ delete $self->{'tmp'} };
        $self->{'res'}{ delete $rec{'id'} } = \%rec;
    }
}

sub end_document {
    my ($self) = @_;
    return $self->{'res'};
}

1;