The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl
use 5.012;
use warnings;

use CPAN::DistnameInfo;
use Cwd qw(realpath);
use DB_File;
use File::Find;
use File::Spec::Functions qw(catfile catpath splitpath tmpdir);
use PerlIO::gzip;
use WebService::Google::Reader;

use constant VERBOSE => not $ENV{CRON};

my $dir = catpath((splitpath(realpath __FILE__))[0, 1]);
my $file = catfile($dir, '.unwanted-modules.bdb');
my $db = tie my %db, DB_File => $file;

my $reader = WebService::Google::Reader->new(
    host     => 'www.inoreader.com',
    username => $ENV{GOOGLE_USERNAME},
    password => $ENV{GOOGLE_PASSWORD},
);

# Return a sub that checks if the given Perl dist is installed.
my $is_perl_dist_installed = do {
    # Find the list of installed modules.
    my (%modules, %prune);
    for my $top (reverse sort @INC) {
        next if '.' eq $top;
        my $len = length $top;
        find {
            wanted => sub {
                if ($File::Find::dir ~~ %prune) {
                    $File::Find::prune = 1;
                    return;
                }
                return unless '.pm' eq substr $_, -3, 3, '';
                return unless -f _;
                substr $_, 0, 1 + $len, '';
                s{[\\/]}{::}g;
                $modules{$_} = undef;
            },
            no_chdir => 1
        }, $top;
        $prune{$top} = undef;
    }

    # Fetch the file that maps packages to distributions.
    my $file = '02packages.details.txt.gz';
    my $url = "http://search.cpan.org/CPAN/modules/$file";
    $file = catfile(tmpdir, $file);
    if (not -r $file or 1 < -M _) {
        my $res = $reader->ua->mirror($url, $file);
        die "Failed to mirror $file; " if $res->is_error;
    }
    open my $fh, '<:gzip', $file or die "$file: $!";

    # Skip header.
    while (<$fh>) { last when "\n" }

    # Determine the installed distributions, given the installed modules.
    my %dists;
    while (my $line = <$fh>) {
        my ($package, $version, $dist) = split /\s+/, $line;
        next unless $package ~~ %modules;
        $dists{ CPAN::DistnameInfo->new($dist)->dist } = undef;
    }
    close $fh;

    sub { $_[0] ~~ %dists; }
};

my %conf = (
    perl => {
        url  => 'http://search.cpan.org/uploads.rdf',
        name => sub {
            CPAN::DistnameInfo->new($_[0]->title . '.tgz')->dist
        },
        whitelist => [ $is_perl_dist_installed ],
    },
    python => {
        url  => 'http://pypi.python.org/pypi?:action=rss',
        name => sub {
            my ($name) = $_[0]->link->href =~ m[
                ^http://pypi\.python\.org/pypi/([^/]+)
            ]x;
            return $name;
        },
        blacklist => [ qr/ (?:\b|_) (?:django | plone | zope) (?:\b|_) /ix ],
    },
    ruby => {
        url  => 'http://feeds.feedburner.com/gemcutter-latest',
        name => sub {
            my ($name) = $_[0]->link->href =~ m[
                ^http://rubygems\.org/gems/([^?/]+)
            ]x;
            return $name;
        },
        blacklist => [
            qr/ (?:\b|_) (?:rails | active\W?record) (?:\b|_) /ix,
        ],
    },
    haskell => {
        url  => 'http://hackage.haskell.org/packages/archive/recent.rss',
        name => sub { ($_[0]->title =~ m[^\s*(\S+)])[0] },
    },
    vimscripts => {
        url => 'http://feed43.com/vim-scripts.xml',
        name => sub {
            my ($title, $name) = $_[0]->title =~ m[^\s*((.*?)\s+\S+) --];
            $_[0]->title($title);
            return $name;
        },
    },
);

# Get list of feed subscription times.
my %subs;
for my $sub ($reader->feeds) {
    (my $id = $sub->id) =~ s[^feed/][] or next;
    $subs{$id} = int $sub->firstitemmsec / 1000;
}

while (my ($lang, $conf) = each %conf) {
    my $feed = $reader->feed(
        $conf->{url},
        count      => 100,
        exclude    => { state => 'read' },
        start_time => $subs{$conf->{url}},
    );
    die $reader->error unless $feed;

    my @blacklist = @{ $conf->{blacklist} || [] };
    my @whitelist = @{ $conf->{whitelist} || [] };

    my @unwanted_entries;
    {
        for my $entry ($feed->entries) {
            my $name    = $conf->{name}->($entry);
            my $title   = $entry->title;
            my $summary = $entry->summary || '';
            my $desc    = $entry->content || '';
            $desc &&= $desc->body;
            # say "$lang | $name | $title | $summary | $desc\n" and next;

            unless ($name) {
                warn "Couldn't extract name from $title\n";
                next;
            }

            my $listed;
            for my $w (@whitelist) {
                next if not $name ~~ $w;
                $listed = 1;
                VERBOSE && say "$lang - $name - whitelisted";
                last;
            }
            next if $listed;

            for my $b (@blacklist) {
                next if not [$name, $title, $summary, $desc] ~~ $b;
                $listed = 1;
                push @unwanted_entries, $entry;
                VERBOSE && say "$lang - $name - blacklisted";
                last;
            }
            next if $listed;

            if ("$lang|$name" ~~ %db and $title ne $db{"$lang|$name"}) {
                push @unwanted_entries, $entry;
                VERBOSE && say "$lang - $name - unwanted because seen";
            }
            else {
                $db{"$lang|$name"} = $title;
            }
        }

        sleep 0.25;
        redo if $reader->more($feed);
    }

    $reader->mark_read_entry(\@unwanted_entries);
}