The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl

use strict;
use utf8;
use Getopt::Long qw(:config auto_version);
use LWP::Simple;
use LWP::UserAgent;
use Data::Dumper;
use List::Util qw(shuffle);

our $VERSION = $Locale::Memories::VERSION;

my @default_locales = qw(aa_dj aa_er aa_er aa_et af_za am_et an_es
			 ar_ae ar_bh ar_dz ar_eg ar_in ar_iq ar_jo
			 ar_kw ar_lb ar_ly ar_ma ar_om ar_qa ar_sa
			 ar_sd ar_sy ar_tn ar_ye as_in az_az be_by
			 be_by bg_bg bn_bd bn_in br_fr bs_ba
			 byn_er ca_ad ca_es ca_es ca_fr ca_it
			 csb_pl cs_cz cy_gb da_dk de_at de_be
			 de_ch de_de de_lu dz_bt el_cy el_gr en_au
			 en_bw en_ca en_dk en_gb en_hk en_ie en_in
			 en_nz en_ph en_sg en_us en_za en_zw eo
			 es_ar es_bo es_cl es_co es_cr es_do es_ec
			 es_es es_gt es_hn es_mx es_ni es_pa es_pe
			 es_pr es_py es_sv es_us es_uy es_ve et_ee
			 eu_es eu_fr fa_ir fi_fi fo_fo fr_be fr_ca
			 fr_ch fr_fr fr_lu fy_nl ga_ie gd_gb
			 gez_er gez_er gez_et gez_et gl_es gu_in
			 gv_gb he_il hi_in hr_hr hsb_de hu_hu
			 hy_am ia id_id is_is it_ch it_it iw_il
			 ja_jp ka_ge kk_kz kl_gl km_kh kn_in ko_kr
			 ku_tr kw_gb ky_kg lg_ug lo_la lt_lt lv_lv
			 mai_in mg_mg mi_nz mk_mk ml_in mn_mn
			 mr_in ms_my mt_mt nb_no ne_np nl_be nl_nl
			 nn_no no_no nr_za nso_za oc_fr om_et
			 om_ke or_in pa_in pa_pk pl_pl pt_br pt_pt
			 ro_ro ru_ru ru_ua rw_rw sa_in se_no
			 sid_et si_lk sk_sk sl_si so_dj so_et
			 so_ke so_so sq_al sr_cs sr_me sr_rs ss_za
			 st_za sv_fi sv_se ta_in te_in tg_tj th_th
			 ti_er ti_et tig_er tl_ph tn_za tr_cy
			 tr_tr ts_za tt_ru uk_ua ur_pk uz_uz uz_uz
			 ve_za vi_vn wa_be wo_sn xh_za yi_us zh_cn
			 zh_hk zh_sg zh_tw zu_za);

@default_locales = shuffle(@default_locales,
			   map {m{(\w+)_\w+}; $1} @default_locales);
sub get_search_result {
    my $locale = shift;
    my $max_results = shift;
    my $remote_prog = 'http://www.google.com/codesearch/feeds/search?';
    my $feed_url = ($remote_prog . 'q=file:' . $locale
		    . '.po$&max-results=' . $max_results);
    my $content = LWP::Simple::get($feed_url);
    my @urls;
    while ($content =~ m{<id>(.+?)</id>}g) {
	my $url = $1;
	next if $url =~ m{/codesearch/feeds};
	$url =~ m{cs_p=(.+?)&amp;cs_f=(.+?)&amp;cs_p};
	my ($source, $path) = ($1, $2);
	$path =~ s{(.+)/.+\.po}{$1};
	push @urls, [ $source, $path ];
    }
    return @urls;
}

sub fetch_software_urls {
    my ($output_dir, $locales_ref, $max_results) = @_;

    if (-e $output_dir && !-d $output_dir) {
	die "$output_dir already exists and it is not a directory";
    }
    elsif (!-d $output_dir) {
	mkdir $output_dir or die $!;
    }

    my %url;
    for my $locale (@{$locales_ref}) {
	print "Fetching code information for locale ($locale)\n";
	for my $u (get_search_result($locale, $max_results)) {
	    if ($u->[0] =~ m{http://.+\.gz\z}) {
		$url{$u->[0]} = $u->[1];
	    }
	}
    }
    return %url;
}

sub fetch_source_files {
    my $output_dir = shift;
    my %url = @_;
    for my $u (keys %url) {
	$u =~ m{\A.+/(.+)\z};
	my $file = $1;
	my $path = $url{$u};
	my $subroot_path = $path;
	$subroot_path =~ s{\A(.+?)/.+}{$1};
	my $output_subpath = $url{$u};
	$output_subpath =~ s{/}{_}g;
	system("wget '$u' -O '$output_dir/$file'"
	       . "&& tar zxvf '$output_dir/$file' > /dev/null "
	       . "&& mkdir -p '$output_dir/$subroot_path'");
	if (-d $url{$u}) {
	    system("cp '$url{$u}/'*.po '$output_dir/$subroot_path/' "
		   . "&& rm -rf $subroot_path '$output_dir/$file'");
	    for my $old_path (glob("$output_dir/$subroot_path/*.po")) {
		$old_path =~ m{\A(.+)/(.+\.po)\z};
		my $po_file = $2;
		$po_file =~ s{-}{_}g;
		my $new_path = $1 . '/' . lc $po_file;
		system("mv $old_path $new_path");
	    }
	}
	else {
	    system("rm -rf $subroot_path '$output_dir/$file'");
	}
    }
}

sub main {
    exec 'perldoc', $0 if !@ARGV;
    my ($output_dir, @locales, $max_results);
    GetOptions(q{output_dir|o=s} => \$output_dir,
	       q{locales|l=s@} => \@locales,
	       q{max_results|m=i} => \$max_results);
    die "Please specify output_dir" if !$output_dir;
    $max_results ||= 100;
    @locales = @default_locales if !@locales;
    my %url = fetch_software_urls($output_dir, \@locales, $max_results);
    fetch_source_files($output_dir, %url);
}

main;

1;
__END__

=pod

=head1 NAME

pofetcher - L10N Message Fetching Tool

=head1 SYNOPSIS

  % pofetcher -o output directory
              -l specific locales [can be used multiple times]
              -m maximum .po files for each locale

=head1 DESCRIPTION

This tool fetches software information from
http://www.google.com/codesearch, and then get source code and
extracts .po to your file system.

                      pofetcher
   .po files on WWW -------------> .po files on disk

=head1 DEPENDENCY

This tool depends on B<wget>.

=head1 COPYRIGHT

Copyright (c) 2007 Yung-chung Lin. All rights reserved. This program is
free software; you can redistribute it and/or modify it under the same
terms as Perl itself.

=cut