#!/usr/bin/perl
use strict;
use utf8;
use Getopt::Long qw(:config auto_version);
use LWP::Simple;
use LWP::UserAgent;
use Data::Dumper;
use List::Util qw(shuffle);
our $VERSION = $Locale::Memories::VERSION;
my @default_locales = qw(aa_dj aa_er aa_er aa_et af_za am_et an_es
ar_ae ar_bh ar_dz ar_eg ar_in ar_iq ar_jo
ar_kw ar_lb ar_ly ar_ma ar_om ar_qa ar_sa
ar_sd ar_sy ar_tn ar_ye as_in az_az be_by
be_by bg_bg bn_bd bn_in br_fr bs_ba
byn_er ca_ad ca_es ca_es ca_fr ca_it
csb_pl cs_cz cy_gb da_dk de_at de_be
de_ch de_de de_lu dz_bt el_cy el_gr en_au
en_bw en_ca en_dk en_gb en_hk en_ie en_in
en_nz en_ph en_sg en_us en_za en_zw eo
es_ar es_bo es_cl es_co es_cr es_do es_ec
es_es es_gt es_hn es_mx es_ni es_pa es_pe
es_pr es_py es_sv es_us es_uy es_ve et_ee
eu_es eu_fr fa_ir fi_fi fo_fo fr_be fr_ca
fr_ch fr_fr fr_lu fy_nl ga_ie gd_gb
gez_er gez_er gez_et gez_et gl_es gu_in
gv_gb he_il hi_in hr_hr hsb_de hu_hu
hy_am ia id_id is_is it_ch it_it iw_il
ja_jp ka_ge kk_kz kl_gl km_kh kn_in ko_kr
ku_tr kw_gb ky_kg lg_ug lo_la lt_lt lv_lv
mai_in mg_mg mi_nz mk_mk ml_in mn_mn
mr_in ms_my mt_mt nb_no ne_np nl_be nl_nl
nn_no no_no nr_za nso_za oc_fr om_et
om_ke or_in pa_in pa_pk pl_pl pt_br pt_pt
ro_ro ru_ru ru_ua rw_rw sa_in se_no
sid_et si_lk sk_sk sl_si so_dj so_et
so_ke so_so sq_al sr_cs sr_me sr_rs ss_za
st_za sv_fi sv_se ta_in te_in tg_tj th_th
ti_er ti_et tig_er tl_ph tn_za tr_cy
tr_tr ts_za tt_ru uk_ua ur_pk uz_uz uz_uz
ve_za vi_vn wa_be wo_sn xh_za yi_us zh_cn
zh_hk zh_sg zh_tw zu_za);
@default_locales = shuffle(@default_locales,
map {m{(\w+)_\w+}; $1} @default_locales);
sub get_search_result {
my $locale = shift;
my $max_results = shift;
my $remote_prog = 'http://www.google.com/codesearch/feeds/search?';
my $feed_url = ($remote_prog . 'q=file:' . $locale
. '.po$&max-results=' . $max_results);
my $content = LWP::Simple::get($feed_url);
my @urls;
while ($content =~ m{<id>(.+?)</id>}g) {
my $url = $1;
next if $url =~ m{/codesearch/feeds};
$url =~ m{cs_p=(.+?)&cs_f=(.+?)&cs_p};
my ($source, $path) = ($1, $2);
$path =~ s{(.+)/.+\.po}{$1};
push @urls, [ $source, $path ];
}
return @urls;
}
sub fetch_software_urls {
my ($output_dir, $locales_ref, $max_results) = @_;
if (-e $output_dir && !-d $output_dir) {
die "$output_dir already exists and it is not a directory";
}
elsif (!-d $output_dir) {
mkdir $output_dir or die $!;
}
my %url;
for my $locale (@{$locales_ref}) {
print "Fetching code information for locale ($locale)\n";
for my $u (get_search_result($locale, $max_results)) {
if ($u->[0] =~ m{http://.+\.gz\z}) {
$url{$u->[0]} = $u->[1];
}
}
}
return %url;
}
sub fetch_source_files {
my $output_dir = shift;
my %url = @_;
for my $u (keys %url) {
$u =~ m{\A.+/(.+)\z};
my $file = $1;
my $path = $url{$u};
my $subroot_path = $path;
$subroot_path =~ s{\A(.+?)/.+}{$1};
my $output_subpath = $url{$u};
$output_subpath =~ s{/}{_}g;
system("wget '$u' -O '$output_dir/$file'"
. "&& tar zxvf '$output_dir/$file' > /dev/null "
. "&& mkdir -p '$output_dir/$subroot_path'");
if (-d $url{$u}) {
system("cp '$url{$u}/'*.po '$output_dir/$subroot_path/' "
. "&& rm -rf $subroot_path '$output_dir/$file'");
for my $old_path (glob("$output_dir/$subroot_path/*.po")) {
$old_path =~ m{\A(.+)/(.+\.po)\z};
my $po_file = $2;
$po_file =~ s{-}{_}g;
my $new_path = $1 . '/' . lc $po_file;
system("mv $old_path $new_path");
}
}
else {
system("rm -rf $subroot_path '$output_dir/$file'");
}
}
}
sub main {
exec 'perldoc', $0 if !@ARGV;
my ($output_dir, @locales, $max_results);
GetOptions(q{output_dir|o=s} => \$output_dir,
q{locales|l=s@} => \@locales,
q{max_results|m=i} => \$max_results);
die "Please specify output_dir" if !$output_dir;
$max_results ||= 100;
@locales = @default_locales if !@locales;
my %url = fetch_software_urls($output_dir, \@locales, $max_results);
fetch_source_files($output_dir, %url);
}
main;
1;
__END__
=pod
=head1 NAME
pofetcher - L10N Message Fetching Tool
=head1 SYNOPSIS
% pofetcher -o output directory
-l specific locales [can be used multiple times]
-m maximum .po files for each locale
=head1 DESCRIPTION
This tool fetches software information from
http://www.google.com/codesearch, and then get source code and
extracts .po to your file system.
pofetcher
.po files on WWW -------------> .po files on disk
=head1 DEPENDENCY
This tool depends on B<wget>.
=head1 COPYRIGHT
Copyright (c) 2007 Yung-chung Lin. All rights reserved. This program is
free software; you can redistribute it and/or modify it under the same
terms as Perl itself.
=cut