The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
use strict;
use warnings;

package Locale::Simple::Scraper;
BEGIN {
  $Locale::Simple::Scraper::AUTHORITY = 'cpan:GETTY';
}
$Locale::Simple::Scraper::VERSION = '0.019';
# ABSTRACT: scraper to find translation tokens in a directory

use Exporter 'import';
use Getopt::Long;
use File::Find;
use Cwd;
use Locale::Simple;
use Data::Dumper;
use Locale::Simple::Scraper::Parser;

our @EXPORT = qw(scrape);

sub scrape {
    @ARGV = @_;

    $| = 1;

    # Supported filetypes:
    my $js_ext = "";    # Javascript
    my $pl_ext = "";    # Perl
    my $py_ext = "";    # Python
    my $tx_ext = "";    # Text::Xslate (Kolon or Metakolon)

    my @ignores;
    my @only;

    my $output = 'po';
    my ($md5, $no_line_numbers);

    GetOptions(
        "js=s"            => \$js_ext,
        "pl=s"            => \$pl_ext,
        "py=s"            => \$py_ext,
        "tx=s"            => \$tx_ext,
        "ignore=s"        => \@ignores,
        "only=s"          => \@only,
        "output=s"        => \$output,
        "md5"             => \$md5,
        "no_line_numbers" => \$no_line_numbers,
    );

    # could add Getopt::Long here for override

    my @js = split( ",", $js_ext );
    push @js, 'js';

    my @pl = split( ",", $pl_ext );
    push @pl, 'pl', 'pm', 't';

    my @tx = split( ",", $tx_ext );
    push @tx, 'tx';

    my @py = split( ",", $py_ext );
    push @py, 'py';

    # extension list
    my %e = (
        ( map { $_ => 'js' } @js ),
        ( map { $_ => 'pl' } @pl ),
        ( map { $_ => 'tx' } @tx ),
        ( map { $_ => 'py' } @py ),
    );

    # functions with count of locale simple with function of parameter
    #
    # 1 = msgid
    # 2 = msgid_plural
    # 3 = msgctxt
    # 4 = domain
    #
    my %f = (
        l    => [1],
        ln   => [ 1, 2 ],
        ld   => [ 4, 1 ],
        lp   => [ 3, 1 ],
        lnp  => [ 3, 1, 2 ],
        ldn  => [ 4, 1, 2 ],
        ldp  => [ 4, 3, 1 ],
        ldnp => [ 4, 3, 1, 2 ],
    );

    my %files;

    my $dir    = getcwd;
    my $re_dir = $dir;
    $re_dir =~ s/\./\\./g;

    finddepth(
        sub {
            my $filename        = $File::Find::name;
            my $stored_filename = $filename;
            if ( $md5 ) {
                eval {
                    require Digest::MD5;
                    Digest::MD5->import( 'md5_hex' );
                };
                die "This feature requires Digest::MD5" if $@;
                $stored_filename = md5_hex( $filename );
            }
            $filename =~ s/^$dir\///g;
            for ( @ignores ) {
                return if $filename =~ /$_/;
            }
            if ( @only ) {
                my $found = 0;
                for ( @only ) {
                    $found = 1 if $filename =~ /$_/;
                }
                return unless $found;
            }
            my @fileparts = split( '\.', $File::Find::name );
            my $ext = pop @fileparts;
            $files{$File::Find::name} = [ $ext, $filename, $stored_filename ] if grep { $ext eq $_ } keys %e;
        },
        $dir
    );

    my @found;
    for my $file ( sort keys %files ) {
        my ( $ext, $filename, $stored_filename ) = @{ $files{$file} };
        my $type = $e{$ext};
        print STDERR $type . " => " . $file . "\n";
        return if -l $file and not -e readlink( $file );
        my $parses = Locale::Simple::Scraper::Parser->new( type => $type )->from_file( $file );
        my @file_things = map {
            {
                %{ result_from_params( $_->{args}, $f{ $_->{func} } ) },
                  line => $_->{line},
                  file => $stored_filename,
                  type => $type,
            }
        } @{$parses};
        push @found, @file_things;
    }

    if ( $output eq 'po' ) {
        my %files;
        my %token;
        for ( @found ) {
            my $key .= defined $_->{domain}       ? '"' . $_->{domain} . '"'       : 'undef';
            $key    .= defined $_->{msgctxt}      ? '"' . $_->{msgctxt} . '"'      : 'undef';
            $key    .= defined $_->{msgid}        ? '"' . $_->{msgid} . '"'        : 'undef';
            $key    .= defined $_->{msgid_plural} ? '"' . $_->{msgid_plural} . '"' : 'undef';
            $token{$key} = $_ unless defined $token{$key};
            $files{$key} = [] unless defined $files{$key};
            push @{ $files{$key} }, $_->{file} . ':' . $_->{line};
        }
        for my $k ( sort { $a cmp $b } keys %files ) {
            print "\n";
            print "#: " . join( ' ', @{ $files{$k} } ) . "\n" if !$no_line_numbers;
            print "#, locale-simple-format";
            print " " . $token{$k}{domain} if defined $token{$k}{domain};
            print "\n";
            for ( qw( msgctxt msgid msgid_plural ) ) {
                print $_. ' "' . Locale::Simple::gettext_escape( $token{$k}{$_} ) . '"' . "\n"
                  if defined $token{$k}{$_};
            }
            my $plural_marker = $token{$k}{msgid_plural} ? "[0]" : "";
            print qq[msgstr$plural_marker ""\n];

        }
    }
    elsif ( $output eq 'perl' ) {
        print Dumper \@found;
    }
    elsif ( $output eq 'json' ) {
        eval {
            require JSON;
            JSON->import;
            print encode_json( \@found );
        } or do {
            die "You require the module JSON for this output";
        };
    }
    elsif ( $output eq 'yaml' ) {
        eval {
            require YAML;
            YAML->import;
            print Dump( \@found );
        } or do {
            die "You require the module YAML for this output";
        };
    }
}

sub parse_line {
    my ( $line, $type, $f, @results ) = @_;
    return if $line =~ /^\s*\#.*/;
    for ( keys %{$f} ) {
        my @args = @{ $f->{$_} };
        my $params = get_func_params( $_, $line );
        next if !$params;
        my $argc = scalar @args;
        my ( $remainder, @params ) = parse_params( $params, $type, $argc );
        if ( scalar @params == $argc ) {
            push @results, result_from_params( \@params, \@args ), parse_line( $remainder, $type, $f );
        }
    }
    return @results;
}

sub result_from_params {
    my ( $params, $args ) = @_;
    my %result;
    my $pos = 0;
    for ( @{$args} ) {
        $result{msgid}        = $params->[$pos] if $_ eq 1;
        $result{msgid_plural} = $params->[$pos] if $_ eq 2;
        $result{msgctxt}      = $params->[$pos] if $_ eq 3;
        $result{domain}       = $params->[$pos] if $_ eq 4;
        $pos++;
    }
    return \%result;
}

sub get_func_params {
    my ( $func, $line ) = @_;
    $line =~ /([^\w]|^)$func\((.*)/;
    return $2;
}

sub parse_params {
    my ( $params, $type, $argc ) = @_;
    my @chars = split( '', $params );
    my @args;
    my $arg         = "";
    my $q_state     = 0;    # 0 = code, 1 = qoute, 2 = double qoute
    my $comma_state = 1;
    while ( defined( my $c = shift @chars ) ) {
        next if $c =~ /\s/ and !$q_state;
        if ( $q_state ) {
            if ( $c eq '\\' ) {
                my $esc = shift @chars;
                if ( $esc eq "'" or $esc eq '"' or $esc eq '\\' ) {
                    $arg .= $esc;
                }
                else {
                    warn "Unknown escape char '" . $esc . "'";
                }
            }
            elsif ( ( $c eq "'" and $q_state == 1 ) or ( $c eq '"' and $q_state == 2 ) ) {
                $q_state     = 0;
                $comma_state = 0;
                push @args, $arg;
                $arg = "";
                last if scalar @args == $argc;
            }
            else {
                $arg .= $c;
            }
        }
        else {
            if ( $c eq "'" or $c eq '"' ) {
                die "quote found where comma expected: " . $params unless $comma_state;
                $q_state = $c eq "'" ? 1 : 2;
            }
            elsif ( $c eq ',' ) {
                die "comma found after comma in code: " . $params if $comma_state;
                $comma_state = 1;
            }
            elsif ( $type eq 'js' ) {
                last;
            }
            else {
                last;
            }
        }
    }
    return join( '', @chars ), @args;
}

1;

__END__

=pod

=head1 NAME

Locale::Simple::Scraper - scraper to find translation tokens in a directory

=head1 VERSION

version 0.019

=head1 AUTHOR

Torsten Raudssus <torsten@raudss.us>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2012 by DuckDuckGo, Inc. L<http://duckduckgo.com/>, Torsten Raudssus <torsten@raudss.us>.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut