The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package I18NFool::Extractor; 
use MKDoc::XML::TreeBuilder;
use Locale::PO;
use warnings;
use strict;

our $Namespace = "http://xml.zope.org/namespaces/i18n";
our $Prefix    = 'i18n';
our $Domain    = 'default';
our $Results   = {};


sub process
{
    my $class = shift;
    my $data  = shift;
    
    local $Namespace = $Namespace;
    local $Prefix    = $Prefix;
    local $Domain    = $Domain;
    local $Results   = {};
    
    my @nodes = MKDoc::XML::TreeBuilder->process_data ($data);
    for (@nodes) { $class->_process ($_) }
    return $Results;
}


sub _process
{
    my $class = shift;
    my $tree  = shift;
    return unless (ref $tree);

    local $Prefix = $Prefix;
    local $Domain = $Domain;

    # process the I18N namespace
    foreach my $key (keys %{$tree})
    {
        my $value = $tree->{$key};
        if ($value eq $Namespace)
        {
            next unless ($key =~ /^xmlns\:/);
            delete $tree->{$key};
            $Prefix = $key;
            $Prefix =~ s/^xmlns\://;
        }
    }

    # set the current i18n:domain
    $Domain = delete $tree->{"$Prefix:domain"} || $Domain;

    my $tag  = $tree->{_tag};
    my $attr = { map { /^_/ ? () : ( $_ => $tree->{$_} ) } keys %{$tree} };
    return if ($tag eq '~comment' or $tag eq '~pi' or $tag eq '~declaration');
    
    # lookup for attributes...
    $tree->{"$Prefix:attributes"} && do {
        my $attributes = $tree->{"$Prefix:attributes"};
        $attributes =~ s/\s*;\s*$//;
        $attributes =~ s/^\s*//;
        my @attributes = split /\s*\;\s*/, $attributes;
        foreach my $attribute (@attributes)
        {
            # if we have i18n:attributes="alt alt_text", then the
            # attribute name is 'alt' and the
            # translate_id is 'alt_text'
            my ($attribute_name, $translate_id);
            if ($attribute =~ /\s/)
            {
                ($attribute_name, $translate_id) = split /\s+/, $attribute, 2;
            }

            # otherwise, if we have i18n:attributes="alt", then the
            # attribute name is 'alt' and the
            # translate_id is $tree->{'alt'}
            else
            {
                $attribute_name = $attribute;
                $translate_id = _canonicalize ( $tree->{$attribute_name} );
            }

            $translate_id || next;
            $Results->{$Domain} ||= {};

            my $existing_po = $Results->{$Domain}->{$translate_id};
            my $new_po = Locale::PO->new (
                -msgid  => $translate_id,
                -msgstr => _canonicalize ( $tree->{$attribute_name} ) || '',
            );

            if ($existing_po && ($existing_po->{msgstr} ne $new_po->{msgstr}))
            {
                print STDERR "String for '$translate_id' doesn't match:\n".
                             "   old: $existing_po->{msgstr}\n".
                             "   new: $new_po->{msgstr}\n"
            }

            $Results->{$Domain}->{$translate_id} = $new_po;
        }
    };

    # lookup for content...
    exists $tree->{"$Prefix:translate"} && do {
        my ($translate_id);

        # if we have $Domain:translate="something",
        # then the translate_id is 'something'
        if (defined $tree->{"$Prefix:translate"} and $tree->{"$Prefix:translate"} ne '')
        {
            $translate_id = $tree->{"$Prefix:translate"};
        }

        # otherwise, the translate_id has to be computed
        # from the contents of this node, so that
        # <div i18n:translate="">Hello, <span i18n:name="user">David</span>, how are you?</div>
        # becomes 'Hello, ${user}, how are you?'
        else
        {
            $translate_id = _canonicalize ( _extract_content_string ($tree) );
        }

        $translate_id || next;
        $Results->{$Domain} ||= {};

        my $existing_po = $Results->{$Domain}->{$translate_id};
        my $new_po = Locale::PO->new (
            -msgid  => $translate_id,
            -msgstr => _canonicalize ( _extract_content_string ($tree) ) || '',
        );

        if ($existing_po && ($existing_po->{msgstr} ne $new_po->{msgstr}))
        {
            print STDERR "String for '$translate_id' doesn't match:\n".
                         "   old: $existing_po->{msgstr}\n".
                         "   new: $new_po->{msgstr}\n"
        }

        $Results->{$Domain}->{$translate_id} = $new_po;
    };

    # I know, I know, the I18N namespace processing is a bit broken...
    # It should suffice for now
    delete $tree->{"$Prefix:attributes"};
    delete $tree->{"$Prefix:translate"};
    delete $tree->{"$Prefix:name"};

    # Do the same i18n thing with child nodes, recursively.
    # for some reason it always makes me think of roller coasters.
    # Yeeeeeeee!
    defined $tree->{_content} and do {
        for (@{$tree->{_content}}) { $class->_process ($_) }
    };
}


sub _canonicalize
{
    my $string = shift || '';
    $string =~ s/\r/ /g;
    $string =~ s/\n/ /g;
    $string =~ s/\s+/ /gsm;
    $string =~ s/^ //;
    $string =~ s/ $//;
    return $string;
}


sub _extract_content_string
{
    my $tree  = shift;
    my @res   = ();

    my $count = 0;
    foreach my $node (@{$tree->{_content}})
    {
        ref $node or do {
            push @res, $node;
            next;
        };
        
        $count++;
        my $name = $node->{"$Prefix:name"} || $count;
        push @res, '${' . $name . '}';
    }
    
    return join '', @res;
}


1;