The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl -s

# PODNAME: tmxclean
# ABSTRACT: Simple tool to clean TMX files

use v5.10;
use strict;
use warnings;

use File::Basename qw(fileparse);
use XML::TMX::Reader;

# Command-line switches. They are populated by perl's -s option given on the
# shebang line: "-eq" sets $eq to 1, "-output=x.tmx" sets $output to "x.tmx".
our (
     $junk,    # remove the unit if one of the languages just has junk (default: on)
     $output,  # output filename
     $eq,      # remove if seg(l1) = seg(l2)
     $len,     # remove if len(li) > 50 and len(lj) > 2*len(li)
     $v, $verbose  # report progress and final statistics on STDERR
    );

# Counters shared with cleaner(), which updates them for every unit it sees.
my $cleaned = 0;
my $processed = 0;
my $tmx = shift or help();
my $reader = XML::TMX::Reader->new($tmx);

$junk //= 1;

# $verbose was declared but never consulted, so -verbose did nothing;
# treat it as the long spelling of -v.
$v ||= $verbose;

# Default output name: put the "_cleaned_" prefix on the file name only, so an
# input such as "dir/file.tmx" yields "dir/_cleaned_file.tmx" rather than the
# path "_cleaned_dir/file.tmx". A bare file name still gives "_cleaned_<name>".
unless ($output) {
    my ($name, $dir) = fileparse($tmx);
    $dir = '' if $name eq $tmx;    # no directory part in the argument
    $output = $dir . "_cleaned_$name";
}

print STDERR "loading..." if $v;

$reader->for_tu( {output => $output},
                  \&cleaner);

if ($v) {
    # An input without translation units leaves $processed at 0; report 0%
    # instead of dying with "Illegal division by zero".
    my $percent = $processed ? 100 * $cleaned / $processed : 0;
    printf STDERR "\rRemoved %d/%d (%.3f%%).\n",
                  $cleaned, $processed, $percent;
}

# Filter callback handed to $reader->for_tu: decides whether one translation
# unit is kept.
#
# Argument: a hash ref keyed by language. Keys starting with "-" are skipped;
#   every other entry is read as a hash ref holding its text under -seg.
# Returns:  undef when the unit is rejected, otherwise the same hash ref.
#   (Presumably for_tu drops units for which undef comes back -- confirm
#   against the XML::TMX::Reader documentation.)
# Side effects: increments the file-level counters $processed and $cleaned
#   and, with -v, rewrites a progress line on STDERR every 1000 units.
sub cleaner {
    my ($tu) = @_;
    $processed++;

    my $reject = 0;
    my (%seen_text, @sizes);

    for my $lang (grep { $_ !~ /^-/ } keys %$tu) {
        my $text = $tu->{$lang}{-seg};

        # -eq: this exact text already appeared under another language.
        $reject = 1 if $eq && $seen_text{$text}++;

        if ($junk) {
            # Only digits, whitespace, "-", "." and ",".
            $reject = 1 if $text =~ /^[-.,0-9\s]+$/;
            # Empty, or no word character at all.
            $reject = 1 if $text =~ /^\W*$/;
        }

        push @sizes, length $text;
    }

    # -len: the shortest segment is over 50 characters and the longest one is
    # more than twice as long.
    my ($shortest, $longest) = (sort { $a <=> $b } @sizes)[0, -1];
    $reject = 1 if $len && $shortest > 50 && $shortest * 2 < $longest;

    if ($reject) {
        $cleaned++;
    }

    if ($v && $processed % 1000 == 0) {
        printf STDERR "\rRemoved %d/%d (%.3f%%)...",
          $cleaned, $processed, 100 * $cleaned / $processed;
    }

    # Explicit undef (not a bare return) so the caller always gets one value.
    return undef if $reject;
    return $tu;
}

# Print the usage line on STDOUT and terminate the program with exit status 1.
# The line lists every switch the script reads through perl's -s option; the
# previous text mentioned only -junk.
sub help {
    print "   tmxclean [-junk=0|1] [-eq] [-len] [-output=out.tmx] [-v] <file.tmx>\n";
    exit 1;
}

__END__

=pod

=encoding UTF-8

=head1 NAME

tmxclean - Simple tool to clean TMX files

=head1 VERSION

version 0.36

=head1 SYNOPSIS

  $ tmxclean file.tmx
  $ tmxclean -eq -len -output=clean.tmx file.tmx

=head1 DESCRIPTION

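Removes the translation units that

 1. contain a segment that is empty, has no word characters, or is made
    only of digits, whitespace and "-", "." or ","  (unless -junk=0)
 2. have the same segment text in two languages,
    seg(L1) = seg(L2)  (if -eq)
 3. have very unbalanced segment lengths: the shortest segment is longer
    than 50 characters and the longest is more than twice as long  (if -len)

The result is written to the file given with C<-output=FILE>; by default the
input file name prefixed with C<_cleaned_> is used. With C<-v>, progress and
the final removal count are printed to STDERR.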

=head1 SEE ALSO

XML::TMX

=head1 AUTHORS

=over 4

=item *

Alberto Simões <ambs@cpan.org>

=item *

José João Almeida <jj@di.uminho.pt>

=back

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2010-2017 by Projeto Natura <natura@di.uminho.pt>.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut