The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl -w
# Time-stamp: <2010-05-14T16:39:18 mxp>
# Copyright © 2000 Michael Piotrowski.  All Rights Reserved.

=head1 NAME

trainlid - build transition matrix for Lingua::Ident module

=head1 SYNOPSIS

B<trainlid> [-s] I<language_name> < I<training_text> > I<matrix_file>

=head1 DESCRIPTION

B<trainlid> builds a trigram transition matrix for use with the
B<Lingua::Ident> module.  It reads a training text from standard input
and outputs a transition matrix with the specified I<language_name> as
identifier to standard output.

It is recommended that I<language_name> be a POSIX locale name
constructed from an ISO 639 2-letter language code, possibly extended
by an ISO 3166 2-letter country code and a character set
identifier. Example: B<de_DE.iso88591>.

The following options are available:

=over 4

=item B<-s>	Do not print statistics after building the language model.

=back

=head1 AUTHOR

B<trainlid> was developed by Michael Piotrowski <mxp@dynalabs.de>.

=head1 SEE ALSO

Lingua::Ident(3)

=cut

###############################################################################

require 5.004;
use Getopt::Std;

our $opt_s;   # silent

getopts("s");

while(defined($c = getc))
{
   $c =~ s/[\x00-\x1F\x21-\x40\x7B-\x7F]/ /og;
   $c =  lc($c);

   push @chars, $c;
   $alphabet{$c} = "";

   $chars = join "", @chars;
   $chars =~ s/ +/ /og;
   @chars = split //, $chars;

   $nc++;

   if(@chars == 3)
   {
      $trigram = join("", @chars);
      $matrix{$trigram}++;
      $bigrams{substr($trigram, 0, 2)}++;

      shift @chars;
   }
}


$size_of_alphabet = keys(%alphabet);
print STDERR "alphabet: ", sort(keys(%alphabet)), "\n" unless $opt_s;
undef %alphabet;

while(($trigram, $count) = each(%matrix))
{
    $prob = ($count + 1)/
	($bigrams{substr($trigram, 0, 2)} + $size_of_alphabet);

    $matrix{$trigram} = $prob;
}

print "_LANG:", $ARGV[0], "\n";
print "_NULL:", 1 / $size_of_alphabet, "\n";
print "#ALPH:$size_of_alphabet\n";

while(($key, $val) = each(%matrix))
{
    print "$key:$val\n";
}

while (($key, $val) = each %bigrams)
{
   print "$key;", $val + $size_of_alphabet, "\n";
}

print STDERR "size of alphabet: $size_of_alphabet\n" unless $opt_s;
print STDERR "chars total: $nc\n" unless $opt_s;