The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/local/bin/perl
##################################################################
# Postprocessing script for GMA alignment
# <align file> contains entries as i,j,...k <=> l,m,...n
#script places lines i, j,... k from  file1 in file1.a
#lines l,m,...n from  file2 in file2.a
#results in file1.a and file2.a being aligned files
# optional parameters:
# threshold: pring the names of the files when more than threshold %
#   of the lines are missing
# -s parameter for just statistics, do not generate aligned files
# if the following parameters are specified, 
# print error message if a string is missing and it is not marked 'omitted' and the line is not empty
# and some word in the line is found in the lexicon
# file1 morph - morpholised version of text1 (that was used for axis generation
# file2 morph - morpholised version of text2 (that was used for axis generation
# file1 lexicon 
# file2 lexicon
# 
##################################################################


if($#ARGV<2)
{
    print "usage $0 <align file> <text1> <text2> [<threshold> <-e> <file1 morph> <file2 morph> <file1 lexicon> <file2 lexicon>]\n";
    exit 0;
}

$statonly  = 0;
if($ARGV[4] eq '-s')
{
    $statonly = 1;
}


#empty lines from file 1
$cnt = 0;
open(F1, $ARGV[1]);
while(<F1>)
{
    $cnt++;
    $emptyF1{$cnt} = 1 if(length($_)<=2);
    
}

close(F1);
#empty lines from file 2
$cnt = 0;
open(F2, $ARGV[2]);
while(<F2>)
{
    $cnt++;
    $emptyF2{$cnt} = 1 if(length($_)<=2);
}

close(F2);


open(ALIGN, $ARGV[0]);

$lang1 =  $ARGV[1];
$lang2  =  $ARGV[2];
$thresh = $ARGV[3];
$lang1morph =  $ARGV[5];
$lang2morph  =  $ARGV[6];
$lang1lex = $ARGV[7];
$lang2lex = $ARGV[8];
#$dir = $ARGV[3];

#read in lang1 lexicon
undef $/;
open(F1, $lang1lex) || die "Couldn't open $lang1lex: $!\n";
$lang1lexdata = <F1>;
close(F1);
open(F2, $lang2lex) || die "Couldn't open $lang2lex: $!\n";
$lang2lexdata = <F2>;
close(F2);
$/ = "\n";

if(!($statonly))
{
    system("rm $lang1.a")  if(-f "$lang1.a" );
    system("rm $lang2.a")  if(-f "$lang2.a" );
#exit 0 if(-f "$lang1.a" );
#exit 0 if(-f "$lang2.a" );
}

while(<ALIGN>)
{
#    print stderr $_;
    chop;
    @align = split '<=>';
    @l1  = split ',',$align[0];
    @l2   = split ',',$align[1];

    if($align[0]=~/omitted/ || $align[1] =~ /omitted/)
    {
	#print "omitted $align[0] $align[1]\n";
	foreach $l(@l1){
	    $l1_omit[$l]++;	
	}
	foreach $l(@l2){
	    $l2_omit[$l]++;	
	}


    }
    else
    {    
	foreach $l(@l1){
	    system("sellinesnonewline $l $l $lang1 >>  $lang1.a") if(!($statonly));
	    $l1_check[$l]++;	
	}

	foreach $l(@l2){
	    system("sellinesnonewline $l $l $lang2 >>  $lang2.a") if(!($statonly));
	    $l2_check[$l]++;
	}

	system("echo ''>>  $lang1.a");
	system("echo ''>>  $lang2.a");
	
    }
}


$numlines1 = `wc -l $lang1`;
$numlines2 = `wc -l $lang2`;

##################################################################################################
# print error message if a string is missing and it is not marked 'omitted' and the line is not empty
# and some word in the line is found in the lexicon
##################################################################################################

for($i = 1; $i<=$numlines1; $i++)
{
    if($l1_check[$i]==0 && $l1_omit[$i] == 0 && !(exists $emptyF1{$i}) )
    {
    #print "$lang1 missing $i\n";
    $str  = `head -$i $lang1morph | tail -1`;
    @words = split " ", $str;
    $found = 0;
    foreach $w(@words)
    {
	$found = 1 if($lang1lexdata =~ /\b\Q$w\E\b/)	   	    
    }

    if($found){
	print  "$lang1 missing $i:$str\n"; 
	$missing1++;
    }

    #exit 0;
    }
}

for($i = 1; $i<=$numlines2; $i++)
{
    if($l2_check[$i]==0 && $l2_omit[$i] == 0 && !(exists $emptyF2{$i}))
    {
    #print "$lang2 missing $i\n";
     $str  = `head -$i $lang2morph | tail -1`;
    @words = split " ", $str;
    $found = 0;
    foreach $w(@words)
    {
	$found = 1 if($lang2lexdata =~ /\b\Q$w\E\b/)	   	    
    }

    if($found){	
	print "$lang2 missing $i:$str\n";
	$missing2++;
    }
    #exit 0;
    }
}

print("$lang1\n") if($missing1/$numlines1 > $thresh);
print("$lang2\n") if($missing2/$numlines2 > $thresh);