The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package Lingua::Align::Features::Orthography;

use 5.005;
use strict;
use utf8;

use vars qw(@ISA);
use vars qw(%__LCSR_CACHE $__LCSR_CACHE_SIZE $__LCSR_CACHE_COUNT);

@ISA = qw(Lingua::Align::Features::Tree);

$__LCSR_CACHE_COUNT=0;
$__LCSR_CACHE_SIZE=1000000;


# get features from tree pairs for specific node pairs

sub get_features{
    my $self=shift;
    my ($src,$trg,$srcN,$trgN,$FeatTypes,$values)=@_;

    my @srcwords = $self->{TREES}->get_leafs($src,$srcN);
    my $srcstr = join(' ',@srcwords);
    my @trgwords = $self->{TREES}->get_leafs($trg,$trgN);
    my $trgstr = join(' ',@trgwords);

    my $srclen=length($srcstr);
    my $trglen=length($trgstr);

    if (exists $$FeatTypes{lendiff}){
	$$values{lendiff}=abs($srclen-$trglen);
    }
    if (exists $$FeatTypes{lenratio}){
	if ($srclen>$trglen){
	    $$values{lenratio}=$trglen/$srclen;
	}
	else{
	    $$values{lenratio}=$srclen/$trglen;
	}
    }

    # string similarity features
    # (is this really a good idea to do for all nodes? ---> expensive & slow!)

    $self->string_sim_features($srcstr,$trgstr,$srclen,$trglen,
			       $FeatTypes,$values);

    # string class features
    $self->string_class_features($srcstr,$trgstr,$FeatTypes,$values);

    # sub-string features
    if ($self->{TREES}->is_terminal($src,$srcN)){
	if ($self->{TREES}->is_terminal($trg,$trgN)){
	    $self->substring_features($srcstr,$trgstr,$FeatTypes,$values);
	}
    }

}




sub substring_features{
    my $self=shift;
    my ($srcstr,$trgstr,$FeatTypes,$values)=@_;

    if (exists $FeatTypes->{suffix}){
	if ($FeatTypes->{suffix}>0){
	    my $SuffixLength=$FeatTypes->{suffix};
	    my $suffixSrc = substr($srcstr, 0-$SuffixLength);
	    my $suffixTrg = substr($trgstr, 0-$SuffixLength);
	    my $pair = $suffixSrc.'_'.$suffixTrg;
	    $$values{'suffix_'.$pair} = 1;
	}
    }

    if (exists $FeatTypes->{prefix}){
	if ($FeatTypes->{prefix}>0){
	    my $PrefixLength=$FeatTypes->{prefix};
	    my $prefixSrc = substr($srcstr, $PrefixLength);
	    my $prefixTrg = substr($trgstr, $PrefixLength);
	    my $pair = $prefixSrc.'_'.$prefixTrg;
	    $$values{'prefix_'.$pair} = 1;
	}
    }

    if (exists $FeatTypes->{word}){
	my $pair = $srcstr.'_'.$trgstr;
	$$values{'word_'.$pair} = 1;
    }
}


sub string_class_features{
    my $self=shift;
    my ($srcstr,$trgstr,$FeatTypes,$values)=@_;

    if (exists $$FeatTypes{isnumber}){
	if ($srcstr=~/^[\d\.\,]+\%?$/){
	    if ($trgstr=~/^[\d\.\,]+\%?$/){
		$$values{isnumber}=1;
	    }
	}
    }

    if (exists $$FeatTypes{hasdigit}){
#	if ($self->{TREES}->is_terminal($src,$srcN)){
#	    if ($self->{TREES}->is_terminal($trg,$trgN)){
		if ($srcstr=~/\d/){
		    if ($trgstr=~/\d/){
			$$values{digit}=1;
#			print STDERR "$srcstr .... $trgstr\n";
		    }
		}
#	    }
#	}
    }

    if (exists $$FeatTypes{ispunct}){
	if ($srcstr=~/^\p{P}$/){
	    if ($trgstr=~/^\p{P}$/){
		$$values{ispunct}=1;
	    }
	}
    }

    if (exists $$FeatTypes{punct}){
	if ($srcstr=~/^\p{P}$/){
	    if ($trgstr=~/^\p{P}$/){
		$$values{"punct\_$srcstr\_$trgstr"}=1;
	    }
	}
    }


    if (exists $$FeatTypes{haspunct}){
#	if ($self->{TREES}->is_terminal($src,$srcN)){
#	    if ($self->{TREES}->is_terminal($trg,$trgN)){
		if ($srcstr=~/\p{P}/){
		    if ($trgstr=~/\p{P}/){
			$$values{haspunct}=1;
#			print STDERR "$srcstr .... $trgstr\n";
		    }
		}
#	    }
#	}
    }
}



sub string_sim_features{
    my $self=shift;
    my ($srcstr,$trgstr,$srclen,$trglen,$FeatTypes,$values)=@_;

    if (exists $$FeatTypes{identical}){
	my $minLength=1;
	if (defined $$FeatTypes{identical}){
	    $minLength=$$FeatTypes{identical};
	}
	if ($srclen>=$minLength && $trglen>=$minLength){
	    if ($srcstr eq $trgstr){
		$$values{identical}=1;
	    }
	}
    }

    if (exists $$FeatTypes{lcsr}){
	my $minLength=1;
	if (defined $$FeatTypes{lcsr}){
	    $minLength=$$FeatTypes{lcsr};
	}
	if ($srclen>=$minLength && $trglen>=$minLength){
	    if ($srcstr eq $trgstr){
		$$values{lcsr}=1;
	    }
	    else{
		$$values{lcsr}=lcsr($srcstr,$trgstr);
	    }
	}
    }


    if (exists $$FeatTypes{lcsrlc}){
	my $minLength=1;
	if (defined $$FeatTypes{lcsrlc}){
	    $minLength=$$FeatTypes{lcsrlc};
	}
	if ($srclen>=$minLength && $trglen>=$minLength){
	    my $SrcStr=lc($srcstr);
	    my $TrgStr=lc($trgstr);
	    if ($SrcStr eq $TrgStr){
		$$values{lcsrlc}=1;
	    }
	    else{
		$$values{lcsrlc}=lcsr($SrcStr,$TrgStr);
	    }
	}
    }


    ## ignore non-ascii for lcsr scores!
    if (exists $$FeatTypes{lcsrascii}){
	my $SrcStr=$srcstr;
	my $TrgStr=$trgstr;
	$SrcStr=~s/[^a-z0-9]//gi;
	$TrgStr=~s/[^a-z0-9]//gi;
	my $minLength=0;
	if (defined $$FeatTypes{lcsrascii}){
	    $minLength=$$FeatTypes{lcsrascii};
	}
	if (length($SrcStr)>=$minLength && length($TrgStr)>=$minLength){
	    if ($SrcStr eq $TrgStr){
		$$values{lcsrascii} = 1;
	    }
	    else{
		$$values{lscrascii} = lcsr($SrcStr,$TrgStr);
	    }
	}
    }

    ## ignore non-ascii for lcsr scores!
    if (exists $$FeatTypes{lcsrcons}){
	my $SrcStr=$srcstr;
	my $TrgStr=$trgstr;
	$SrcStr=~s/[aeiuoAEIOUÀÁÂÃÄÅÆÈÉÊËÌÍÎÏÒÓÔÕÖÙÚÛÜÝàáâãäåæèéêëìíîïñòóôõöùúûüýÿ]//g;
	$TrgStr=~s/[aeiuoAEIOUÀÁÂÃÄÅÆÈÉÊËÌÍÎÏÒÓÔÕÖÙÚÛÜÝàáâãäåæèéêëìíîïñòóôõöùúûüýÿ]//g;

	my $minLength=0;
	if (defined $$FeatTypes{lcsrcons}){
	    $minLength=$$FeatTypes{lcsrcons};
	}
	if (length($SrcStr)>=$minLength && length($TrgStr)>=$minLength){
	    if ($SrcStr eq $TrgStr){
		$$values{lcsrcons} = 1;
	    }
	    else{
		$$values{lscrcons} = lcsr($SrcStr,$TrgStr);
	    }
	}
    }
}








sub lcsr{
    my ($str1,$str2)=@_;
    if (exists $__LCSR_CACHE{$str1}{$str2}){
	return $__LCSR_CACHE{$str1}{$str2};
    }
    my $score=&lcs($str1,$str2);
    if (length($str1)>length($str2)){
	$score/=length($str1);
    }
    if (length($str2)>0){
	$score/=length($str2);
    }
    if ($__LCSR_CACHE_COUNT<$__LCSR_CACHE_SIZE){
	$__LCSR_CACHE{$str1}{$str2}=$score;
	$__LCSR_CACHE_COUNT++;
    }
    return $score;
}



sub lcs {
  my ($src,$trg)=@_;
  my (@l,$i,$j);
  my @src_let=split(//,$src);
  my @trg_let=split(//,$trg);
  unshift (@src_let,'');
  unshift (@trg_let,'');
  for ($i=0;$i<=$#src_let;$i++){
      $l[$i][0]=0;
  }
  for ($i=0;$i<=$#trg_let;$i++){
      $l[0][$i]=0;
  }
  for $i (1..$#src_let){
      for $j (1..$#trg_let){
	  if ($src_let[$i] eq $trg_let[$j]){
	      $l[$i][$j]=$l[$i-1][$j-1]+1;
	  }
	  else{
	      if ($l[$i][$j-1]>$l[$i-1][$j]){
		  $l[$i][$j]=$l[$i][$j-1];
	      }
	      else{
		  $l[$i][$j]=$l[$i-1][$j];
	      }
	  }
      }
  }
  return $l[$#src_let][$#trg_let];
}




1;