lib/Uplug/StrSim.pm - metacpan.org

#-*-perl-*-
#####################################################################
# Copyright (C) 2004 Jörg Tiedemann  <joerg@stp.ling.uu.se>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#####################################################################
# string similarity measures
#####################################################################


package Uplug::StrSim;

use vars qw(@ISA @EXPORT);
@ISA = qw( Exporter);
@EXPORT = qw( &similar );

#####################################################################
# GetNrSims
#####################################################################


sub GetNrSims
{
  return 4;
}

#####################################################################
# similar($src,$trg,$meth[,\%weights[,$splitpattern]])
#####################################################################
# returns string similarity score for ($src,$trg)
#   $meth specifies similarity metric
#   $norm specifies normalization of scores
#      0 -> no normalization
#      1 -> normalization by length of longer string
#      2 -> normalization by length of shorter string

sub similar
{
  my ($src,$trg,                         # source and target string
      $meth,                             # similarity measure
      $norm,                             # normalization
      $W,                                # weight function
      $pattern)=@_;                      # pattern for splitting into char's

  my ($score,$first,$last)=(0,0,0);
#  $src=&my_lc($src);                     # make lower case versions
#  $trg=&my_lc($trg);                     #   of source and target strings
  if (length($src)>length($trg)){        # source string should be the shorter
      ($src,$trg)=($trg,$src);           #   one -> swap if necessary
  }

#---------------------------------------------------------------------------
# now, call the sub-function for string similarity score calculation
#---------------------------------------------------------------------------

  if (($meth==2) or ($meth eq 'ord')) {
      ($score,$first,$last)=
	  &LCIFS($src,$trg,$W,$pattern);
  }
  elsif (($meth==4) or ($meth eq 'pos')) {
      ($score,$first,$last)=
	  &LNCCRP($src,$trg,$W,$pattern);
  }
  elsif (($meth==6) or ($meth eq 'best')) {
      ($score,$first,$last)=
	  &BestSimilar($src,$trg,$W,$pattern);
  }
  else {
      ($score,$first,$last)=              # LCSR is the default metric
	  &LCS($src,$trg,$W,$pattern);
  }

#---------------------------------------------------------------------------
# score normalization
#---------------------------------------------------------------------------

  if ($norm==1) {
	if (length($trg)>0) {$score/=(length($trg));}
  }
  if ($norm==2) {
	if (length($src)>0) {$score/=(length($src));}
  }
  if ((defined $first) and (defined $last)){
      return ($score,$first,$last);
  }
  return $score;
}


###########################################################################
# LCSR($src,$trg,[\%weights[,$SplitPattern[,\%trace[,$PrintMatrix]]]])
#--------------------------------------------------------------------------
# longest common sub-sequence ratio
#--------------------------------------------------------------------------
# $src          - source string
# $trg          - target string
# %weights      - weights for character pairs
# $SplitPattern - pattern for splitting strings into characters
# %trace        - trace of character matches
# $PrintMatrix  - ==1 -> print the LSCR matrix
###########################################################################

sub LCSR {

  my ($src,$trg,$W,$pattern,$trace,$printMatrix)=@_;
  my $score=&LCS($src,$trg,$W,$pattern,$trace,$printMatrix);
  if (length($src)>length($trg)){
      return $score/length($src);
  }
  if (length($trg)>0){
      return $score/length($trg);
  }
  return 0;
}

###########################################################################
# LCS($src,$trg,[\%weights[,$SplitPattern[,\%trace[,$PrintMatrix]]]])
#--------------------------------------------------------------------------
# longest common sub-sequence
###########################################################################

sub LCS {

  my ($src,$trg,$W,$pattern,$trace,$printMatrix)=@_;
  my (@l,$i,$j);
  my @src_let=split(/$pattern/,$src);		# split string into char
  my @trg_let=split(/$pattern/,$trg);
  unshift (@src_let,'');
  unshift (@trg_let,'');
  for ($i=0;$i<=$#src_let;$i++){                # initialize the matrix
      $l[$i][0]=0;
  }
  for ($i=0;$i<=$#trg_let;$i++){
      $l[0][$i]=0;
  }                                                       # weight function is
  if (defined $W){                                        # defined:
      for $i (1..$#src_let){                              #   if the pair
	  for $j (1..$#trg_let){                          #   ['.','.'] is 
	      if (($$W{'.'}{'.'}) and                     #   defined in %W:
		  (not $$W{$src_let[$i]}{$trg_let[$j]})){ #   -> count all
		  if ($src_let[$i] eq $trg_let[$j]){      #      identical 
		      $$W{$src_let[$i]}{$trg_let[$j]}=1;  #      matches with 
		  }                                       #      score=1 if the
	      }                                           #      pair is not
	      my $best;                                   #      included in %W
	      if ($$W{$src_let[$i]}{$trg_let[$j]}){
		  $best=$l[$i-1][$j-1]+$$W{$src_let[$i]}{$trg_let[$j]};
	      }
	      if ($l[$i][$j-1]>$best){
		  $best=$l[$i][$j-1];
	      }
	      if ($l[$i-1][$j]>$best){
		  $best=$l[$i-1][$j];
	      }
	      $l[$i][$j]=$best;
	  }
      }
  }
  else{
      for $i (1..$#src_let){
	  for $j (1..$#trg_let){
	      if ($src_let[$i] eq $trg_let[$j]){
		  $l[$i][$j]=$l[$i-1][$j-1]+1;
	      }
	      else{
		  if ($l[$i][$j-1]>$l[$i-1][$j]){
		      $l[$i][$j]=$l[$i][$j-1];
		  }
		  else{
		      $l[$i][$j]=$l[$i-1][$j];
		  }
	      }
	  }
      }
  }
  if (defined $trace){                           # save the trace of character
      $i=$#l;                                    # matches if %trace is defined
      $j=$#{$l[0]};
      while (($i>0) and ($j>0)){
	  if ($l[$i][$j]==$l[$i-1][$j]){
	      $$trace{$i}{$j}=$l[$i][$j]-$l[$i-1][$j];
	      $i-=1;
	  }
	  elsif($l[$i][$j]==$l[$i][$j-1]){
	      $$trace{$i}{$j}=$l[$i][$j]-$l[$i][$j-1];
	      $j-=1;
	  }
	  else{
	      $$trace{$i}{$j}=$l[$i][$j]-$l[$i-1][$j-1];
	      $i-=1;
	      $j-=1;
	  }
      }
  }
 
  if ($printMatrix){
      print '   ';
      foreach (0..$#src_let){
	  printf "%4s ", $src_let[$_];
      }
      print "\n";
      
      foreach (0..$#trg_let){
	  my $i;
	  printf "%3s ", $trg_let[$_];
	  foreach $i (0..$#src_let){
	      printf "%1.2f",$l[$i][$_];
	      print " ";
	  }
	  print "\n";
      }
  }

  return $l[$#src_let][$#trg_let];
}



#####################################################################
# GetNonMatches($src,$trg,\%NonMatchPairs)'
#--------------------------------------------------------------------
# get all non-matching pairs from two strings
#####################################################################

sub GetNonMatches{

    my ($src,$trg,$res)=@_;

    my %trace;
    my $score=&LCS($src,$trg,undef,'',\%trace);

    my @SRC=split(//,$src);
    my @TRG=split(//,$trg);

    my $i=1;
    my $j=1;

    my $x=1;
    my $y=1;
    my ($srcnot,$trgnot)=('','');

    my $matches='';
    my $nonmatches='';
    my $SrcNonMatch='';
    my $TrgNonMatch='';

    foreach $i (sort {$a <=> $b} keys %trace){
	foreach $j (sort {$a <=> $b} keys %{$trace{$i}}){
	    if ($trace{$i}{$j}){
		$matches.=$SRC[$i-1];
		while ($x<$i){
		    $srcnot.=$SRC[$x-1];
		    $x++;
		}
		while ($y<$j){
		    $trgnot.=$TRG[$y-1];
		    $y++;
		}
		$x++;
		$y++;
		if ($srcnot or $trgnot){
		    $$res{$srcnot}{$trgnot}++;
		    $SrcNonMatch.=$srcnot.'*';
		    $TrgNonMatch.=$trgnot.'*';
		    $nonmatches.='('.$srcnot.'|'.$trgnot.').*';
		}
		else{
		    if (not $nonmatches){
			$nonmatches='.*';
			$SrcNonMatch.='*';
			$TrgNonMatch.='*';
		    }
		}
		($srcnot,$trgnot)=('','');
	    }
	    else{
		if ($matches!~/\*$/){
		    $matches.='*';
		}
	    }
	}
    }
    while ($x<=@SRC){
	$srcnot.=$SRC[$x-1];
	$x++;
    }
    while ($y<=@TRG){
	$trgnot.=$TRG[$y-1];
	$y++;
    }
    if ($srcnot or $trgnot){
	$$res{$srcnot}{$trgnot}++;
#	$nonmatches.="\{'".$srcnot."' \=\> '".$trgnot."'\}\*";
	$nonmatches.='('.$srcnot.'|'.$trgnot.')';
	$SrcNonMatch.=$srcnot;
	$TrgNonMatch.=$trgnot;
    }
    return ($score,$nonmatches,$matches,
	    $SrcNonMatch,$TrgNonMatch);
}


#####################################################################
# GetNonMatches($src,$trg,\%NonMatchPairs)'
#--------------------------------------------------------------------
# get all non-matching pairs from two strings
#####################################################################

sub GetNonMatchesOld{

    my ($src,$trg,$res)=@_;

    my %trace;
    my $score=&LCS($src,$trg,undef,'',\%trace);

    my @SRC=split(//,$src);
    my @TRG=split(//,$trg);

    my $i=1;
    my $j=1;

    my $x=1;
    my $y=1;
    my ($srcnot,$trgnot)=('','');

    my $matches='';
    my $nonmatches='';

    foreach $i (sort {$a <=> $b} keys %trace){
	foreach $j (sort {$a <=> $b} keys %{$trace{$i}}){
	    if ($trace{$i}{$j}){
		$matches.=$SRC[$i];
		while ($x<$i){
		    $srcnot.=$SRC[$x-1];
		    $x++;
		}
		while ($y<$j){
		    $trgnot.=$TRG[$y-1];
		    $y++;
		}
		$x++;
		$y++;
		if ($srcnot or $trgnot){
		    $$res{$srcnot}{$trgnot}++;
		    $nonmatches.='('.$srcnot.'|'.$trgnot.').*';
		}
		else{
		    if (not $nonmatches){
			$nonmatches='.*';
		    }
		}
		($srcnot,$trgnot)=('','');
	    }
	    else{
		if ($matches!~/\*$/){
		    $matches.='*';
		}
	    }
	}
    }
    while ($x<=@SRC){
	$srcnot.=$SRC[$x-1];
	$x++;
    }
    while ($y<=@TRG){
	$trgnot.=$TRG[$y-1];
	$y++;
    }
    if ($srcnot or $trgnot){
	$$res{$srcnot}{$trgnot}++;
#	$nonmatches.="\{'".$srcnot."' \=\> '".$trgnot."'\}\*";
	$nonmatches.='('.$srcnot.'|'.$trgnot.')';
    }
    return ($score,$nonmatches,$matches);
}


#####################################################################
# LCIS($src,$trg)
#####################################################################
# longest common initial subsequence

sub LCIS
{
  my ($src,$trg,$W,$pattern)=@_;
  my ($i,$j,$score,$first,$last)=(0,0,0,0,0);
  if (length($src)>length($trg)) {($src,$trg)=($trg,$src);}
  @src_let=split(/$pattern/,$src);      # split words into single
  @trg_let=split(/$pattern/,$trg);	# letters
  while ($i<@src_let)			# until last letter is reached
  {
    if ($src_let[$i] eq $trg_let[$j])   # if same letters at
    {					# current positions
      $score++;                         # -> increment score and
      $i++;				# the position in the smaller word
      if (not $first) {$first=$j+1;}	# remember postition of the first
      $last=$j+1;			# and the last match
    }
    if ($j<@trg_let) {$j++;}		# increment the position in the
    else {last;}			# longer word, if not endposition
  }
  return ($score,$first,$last);		# return score, first and last
}

#####################################################################
# LCIFS($src,$trg)
#####################################################################
# max(longest common initial subsequence,
#     longest common final subsequence)

sub LCIFS
{
  my ($src,$trg,$W,$pattern)=@_;
  my ($i,$j,$score1,$first1,$last1)=(0,0,0,0,0);
  my ($score2,$first2,$last2)=(0,0,0);
  if (length($src)>length($trg)) {($src,$trg)=($trg,$src);}

  ($score1,$first1,$last1)=&LCIS($src,$trg);

  @src_let=split(/$pattern/,$src);      # split words into single
  @trg_let=split(/$pattern/,$trg);	# letters
  ($i,$j)=(@src_let-1,@trg_let-1);	# and now from the last character
  while ($i>=0)
  {
    if ($src_let[$i] eq $trg_let[$j])	# same as above
    {
      $score2++;
      $i--;				# but decrement positions
      if (not $last2) {$last2=$j+1;}	# remember postition of the last
      $first2=$j+1;			# and the first match
    }
    if ($j>0) {$j--;}
    else {last;};
  }

  if ($score1>=$score2){return ($score1,$first1,$last1);}
  return ($score2,$first2,$last2);
}

#####################################################################
# LNCCP($src,$trg)
#####################################################################
# largest number of common characters at same positions

sub LNCCP
{
  my ($src,$trg,$W,$pattern)=@_;
  my ($i,$score,$first,$last)=(0,0,0);
  if (length($src)>length($trg)) {($src,$trg)=($trg,$src);}
  @src_let=split(/$pattern/,$src);	# split words into single
  @trg_let=split(/$pattern/,$trg);      # letters
  for ($i=0;$i<@src_let;$i++)		# for every letter of the
  {					# string
    if ($src_let[$i] eq $trg_let[$i])	# if letters equal
    {                                   # at same positions
      	$score++;			# -> increment score
	if (not $first) {$first=$i+1;}
	$last=$i+1;
    }
  }
  return ($score,$first,$last);		# and return
}

#####################################################################
# LNCCRP($src,$trg)
#####################################################################
# largest number of common characters at same relativ positions

sub LNCCRP
{
  my ($src,$trg,$W,$pattern)=@_;

  my ($j,$i,$best,$score,$first,$last,$first_tmp,$last_tmp)=(0,0,0,0,0,0,0,0);
  if (length($src)>length($trg)) {($src,$trg)=($trg,$src);}

  @src_let=split(/$pattern/,$src);
  @trg_let=split(/$pattern/,$trg);
  for ($j=0-@src_let;$j<@trg_let;$j++){
      $score=0;
      $first_tmp=0;
      for ($i=0;$i<@src_let;$i++)
      {
	  if (($i+$j>=0) and ($i+$j<@trg_let)){
	      if ($src_let[$i] eq $trg_let[$i+$j])
	      {
		  $score++;
		  $last_tmp=$j+$i+1;
		  if (not $first_tmp){$first_tmp=$j+$i+1;}
	      }
	  }
      }
      if ($score>$best)
      {
	  $best=$score;
	  $first=$first_tmp;
	  $last=$last_tmp;
      }
  }
  return ($best,$first,$last);
}


#####################################################################
# BestSimilar($src,$trg)                                            #
#####################################################################
# calculate similarity scores with all available measures and
# return the highest for the current string pair

sub BestSimilar
{
  my ($src,$trg)=@_;
  my $sims=&GetNrSims;
  my ($i,$bestscore,$bestlast,$bestfirst)=(0,0,0,0);
  my ($score,$first,$last);
  for $i (0..$sims-2){                     # don't do BestSimilar again ...
      ($score,$first,$last)=similar($src,$trg,$i);
      if ($score>$bestscore){
	  $bestscore=$score;
	  $bestfirst=$first;
	  $bestlast=$last;
      }
  }
  return ($bestscore,$bestfirst,$bestlast);
}

#####################################################################
# CombScore(\@ScoreMatrix,\@ScoreComb,$meth)
#####################################################################
# scores[source][target]: score matrix
# $meth specifies algorithm
# @comb contains resulting score matrix

sub CombScore
{
  local (*scores,*comb,$meth)=@_;
  if (($meth==0)or($meth eq 'sub_s')) {&CombScore1(\@scores,\@comb);}
  if (($meth==1)or($meth eq 'sub_t')) {&CombScore2(\@scores,\@comb);}
  if (($meth==2)or($meth eq 'sub_l')) {&CombScore3(\@scores,\@comb);}
  if (($meth==3)or($meth eq 'sub_b')) {&CombScore4(\@scores,\@comb);}
  if (($meth==4)or($meth eq 'sub_s_sub_b')) {&CombScore5(\@scores,\@comb);}
  if (($meth==5)or($meth eq 'sub_t_sub_b')) {&CombScore6(\@scores,\@comb);}
  if (($meth==6)or($meth eq 'sub_l_sub_b')) {&CombScore7(\@scores,\@comb);}
}

#####################################################################
# score combination 1 (score-s)                                     #
#####################################################################

sub CombScore1{

    local (*scores,*comb)=@_;
    for $i (0..$#scores){
	for $j (0..$#{$scores[$i]}){
	    for $k (0..$#scores){
		if ($k==$i) {$comb[$i][$j]+=$scores[$k][$j];}
		else {$comb[$i][$j]-=$scores[$k][$j];}

	    }
	}
    }
}

#####################################################################
# score combination 2 (score-t)                                     #
#####################################################################

sub CombScore2{

    local (*scores,*comb)=@_;
    for $i (0..$#scores){
	for $j (0..$#{$scores[$i]}){
	    for $k (0..$#{$scores[$i]}){
		if ($k==$j) {$comb[$i][$j]+=$scores[$i][$k];}
		else {$comb[$i][$j]-=$scores[$i][$k];}

	    }
	}
    }
}

#####################################################################
# score combination 3 (score-l)                                     #
#####################################################################

sub CombScore3{
    local (*scores,*comb)=@_;
    if ($#scores>$#{$scores[0]}){CombScore2(\@scores,\@comb);}
    else{CombScore1(\@scores,\@comb);}
}


#####################################################################
# score combination 4 (lead_avr)                                    #
#####################################################################

sub CombScore4{

    local (*scores,*comb)=@_;
    for $i (0..$#scores){
	for $j (0..$#{$scores[$i]}){
	    my $best=-99999999;
	    my $bestpos=0;
	    for $k (0..$#scores){
		if ($k!=$i) {
		    if ($scores[$k][$j]>$best){
			$best=$scores[$k][$j];
			$bestpos=$k;
		    }
		}
	    }
	    $comb[$i][$j]=$scores[$i][$j]-$scores[$bestpos][$j];
	}
    }
}


#####################################################################
# score combination lead_avr-s                                      #
#####################################################################

sub CombScore5{
    local (*scores,*comb)=@_;
    my @tmparr;
    &CombScore1(\@scores,\@comb);
    &CombScore4(\@comb,\@tmparr);
    @comb=@tmparr;
}

#####################################################################
# score combination lead_avr-t                                      #
#####################################################################

sub CombScore6{
    local (*scores,*comb)=@_;
    my @tmparr;
    &CombScore2(\@scores,\@comb);
    &CombScore4(\@comb,\@tmparr);
    @comb=@tmparr;
}

#####################################################################
# score combination lead_avr-l                                      #
#####################################################################

sub CombScore7{
    local (*scores,*comb)=@_;
    my @tmparr;
    &CombScore3(\@scores,\@comb);
    &CombScore4(\@comb,\@tmparr);
    @comb=@tmparr;
}











1;
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)