##################################################################### # Copyright (C) 2004 Jörg Tiedemann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # ##################################################################### # string similarity measures ##################################################################### # $Author: joerg72 $ # $Id: StrSim.pm,v 1.3 2005/03/24 10:32:24 joerg72 Exp $ package Uplug::StrSim; use vars qw(@ISA @EXPORT); @ISA = qw( Exporter); @EXPORT = qw( &similar ); ##################################################################### # GetNrSims ##################################################################### sub GetNrSims { return 4; } ##################################################################### # similar($src,$trg,$meth[,\%weights[,$splitpattern]]) ##################################################################### # returns string similarity score for ($src,$trg) # $meth specifies similarity metric # $norm specifies normalization of scores # 0 -> no normalization # 1 -> normalization by length of longer string # 2 -> normalization by length of shorter string sub similar { my ($src,$trg, # source and target string $meth, # similarity measure $norm, # normalization $W, # weight function $pattern)=@_; # pattern for splitting into char's my ($score,$first,$last)=(0,0,0); # $src=&my_lc($src); # make lower case versions # $trg=&my_lc($trg); # of source and target strings if (length($src)>length($trg)){ # source string should be the shorter ($src,$trg)=($trg,$src); # one -> swap if necessary } #--------------------------------------------------------------------------- # now, call the sub-function for string similarity score calculation #--------------------------------------------------------------------------- if (($meth==2) or ($meth eq 'ord')) { ($score,$first,$last)= &LCIFS($src,$trg,$W,$pattern); } elsif (($meth==4) or ($meth eq 'pos')) { ($score,$first,$last)= &LNCCRP($src,$trg,$W,$pattern); } elsif (($meth==6) or ($meth eq 'best')) { ($score,$first,$last)= &BestSimilar($src,$trg,$W,$pattern); } else { ($score,$first,$last)= # LCSR is the default metric &LCS($src,$trg,$W,$pattern); } #--------------------------------------------------------------------------- # score normalization #--------------------------------------------------------------------------- if ($norm==1) { if (length($trg)>0) {$score/=(length($trg));} } if ($norm==2) { if (length($src)>0) {$score/=(length($src));} } if ((defined $first) and (defined $last)){ return ($score,$first,$last); } return $score; } ########################################################################### # LCSR($src,$trg,[\%weights[,$SplitPattern[,\%trace[,$PrintMatrix]]]]) #-------------------------------------------------------------------------- # longest common sub-sequence ratio #-------------------------------------------------------------------------- # $src - source string # $trg - target string # %weights - weights for character pairs # $SplitPattern - pattern for splitting strings into characters # %trace - trace of character matches # $PrintMatrix - ==1 -> print the LSCR matrix ########################################################################### sub LCSR { my ($src,$trg,$W,$pattern,$trace,$printMatrix)=@_; my $score=&LCS($src,$trg,$W,$pattern,$trace,$printMatrix); if (length($src)>length($trg)){ return $score/length($src); } if (length($trg)>0){ return $score/length($trg); } return 0; } ########################################################################### # LCS($src,$trg,[\%weights[,$SplitPattern[,\%trace[,$PrintMatrix]]]]) #-------------------------------------------------------------------------- # longest common sub-sequence ########################################################################### sub LCS { my ($src,$trg,$W,$pattern,$trace,$printMatrix)=@_; my (@l,$i,$j); my @src_let=split(/$pattern/,$src); # split string into char my @trg_let=split(/$pattern/,$trg); unshift (@src_let,''); unshift (@trg_let,''); for ($i=0;$i<=$#src_let;$i++){ # initialize the matrix $l[$i][0]=0; } for ($i=0;$i<=$#trg_let;$i++){ $l[0][$i]=0; } # weight function is if (defined $W){ # defined: for $i (1..$#src_let){ # if the pair for $j (1..$#trg_let){ # ['.','.'] is if (($$W{'.'}{'.'}) and # defined in %W: (not $$W{$src_let[$i]}{$trg_let[$j]})){ # -> count all if ($src_let[$i] eq $trg_let[$j]){ # identical $$W{$src_let[$i]}{$trg_let[$j]}=1; # matches with } # score=1 if the } # pair is not my $best; # included in %W if ($$W{$src_let[$i]}{$trg_let[$j]}){ $best=$l[$i-1][$j-1]+$$W{$src_let[$i]}{$trg_let[$j]}; } if ($l[$i][$j-1]>$best){ $best=$l[$i][$j-1]; } if ($l[$i-1][$j]>$best){ $best=$l[$i-1][$j]; } $l[$i][$j]=$best; } } } else{ for $i (1..$#src_let){ for $j (1..$#trg_let){ if ($src_let[$i] eq $trg_let[$j]){ $l[$i][$j]=$l[$i-1][$j-1]+1; } else{ if ($l[$i][$j-1]>$l[$i-1][$j]){ $l[$i][$j]=$l[$i][$j-1]; } else{ $l[$i][$j]=$l[$i-1][$j]; } } } } } if (defined $trace){ # save the trace of character $i=$#l; # matches if %trace is defined $j=$#{$l[0]}; while (($i>0) and ($j>0)){ if ($l[$i][$j]==$l[$i-1][$j]){ $$trace{$i}{$j}=$l[$i][$j]-$l[$i-1][$j]; $i-=1; } elsif($l[$i][$j]==$l[$i][$j-1]){ $$trace{$i}{$j}=$l[$i][$j]-$l[$i][$j-1]; $j-=1; } else{ $$trace{$i}{$j}=$l[$i][$j]-$l[$i-1][$j-1]; $i-=1; $j-=1; } } } if ($printMatrix){ print ' '; foreach (0..$#src_let){ printf "%4s ", $src_let[$_]; } print "\n"; foreach (0..$#trg_let){ my $i; printf "%3s ", $trg_let[$_]; foreach $i (0..$#src_let){ printf "%1.2f",$l[$i][$_]; print " "; } print "\n"; } } return $l[$#src_let][$#trg_let]; } ##################################################################### # GetNonMatches($src,$trg,\%NonMatchPairs)' #-------------------------------------------------------------------- # get all non-matching pairs from two strings ##################################################################### sub GetNonMatches{ my ($src,$trg,$res)=@_; my %trace; my $score=&LCS($src,$trg,undef,'',\%trace); my @SRC=split(//,$src); my @TRG=split(//,$trg); my $i=1; my $j=1; my $x=1; my $y=1; my ($srcnot,$trgnot)=('',''); my $matches=''; my $nonmatches=''; my $SrcNonMatch=''; my $TrgNonMatch=''; foreach $i (sort {$a <=> $b} keys %trace){ foreach $j (sort {$a <=> $b} keys %{$trace{$i}}){ if ($trace{$i}{$j}){ $matches.=$SRC[$i-1]; while ($x<$i){ $srcnot.=$SRC[$x-1]; $x++; } while ($y<$j){ $trgnot.=$TRG[$y-1]; $y++; } $x++; $y++; if ($srcnot or $trgnot){ $$res{$srcnot}{$trgnot}++; $SrcNonMatch.=$srcnot.'*'; $TrgNonMatch.=$trgnot.'*'; $nonmatches.='('.$srcnot.'|'.$trgnot.').*'; } else{ if (not $nonmatches){ $nonmatches='.*'; $SrcNonMatch.='*'; $TrgNonMatch.='*'; } } ($srcnot,$trgnot)=('',''); } else{ if ($matches!~/\*$/){ $matches.='*'; } } } } while ($x<=@SRC){ $srcnot.=$SRC[$x-1]; $x++; } while ($y<=@TRG){ $trgnot.=$TRG[$y-1]; $y++; } if ($srcnot or $trgnot){ $$res{$srcnot}{$trgnot}++; # $nonmatches.="\{'".$srcnot."' \=\> '".$trgnot."'\}\*"; $nonmatches.='('.$srcnot.'|'.$trgnot.')'; $SrcNonMatch.=$srcnot; $TrgNonMatch.=$trgnot; } return ($score,$nonmatches,$matches, $SrcNonMatch,$TrgNonMatch); } ##################################################################### # GetNonMatches($src,$trg,\%NonMatchPairs)' #-------------------------------------------------------------------- # get all non-matching pairs from two strings ##################################################################### sub GetNonMatchesOld{ my ($src,$trg,$res)=@_; my %trace; my $score=&LCS($src,$trg,undef,'',\%trace); my @SRC=split(//,$src); my @TRG=split(//,$trg); my $i=1; my $j=1; my $x=1; my $y=1; my ($srcnot,$trgnot)=('',''); my $matches=''; my $nonmatches=''; foreach $i (sort {$a <=> $b} keys %trace){ foreach $j (sort {$a <=> $b} keys %{$trace{$i}}){ if ($trace{$i}{$j}){ $matches.=$SRC[$i]; while ($x<$i){ $srcnot.=$SRC[$x-1]; $x++; } while ($y<$j){ $trgnot.=$TRG[$y-1]; $y++; } $x++; $y++; if ($srcnot or $trgnot){ $$res{$srcnot}{$trgnot}++; $nonmatches.='('.$srcnot.'|'.$trgnot.').*'; } else{ if (not $nonmatches){ $nonmatches='.*'; } } ($srcnot,$trgnot)=('',''); } else{ if ($matches!~/\*$/){ $matches.='*'; } } } } while ($x<=@SRC){ $srcnot.=$SRC[$x-1]; $x++; } while ($y<=@TRG){ $trgnot.=$TRG[$y-1]; $y++; } if ($srcnot or $trgnot){ $$res{$srcnot}{$trgnot}++; # $nonmatches.="\{'".$srcnot."' \=\> '".$trgnot."'\}\*"; $nonmatches.='('.$srcnot.'|'.$trgnot.')'; } return ($score,$nonmatches,$matches); } ##################################################################### # LCIS($src,$trg) ##################################################################### # longest common initial subsequence sub LCIS { my ($src,$trg,$W,$pattern)=@_; my ($i,$j,$score,$first,$last)=(0,0,0,0,0); if (length($src)>length($trg)) {($src,$trg)=($trg,$src);} @src_let=split(/$pattern/,$src); # split words into single @trg_let=split(/$pattern/,$trg); # letters while ($i<@src_let) # until last letter is reached { if ($src_let[$i] eq $trg_let[$j]) # if same letters at { # current positions $score++; # -> increment score and $i++; # the position in the smaller word if (not $first) {$first=$j+1;} # remember postition of the first $last=$j+1; # and the last match } if ($j<@trg_let) {$j++;} # increment the position in the else {last;} # longer word, if not endposition } return ($score,$first,$last); # return score, first and last } ##################################################################### # LCIFS($src,$trg) ##################################################################### # max(longest common initial subsequence, # longest common final subsequence) sub LCIFS { my ($src,$trg,$W,$pattern)=@_; my ($i,$j,$score1,$first1,$last1)=(0,0,0,0,0); my ($score2,$first2,$last2)=(0,0,0); if (length($src)>length($trg)) {($src,$trg)=($trg,$src);} ($score1,$first1,$last1)=&LCIS($src,$trg); @src_let=split(/$pattern/,$src); # split words into single @trg_let=split(/$pattern/,$trg); # letters ($i,$j)=(@src_let-1,@trg_let-1); # and now from the last character while ($i>=0) { if ($src_let[$i] eq $trg_let[$j]) # same as above { $score2++; $i--; # but decrement positions if (not $last2) {$last2=$j+1;} # remember postition of the last $first2=$j+1; # and the first match } if ($j>0) {$j--;} else {last;}; } if ($score1>=$score2){return ($score1,$first1,$last1);} return ($score2,$first2,$last2); } ##################################################################### # LNCCP($src,$trg) ##################################################################### # largest number of common characters at same positions sub LNCCP { my ($src,$trg,$W,$pattern)=@_; my ($i,$score,$first,$last)=(0,0,0); if (length($src)>length($trg)) {($src,$trg)=($trg,$src);} @src_let=split(/$pattern/,$src); # split words into single @trg_let=split(/$pattern/,$trg); # letters for ($i=0;$i<@src_let;$i++) # for every letter of the { # string if ($src_let[$i] eq $trg_let[$i]) # if letters equal { # at same positions $score++; # -> increment score if (not $first) {$first=$i+1;} $last=$i+1; } } return ($score,$first,$last); # and return } ##################################################################### # LNCCRP($src,$trg) ##################################################################### # largest number of common characters at same relativ positions sub LNCCRP { my ($src,$trg,$W,$pattern)=@_; my ($j,$i,$best,$score,$first,$last,$first_tmp,$last_tmp)=(0,0,0,0,0,0,0,0); if (length($src)>length($trg)) {($src,$trg)=($trg,$src);} @src_let=split(/$pattern/,$src); @trg_let=split(/$pattern/,$trg); for ($j=0-@src_let;$j<@trg_let;$j++){ $score=0; $first_tmp=0; for ($i=0;$i<@src_let;$i++) { if (($i+$j>=0) and ($i+$j<@trg_let)){ if ($src_let[$i] eq $trg_let[$i+$j]) { $score++; $last_tmp=$j+$i+1; if (not $first_tmp){$first_tmp=$j+$i+1;} } } } if ($score>$best) { $best=$score; $first=$first_tmp; $last=$last_tmp; } } return ($best,$first,$last); } ##################################################################### # BestSimilar($src,$trg) # ##################################################################### # calculate similarity scores with all available measures and # return the highest for the current string pair sub BestSimilar { my ($src,$trg)=@_; my $sims=&GetNrSims; my ($i,$bestscore,$bestlast,$bestfirst)=(0,0,0,0); my ($score,$first,$last); for $i (0..$sims-2){ # don't do BestSimilar again ... ($score,$first,$last)=similar($src,$trg,$i); if ($score>$bestscore){ $bestscore=$score; $bestfirst=$first; $bestlast=$last; } } return ($bestscore,$bestfirst,$bestlast); } ##################################################################### # CombScore(\@ScoreMatrix,\@ScoreComb,$meth) ##################################################################### # scores[source][target]: score matrix # $meth specifies algorithm # @comb contains resulting score matrix sub CombScore { local (*scores,*comb,$meth)=@_; if (($meth==0)or($meth eq 'sub_s')) {&CombScore1(\@scores,\@comb);} if (($meth==1)or($meth eq 'sub_t')) {&CombScore2(\@scores,\@comb);} if (($meth==2)or($meth eq 'sub_l')) {&CombScore3(\@scores,\@comb);} if (($meth==3)or($meth eq 'sub_b')) {&CombScore4(\@scores,\@comb);} if (($meth==4)or($meth eq 'sub_s_sub_b')) {&CombScore5(\@scores,\@comb);} if (($meth==5)or($meth eq 'sub_t_sub_b')) {&CombScore6(\@scores,\@comb);} if (($meth==6)or($meth eq 'sub_l_sub_b')) {&CombScore7(\@scores,\@comb);} } ##################################################################### # score combination 1 (score-s) # ##################################################################### sub CombScore1{ local (*scores,*comb)=@_; for $i (0..$#scores){ for $j (0..$#{$scores[$i]}){ for $k (0..$#scores){ if ($k==$i) {$comb[$i][$j]+=$scores[$k][$j];} else {$comb[$i][$j]-=$scores[$k][$j];} } } } } ##################################################################### # score combination 2 (score-t) # ##################################################################### sub CombScore2{ local (*scores,*comb)=@_; for $i (0..$#scores){ for $j (0..$#{$scores[$i]}){ for $k (0..$#{$scores[$i]}){ if ($k==$j) {$comb[$i][$j]+=$scores[$i][$k];} else {$comb[$i][$j]-=$scores[$i][$k];} } } } } ##################################################################### # score combination 3 (score-l) # ##################################################################### sub CombScore3{ local (*scores,*comb)=@_; if ($#scores>$#{$scores[0]}){CombScore2(\@scores,\@comb);} else{CombScore1(\@scores,\@comb);} } ##################################################################### # score combination 4 (lead_avr) # ##################################################################### sub CombScore4{ local (*scores,*comb)=@_; for $i (0..$#scores){ for $j (0..$#{$scores[$i]}){ my $best=-99999999; my $bestpos=0; for $k (0..$#scores){ if ($k!=$i) { if ($scores[$k][$j]>$best){ $best=$scores[$k][$j]; $bestpos=$k; } } } $comb[$i][$j]=$scores[$i][$j]-$scores[$bestpos][$j]; } } } ##################################################################### # score combination lead_avr-s # ##################################################################### sub CombScore5{ local (*scores,*comb)=@_; my @tmparr; &CombScore1(\@scores,\@comb); &CombScore4(\@comb,\@tmparr); @comb=@tmparr; } ##################################################################### # score combination lead_avr-t # ##################################################################### sub CombScore6{ local (*scores,*comb)=@_; my @tmparr; &CombScore2(\@scores,\@comb); &CombScore4(\@comb,\@tmparr); @comb=@tmparr; } ##################################################################### # score combination lead_avr-l # ##################################################################### sub CombScore7{ local (*scores,*comb)=@_; my @tmparr; &CombScore3(\@scores,\@comb); &CombScore4(\@comb,\@tmparr); @comb=@tmparr; } 1;