#!/local/bin/perl -w # # this is a sentence aligner based on Phillip Koehn's implementation used # for aligning the EuroParl corpus # http://people.csail.mit.edu/koehn/publications/europarl/ # (and his aligner is based on Gale&Chruch's length-based alignment approach) # # USAGE: sentalign [OPTIONS] srcfile trgfile > alignfile # # OPTIONS # # -h 'hard-tag-re' .......... regular expression to match hard boundary tags # # use strict; use vars qw($opt_h); use Getopt::Std; use XML::Parser; getopts('h:'); my $SENT_TAG = 's'; my $HARD_TAG = $opt_h; # hard tag is a regular expression! my $src_file = shift(@ARGV); my $trg_file = shift(@ARGV); my $src_parser = new XML::Parser(Handlers => {Start => \&XmlStart, End => \&XmlEnd, Default => \&XmlChar, },); my $trg_parser = new XML::Parser(Handlers => {Start => \&XmlStart, End => \&XmlEnd, Default => \&XmlChar, },); open SRC,"<$src_file" || die "cannot open $src_file!\n"; open TRG,"<$trg_file" || die "cannot open $trg_file!\n"; my $src_ph = $src_parser->parse_start; my $trg_ph = $trg_parser->parse_start; ParseBitext(*SRC,*TRG,$src_ph,$trg_ph); # # if same number of levels # foreach level # align at that level # adjust segmentation and go to next level # # # &print_xmlheader($src_file,$trg_file); # my ($src_len,$trg_len,$src_id,$trg_id); # my $LINK_ID=0; # my @links = &sentence_align($src_len,$trg_len,$src_id,$trg_id); # print_alignments(\@links); close SRC; close TRG; &print_xmlfooter(); sub print_xmlheader{ my ($src_file,$trg_file) = @_; print ''."\n"; print ''."\n"; print ''."\n"; print ' '."\n"; } sub print_xmlfooter{ print "\n"; print "\n"; } sub ParseBitext{ my ($SFH,$TFH,$src_parser,$trg_parser) = @_; delete $src_parser->{LEVEL}; # current level in XML tree delete $trg_parser->{LEVEL}; delete $src_parser->{START}; # start positions of each tag at each level delete $trg_parser->{START}; delete $src_parser->{END}; # end positions of each tag at each level delete $trg_parser->{END}; delete $src_parser->{IDS}; # end positions of each tag at each level delete $trg_parser->{IDS}; delete $src_parser->{CHILDREN}; delete $trg_parser->{CHILDREN}; delete $src_parser->{SENT_OPEN}; # flag to indicate that we delete $trg_parser->{SENT_OPEN}; # are in a sentence while (my $line = <$SFH>){ $src_parser->parse_more($line); } while (my $line = <$TFH>){ $trg_parser->parse_more($line); } } sub XmlStart{ my ($p,$e,%attr) = @_; $p->{LEVEL}=0 unless defined $p->{LEVEL}; $p->{POSITION}=0 unless defined $p->{POSITION}; ## above sentence level if (not $p->{SENT_OPEN}){ $p->{TAGS}->[$p->{LEVEL}]->{$e}++; $p->{TAGS}->[$p->{LEVEL}]->{__ALL__}++; push (@{$p->{START}->[$p->{LEVEL}]},$p->{POSITION}); push (@{$p->{IDS}->[$p->{LEVEL}]},$attr{id}); if ($p->{LEVEL} > 0){ my $parent = $#{$p->{START}->[$p->{LEVEL}-1]}; push(@{$p->{CHILDREN}->[$p->{LEVEL}-1]->[$parent]}, $#{$p->{START}->[$p->{LEVEL}]}); } $p->{LEVEL}++; if ($e eq $SENT_TAG){ $p->{SENT_OPEN} = 1; } } } sub XmlEnd{ my $p=shift; my $e=shift; if ($e eq $SENT_TAG){ delete $p->{SENT_OPEN}; } if (not $p->{SENT_OPEN}){ $p->{LEVEL}--; push (@{$p->{END}->[$p->{LEVEL}]},$p->{POSITION}); } } sub XmlChar{ my $p=shift; my $e=shift; $p->{POSITION} += length($e); # if ($p->{SENT_OPEN}){ # $p->{SENT_LENGTHS}->[-1] += length($e); # } } sub print_alignments{ my $LINKS = shift; foreach my $i (0..$#{$LINKS}){ print '\n"; $LINK_ID++; } } # this is a vanilla implementation of church and gale sub sentence_align { my ($LEN1,$LEN2,$IDS1,$IDS2) = @_; # parameters my %PRIOR; $PRIOR{1}{1} = 0.89; $PRIOR{1}{0} = 0.01/2; $PRIOR{0}{1} = 0.01/2; $PRIOR{2}{1} = 0.089/2; $PRIOR{1}{2} = 0.089/2; # $PRIOR{2}{2} = 0.011; # dynamic programming my (@COST,@BACK); $COST[0][0] = 0; for(my $i1=0;$i1<=$#{$LEN1};$i1++) { for(my $i2=0;$i2<=$#{$LEN2};$i2++) { next if $i1 + $i2 == 0; $COST[$i1][$i2] = 1e10; foreach my $d1 (keys %PRIOR) { next if $d1>$i1; foreach my $d2 (keys %{$PRIOR{$d1}}) { next if $d2>$i2; my $cost = $COST[$i1-$d1][$i2-$d2] - log($PRIOR{$d1}{$d2}) + &match($$LEN1[$i1]-$$LEN1[$i1-$d1], $$LEN2[$i2]-$$LEN2[$i2-$d2]); # print "($i1->".($i1-$d1).",$i2->".($i2-$d2).") [".($LEN1[$i1]-$LEN1[$i1-$d1]).",".($LEN2[$i2]-$LEN2[$i2-$d2])."] = $COST[$i1-$d1][$i2-$d2] - ".log($PRIOR{$d1}{$d2})." + ".&match($LEN1[$i1]-$LEN1[$i1-$d1], $LEN2[$i2]-$LEN2[$i2-$d2])." = $cost\n"; if ($cost < $COST[$i1][$i2]) { $COST[$i1][$i2] = $cost; @{$BACK[$i1][$i2]} = ($i1-$d1,$i2-$d2); } } } # print $COST[$i1][$i2]."($i1-$BACK[$i1][$i2][0],$i2-$BACK[$i1][$i2][1]) "; } # print "\n"; } # back tracking my %NEXT=(); my @LINKS=(); my $i1 = $#{$LEN1}; my $i2 = $#{$LEN2}; while($i1>0 || $i2>0) { # print "back $i1 $i2\n"; @{$NEXT{$BACK[$i1][$i2][0]}{$BACK[$i1][$i2][1]}} = ($i1,$i2); ($i1,$i2) = ($BACK[$i1][$i2][0],$BACK[$i1][$i2][1]); } while($i1<$#{$LEN1} || $i2<$#{$LEN2}) { push(@LINKS,{}); # print "fwd $i1 $i2\n"; # print '\n"; ($i1,$i2) = @{$NEXT{$i1}{$i2}}; } return @LINKS; } sub match { my ($len1,$len2) = @_; my $c = 1; my $s2 = 6.8; if ($len1==0 && $len2==0) { return 0; } my $mean = ($len1 + $len2/$c) / 2; my $z = ($c * $len1 - $len2)/sqrt($s2 * $mean); if ($z < 0) { $z = -$z; } my $pd = 2 * (1 - &pnorm($z)); if ($pd>0) { return -log($pd); } return 25; } sub pnorm { my ($z) = @_; my $t = 1/(1 + 0.2316419 * $z); return 1 - 0.3989423 * exp(-$z * $z / 2) * ((((1.330274429 * $t - 1.821255978) * $t + 1.781477937) * $t - 0.356563782) * $t + 0.319381530) * $t; }