package Uplug::Align::Word::Giza; use strict; use Cwd; use Exporter; use File::Copy; use Uplug::Data; use Uplug::Data::Align; use Uplug::IO::Any; use Uplug::Config; use vars qw(@ISA @EXPORT $VERSION $DEBUG $GIZAHOME); # @ISA = qw( Uplug::Align::Word Exporter ); @ISA = qw( Exporter); $VERSION = '$Id: Giza.pm,v 1.6 2005/12/13 16:27:25 joerg72 Exp $ '; $DEBUG = 0; @EXPORT = qw( &Bitext2Text &RunGiza &Combined2Uplug &Giza2Clue &Giza2Uplug ); #BEGIN{ our $GIZAHOME="$ENV{UPLUGHOME}/ext/GIZA++"; if (not -d $GIZAHOME){$GIZAHOME="$ENV{HOME}/cvs/GIZA++-v2";} if (not -d $GIZAHOME){$GIZAHOME="$ENV{HOME}/cvs/GIZA++";} if (not -d $GIZAHOME){$GIZAHOME="/local/ling/GIZA++-v2";} if (not -d $GIZAHOME){$GIZAHOME="/local/ling/GIZA++";} #} # if (not -d $GIZAHOME){warn "cannot find GIZA++!";exit;} #---------------------------------------------------------------------------- # Giza2Clue (new version): no external calls # - looks for $dir/GIZA++.actual.ti.final (lexical prob's from GIZA) # - creates data/runtime/giza.dbm # - creates data/runtime/giza2.dbm (inverse alignments) sub Giza2Clue{ my $file=shift; my $param=shift; my $inverse=shift; my $ClueDB=shift; my $threshold=shift; if (-d $file){ # if file is a directory $file="$file/GIZA++.actual.ti.final"; # look for the standard file in } # this directory! my %dic; if (ref($ClueDB) eq 'HASH'){ %dic=%{$ClueDB}; } else{ %dic=('format' => 'dbm', 'write_mode' => 'overwrite', 'key' => ['source','target']); my $cluedir='data/runtime'; if ($inverse){$dic{file}="$cluedir/giza2.dbm";} else{$dic{file}="$cluedir/giza.dbm";} } my %inStream=('file' => $file, 'format' => 'tab', 'field delimiter' => ' '); if ($inverse){ $inStream{'columns'}=['source','target','value',], } else{ $inStream{'columns'}=['target','source','value',], } my %lex=(); my $data=Uplug::Data->new; my $in=Uplug::IO::Any->new(\%inStream); $in->open('read',\%inStream); my $count=0; while ($in->read($data)){ $count++; if (not ($count % 1000)){print STDERR '.';} if (not ($count % 10000)){print STDERR "$count\n";} my $src=$data->attribute('source'); my $trg=$data->attribute('target'); if ((not $src) or (not $trg)){next;} my $value=$data->attribute('value'); if (not $value){$value=1;} if ((defined $threshold) and ($value<$threshold)){next;} $lex{$src}{$trg}=$value; if (($src=~s/\_/ /gs) or ($trg=~s/\_/ /gs)){ # (for giza-clue:) $lex{$src}{$trg}=$value; # '_' means ' ' } } my $header=$in->header; my $out=Uplug::IO::Any->new(\%dic); $out->open('write',\%dic); $out->addheader($header); $out->addheader($param); $out->writeheader(); foreach my $s (keys %lex){ my $total; foreach my $t (keys %{$lex{$s}}){ my $score=$lex{$s}{$t}; my $data=Uplug::Data->new; $data->setAttribute('source',$s); $data->setAttribute('target',$t); $data->setAttribute('score',$score); $out->write($data); } } $out->close; $in->close; } #---------------------------------------------------------------------------- # Giza2Uplug: convert GIZA's Viterbi alignment to Uplug format (XML) # (slow and risky: GIZA's output must be complete and use a certain format) sub Giza2Uplug{ my $viterbi=shift; my $bitext=shift; my $param=shift; my $links=shift; my $inverse=shift; if (ref($links) ne 'HASH'){return 0;} my $input=Uplug::IO::Any->new($bitext); if (not ref($input)){return 0;} if (not $input->open('read',$bitext)){return 0;} my $BitextHeader=$input->header(); my $output=Uplug::IO::Any->new($links); if (not ref($output)){return 0;} $output->addheader($BitextHeader); if (not $output->open('write',$links)){return 0;} $output->setOption('SkipDataHeader',0); # don't skip headers and $output->setOption('SkipDataTail',0); # footers (e.g. ) #------------------------------------------------------------------------ if ($viterbi=~/\.gz$/){ open F,"gzip -cd <$viterbi |" || die "cannot open Viterbi alignment file $viterbi!"; } else{ open F,"<$viterbi" || die "cannot open Viterbi alignment file $viterbi!"; } #------------------------------------------------------------------------ my $TokenLabel='w'; my $data=Uplug::Data::Align->new(); print STDERR "convert GIZA's Viterbi alignment to XML!\n"; my $count=0; while ($input->read($data)){ $count++; if (not ($count % 100)){ $|=1;print STDERR '.';$|=0; } if (not ($count % 1000)){ $|=1;print STDERR "$count\n";$|=0; } #---------------------------------- # do the same as for Bitext2Text!! # (to check for empty strings ...) # my @SrcNodes=(); my @TrgNodes=(); my ($srctxt,$trgtxt)= &BitextStrings($data,$param,\@SrcNodes,\@TrgNodes); if (($srctxt!~/\S/) or ($trgtxt!~/\S/)){next;} #---------------------------------- # my $SrcData=$data->sourceData(); # my $TrgData=$data->targetData(); # # my @SrcNodes=$SrcData->findNodes($TokenLabel); my @SrcIds=$data->attribute(\@SrcNodes,'id'); my @SrcSpans=$data->attribute(\@SrcNodes,'span'); # my @SrcTokens=$data->content(\@SrcNodes); my @SrcTokens=$data->getSrcTokenFeatures($param,\@SrcNodes); # my @TrgNodes=$TrgData->findNodes($TokenLabel); my @TrgIds=$data->attribute(\@TrgNodes,'id'); my @TrgSpans=$data->attribute(\@TrgNodes,'span'); # my @TrgTokens=$data->content(\@TrgNodes); my @TrgTokens=$data->getTrgTokenFeatures($param,\@TrgNodes); if ((not @SrcNodes) or (not @TrgNodes)){next;} $_=; $_=; chomp; my @src=split(/ /); $_=; chomp; my %align=(); my $count=1; while (/\s(\S.*?)\s\(\{\s(.*?)\}\)/g){ # strunta i NULL!! if ($2){push (@{$align{$2}},$count);} $count++; } foreach (sort keys %align){ my @s;my @t; if ($inverse){ @t=@{$align{$_}}; @s=split(/\s/); } else{ @s=@{$align{$_}}; @t=split(/\s/); } my @src=();my @trg=(); foreach (@s){push (@src,$SrcTokens[$_-1]);} foreach (@t){push (@trg,$TrgTokens[$_-1]);} my @srcId=();my @trgId=(); foreach (@s){push (@srcId,$SrcIds[$_-1]);} foreach (@t){push (@trgId,$TrgIds[$_-1]);} my @srcSpan=();my @trgSpan=(); foreach (@s){push (@srcSpan,$SrcSpans[$_-1]);} foreach (@t){push (@trgSpan,$TrgSpans[$_-1]);} my %link=(); $link{link}=join ' ',@src; $link{link}.=';'; $link{link}.=join ' ',@trg; $link{source}=join '+',@srcId; $link{target}=join '+',@trgId; $link{src}=join '&',@srcSpan; $link{trg}=join '&',@trgSpan; $data->addWordLink(\%link); } $output->write($data); } $input->close; $output->close; } #---------------------------------------------------------------------------- # Combined2Uplug: combine GIZA's Viterbi alignment and convert them to Uplug format (XML) # (slow and risky: GIZA's output must be complete and must use a certain format) # # possible combinatins: union, intersection, refined # sub Combined2Uplug{ my $giza0=shift; my $giza1=shift; my $combine=shift; my $bitext=shift; my $param=shift; my $links=shift; if (ref($links) ne 'HASH'){return 0;} my $input=Uplug::IO::Any->new($bitext); if (not ref($input)){return 0;} if (not $input->open('read',$bitext)){return 0;} my $output; # open output later (after reading the first record of # the input stream --> this gives the complete input header) #------------------------------------------------------------------------ if ($giza0=~/\.gz$/){open F0,"gzip -cd <$giza0 |";} else{open F0,"<$giza0";} if ($giza1=~/\.gz$/){open F1,"gzip -cd <$giza1 |";} else{open F1,"<$giza1";} #------------------------------------------------------------------------ my $TokenLabel='w'; my $data=Uplug::Data::Align->new(); print STDERR "combine GIZA's Viterbi alignments and convert to XML!\n"; my $count=0; while ($input->read($data)){ if (not $output){ # output is not opened yet: my $BitextHeader=$input->header(); # - get the input header $output=Uplug::IO::Any->new($links); # - create an output stream if (not ref($output)){return 0;} # (or die) $output->addheader($BitextHeader); # - add input header if (not $output->open('write',$links)){ # - open the output stream return 0; # (or die) } $output->setOption('SkipDataHeader',0); # don't skip headers and $output->setOption('SkipDataTail',0); # footers (e.g. ) } $count++; if (not ($count % 100)){ $|=1;print STDERR '.';$|=0; } if (not ($count % 1000)){ $|=1;print STDERR "$count\n";$|=0; } #---------------------------------- # do the same as for Bitext2Text!! # (to check for empty strings ...) # my @SrcNodes=(); my @TrgNodes=(); my ($srctxt,$trgtxt)= &BitextStrings($data,$param,\@SrcNodes,\@TrgNodes); if (($srctxt!~/\S/) or ($trgtxt!~/\S/)){next;} #---------------------------------- # my @SrcNodes=$SrcData->findNodes($TokenLabel); my @SrcIds=$data->attribute(\@SrcNodes,'id'); my @SrcSpans=$data->attribute(\@SrcNodes,'span'); # my @SrcTokens=$data->content(\@SrcNodes); my @SrcTokens=$data->getSrcTokenFeatures($param,\@SrcNodes); # my @TrgNodes=$TrgData->findNodes($TokenLabel); my @TrgIds=$data->attribute(\@TrgNodes,'id'); my @TrgSpans=$data->attribute(\@TrgNodes,'span'); # my @TrgTokens=$data->content(\@TrgNodes); my @TrgTokens=$data->getTrgTokenFeatures($param,\@TrgNodes); if ((not @SrcNodes) or (not @TrgNodes)){next;} $_=;$_=;chomp; # read source->target viterbi alignment my @src=split(/ /); $_=;chomp; my %srclinks=(); my $count=1; while (/\s(\S.*?)\s\(\{\s(.*?)\}\)/g){ # strunta i NULL!! my @s=split(/\s/,$2); foreach (@s){$srclinks{$_}{$count}=1;} $count++; } $_=;$_=;chomp; # read source->target viterbi alignment my @trg=split(/ /); $_=;chomp; my %trglinks=(); my $count=1; while (/\s(\S.*?)\s\(\{\s(.*?)\}\)/g){ # strunta i NULL!! my @t=split(/\s/,$2); foreach (@t){$trglinks{$_}{$count}=1;} $count++; } my (%CombinedSrc,%CombinedTrg); &CombineLinks(\%srclinks,\%trglinks,$combine,\%CombinedSrc,\%CombinedTrg); my @cluster=&LinkClusters(\%CombinedSrc,\%CombinedTrg); foreach my $c (@cluster){ # my @s=sort {$a <=> $b} keys %{$cluster[$_]{src}}; # my @t=sort {$a <=> $b} keys %{$cluster[$_]{trg}}; my @s=@{$$c{src}}; my @t=@{$$c{trg}}; my @src=();my @trg=(); foreach (@s){push (@src,$SrcTokens[$_-1]);} foreach (@t){push (@trg,$TrgTokens[$_-1]);} my @srcId=();my @trgId=(); foreach (@s){push (@srcId,$SrcIds[$_-1]);} foreach (@t){push (@trgId,$TrgIds[$_-1]);} my @srcSpan=();my @trgSpan=(); foreach (@s){push (@srcSpan,$SrcSpans[$_-1]);} foreach (@t){push (@trgSpan,$TrgSpans[$_-1]);} my %link=(); $link{link}=join ' ',@src; $link{link}.=';'; $link{link}.=join ' ',@trg; $link{source}=join '+',@srcId; $link{target}=join '+',@trgId; $link{src}=join '&',@srcSpan; $link{trg}=join '&',@trgSpan; $data->addWordLink(\%link); } $output->write($data); } $input->close; $output->close; } sub LinkClusters{ my ($src,$trg)=@_; my @cluster=(); while (keys %{$src}){ my ($s,$links)=each %{$src}; # get the next source token if ((ref($$src{$s}) ne 'HASH') or (not keys %{$$src{$s}})){ # if no links exist: delete $$src{$s}; # delete and next! next; } push (@cluster,{src=>[],trg=>[]}); # create a new link cluster push (@{$cluster[-1]{src}},$s); # and save it in the cluster &AddLinks($cluster[-1],$src,$trg,$s, # add all tokens aligned to the 'src','trg'); # source token to the cluster } # (and recursively the ones foreach my $c (@cluster){ @{$$c{src}}=sort {$a <=> $b} @{$$c{src}}; @{$$c{trg}}=sort {$a <=> $b} @{$$c{trg}}; } return @cluster; } # linked to them, see AddLinks) sub AddLinks{ my ($cluster,$src,$trg,$s,$key1,$key2)=@_; foreach my $t (keys %{$$src{$s}}){ # add all linked tokens to the delete $$src{$s}{$t}; # cluster and delete the links delete $$trg{$t}{$s}; # in the link-hashs push (@{$$cluster{$key2}},$t); &AddLinks($cluster,$trg,$src,$t,$key2,$key1); # add tokens aligned to the } # linked token to the cluster delete $$src{$s}; # delete the source token link hash } sub CombineLinks{ my ($src,$trg,$method,$srclinks,$trglinks)=@_; # my %srclinks; # my %trglinks; if ($method eq 'union'){ foreach my $s (keys %{$src}){ foreach my $t (keys %{$$src{$s}}){ $$srclinks{$s}{$t}=1; $$trglinks{$t}{$s}=1; } } foreach my $t (keys %{$trg}){ foreach my $s (keys %{$$trg{$t}}){ $$srclinks{$s}{$t}=1; $$trglinks{$t}{$s}=1; } } } elsif (($method eq 'intersection') or ($method eq 'refined')){ foreach my $s (keys %{$src}){ foreach my $t (keys %{$$src{$s}}){ if ($$trg{$t}{$s}){ $$srclinks{$s}{$t}=1; $$trglinks{$t}{$s}=1; } } } } if ($method eq 'refined'){ # refined combination: foreach my $s (keys %{$src}){ # * start with the intersection foreach my $t (keys %{$$src{$s}}){ # * go iteratively through other links if ((not defined $$srclinks{$s}) and (not defined $$trglinks{$t})){ # - if both are not aligned yet: $$srclinks{$s}{$t}=1; # add the link $$trglinks{$t}{$s}=1; } elsif ((defined $$srclinks{$s-1}) or (defined $$srclinks{$s+1})){ if (($$srclinks{$s-1}{$t}) or # if the link is adjacent to ($$srclinks{$s+1}{$t})){ # another one horizontally: if ($$srclinks{$s}{$t+1}){next;} # do not accept if it is also if ($$srclinks{$s}{$t-1}){next;} # adjacent to other links vertically if ($$srclinks{$s-1}{$t}){ # do not accept if the adjacent if ($$srclinks{$s-1}{$t-1}){next;} # link is also adjacent to other if ($$srclinks{$s-1}{$t+1}){next;} # links vertically } if ($$srclinks{$s+1}{$t}){ # the same for the other if ($$srclinks{$s+1}{$t-1}){next;} # adjacency direction if ($$srclinks{$s+1}{$t+1}){next;} } $$srclinks{$s}{$t}=1; # everything ok: add the link $$trglinks{$t}{$s}=1; } } elsif ((defined $$trglinks{$t-1}) or (defined $$trglinks{$t+1})){ if (($$srclinks{$s}{$t-1}) or # if the link is adjacent to ($$srclinks{$s}{$t+1})){ # another one vertically: if ($$srclinks{$s+1}{$t}){next;} # do not accept if it is also if ($$srclinks{$s-1}{$t}){next;} # adjacent to other links horizontally if ($$srclinks{$s}{$t-1}){ # do not accept if the adjacent if ($$srclinks{$s-1}{$t-1}){next;} # link is also adjacent to other if ($$srclinks{$s+1}{$t-1}){next;} # links horizontally } if ($$srclinks{$s}{$t+1}){ # the same for the other if ($$srclinks{$s-1}{$t+1}){next;} # adjacency direction if ($$srclinks{$s+1}{$t+1}){next;} } $$srclinks{$s}{$t}=1; # everything ok: add the link $$trglinks{$t}{$s}=1; } } } } } # $src=\%srclinks; # $trg=\%trglinks; } #---------------------------------------------------------------------------- # RunGiza: run GIZA++ using external scripts # (GIZA must be installed in the given directory) sub RunGiza{ my $src=shift; my $trg=shift; my $viterbi=shift; if (not -d $GIZAHOME){ $GIZAHOME="$ENV{UPLUGHOME}/ext/GIZA++"; if (not -d $GIZAHOME){$GIZAHOME="$ENV{HOME}/cvs/GIZA++-v2";} if (not -d $GIZAHOME){$GIZAHOME="$ENV{HOME}/cvs/GIZA++";} if (not -d $GIZAHOME){$GIZAHOME="/local/ling/GIZA++-v2";} if (not -d $GIZAHOME){$GIZAHOME="/local/ling/GIZA++";} if (not -d $GIZAHOME){ warn "cannot find GIZA++ in $GIZAHOME! Set ENV{UPLUGHOME} or Uplug::Align::Word::Giza::GIZAHOME!"; return 0; } } if (my $sig=system "$GIZAHOME/plain2snt.out $src $trg"){ die "got signal $? from plain2snt!\n"; } my $command="PATH=\$\{PATH\}:$GIZAHOME;"; my $snt="$src$trg\.snt"; if (not -e $snt){$snt="$src\_$trg\.snt";} if (not -e $snt){die "cannot find alignment-file: $snt!\n";} $command.="$GIZAHOME/trainGIZA++.sh $src\.vcb $trg\.vcb $snt"; if (my $sig=system $command){ die "got signal $? from trainGIZA++.sh!\n"; } if ($viterbi){copy ('GIZA++.A3.final',$viterbi);} else{copy ('GIZA++.A3.final','viterbi');} } #---------------------------------------------------------------------------- # Bitext2Text: convert bitexts from Uplug format (XML) to GIZA's format # (this is much too slow ....) sub Bitext2Text{ my $bitext=shift; my $srcfile=shift; my $trgfile=shift; my $param=shift; my %SrcStream=('format'=>'text','file'=>$srcfile); my %TrgStream=('format'=>'text','file'=>$trgfile); my $input=Uplug::IO::Any->new($bitext); my $source=Uplug::IO::Any->new(\%SrcStream); my $target=Uplug::IO::Any->new(\%TrgStream); $input->open('read',$bitext) || warn "cannot open the bitext" && return 0; $source->open('write',\%SrcStream) || warn "cannot write to $srcfile" && return 0; $target->open('write',\%TrgStream) || warn "cannot write to $trgfile" && return 0; #------------------------------------------------------------------------- my $data=Uplug::Data::Align->new(); print STDERR "convert bitext to plain text!\n"; my $count=0; while ($input->read($data)){ $count++; if (not ($count % 100)){ $|=1;print STDERR '.';$|=0; } if (not ($count % 1000)){ $|=1;print STDERR "$count\n";$|=0; } my ($srctxt,$trgtxt)=&BitextStrings($data,$param); if (($srctxt=~/\S/) and ($trgtxt=~/\S/)){ $source->write($srctxt); $target->write($trgtxt); } } # $BitextHeader=$input->header; $input->close; $source->close; $target->close; return $input->header; } #---------------------------------------------------------------------------- # get the actual strings from the bitext (using feature-parameters) # (feature specifications as in coocfreq.pl) sub BitextStrings{ my $data=shift; my $param=shift; my ($srcnodes,$trgnodes)=@_; my @srctok=$data->getSrcTokenFeatures($param,$srcnodes); my @trgtok=$data->getTrgTokenFeatures($param,$trgnodes); map($_=~s/^\s+//sg,@srctok); # delete initial white-space map($_=~s/^\s+//sg,@trgtok); map($_=~s/(\S)\s+$/$1/sg,@srctok); # delete final white-space map($_=~s/(\S)\s+$/$1/sg,@trgtok); map($_=~s/\n/ /sg,@srctok); # otherwise: convert to space map($_=~s/\n/ /sg,@trgtok); map($_=~s/\s/\_/sg,@srctok); # and replace space with underline map($_=~s/\s/\_/sg,@trgtok); # (to avoid extra tokens) my $srctxt=join(' ',@srctok); my $trgtxt=join(' ',@trgtok); $srctxt=~tr/\n/ /; $trgtxt=~tr/\n/ /; return ($srctxt,$trgtxt); } #---------------------------------------------------------------------------- =pod =head1 Synopsis use lib '/path/to/uplug'; use Uplug::Align::Word::Giza; $ENV{UPLUGHOME}='/path/to/uplug'; my %bitext = ('file' => 'svenprf.xces', 'format' => 'xces align'); &Bitext2Text(\%bitext,'src','trg',{}); # convert to plain text &RunGiza('src','trg','viterbi.src-trg'); # run GIZA++ (src-->trg) &RunGiza('trg','src','viterbi.trg-src'); # run GIZA++ (trg-->src) my %dbm = (file=>'clues.dbm', format=>'dbm', key => ['source','target']); &Giza2Clue('.', # directory where GIZA was running (=current) {}, # parameter (= clue dbm header) 1, # =1 --> inverse (trg-src) \%dbm); # clue DBM (giza.dbm if not specified) my $combine = 'intersection' # combine heuristics (union|refined) my %out = ('file' => $combined, # save the result in this file 'format' => 'xces align'); &Combined2Uplug('viterbi.src-trg', # name of first viterbi alignment 'viterbi.trg-src', # name of second viterbi alignment $combine, # type of combination heuristics \%bitext, # bitext {}, # token parameters \%out); # output stream =head1 GIZA++ directories You have to tell the program where it can find the GIZA++ executables. You can either set the UPLUGHOME environment variable or the GIZAHOME variable. $ENV{UPLUGHOME}='/path/to/uplug'; $Uplug::Align::Word::Giza::GIZAHOME="/path/to/uplug/ext/GIZA++"; =cut 1;