# Copyright (C) 2004 Jörg Tiedemann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ##################################################################### # # XCESalign # ##################################################################### # $Author: joerg72 $ # $Id: XCESalign.pm,v 1.6 2005/07/28 07:57:00 joerg72 Exp $ package Uplug::IO::XCESalign; use strict; use vars qw(@ISA $VERSION); use vars qw(%StreamOptions); use Uplug::IO::XML; use Uplug::IO::Any; use Uplug::Data; # use Uplug::Data::DOM; $VERSION=0.1; @ISA = qw( Uplug::IO::XML ); # stream options and their default values!! %StreamOptions = ('DocRootTag' => 'cesAlign', 'root' => 'link', 'DataStructure' => 'complex', 'SkipDataHeader' => 1, 'SkipDataTail' => 1, 'DTDname' => 'cesAlign', # 'DTDsystemID' => 'dtd/xcesAlign.dtd', 'DTDpublicID' => '-//CES//DTD XML cesAlign//EN' ); $StreamOptions{DocRoot}{version}='1.0'; sub new{ my $class=shift; my $self=$class->SUPER::new($class); foreach (keys %StreamOptions){ $self->setOption($_,$StreamOptions{$_}); } # &Uplug::IO::AddHash2Hash($self->{StreamOptions},\%StreamOptions); return $self; } sub open{ my $self = shift; if ($self->SUPER::open(@_)){ if ($self->{AccessMode} eq 'read'){ # if access-mode=read: if (not $self->option('DocBodyTag')){ # set doc-body-tag $self->setOption('DocBodyTag','(linkGrp|linkAlign)'); } } if (defined $self->{StreamOptions}->{fromDoc}){ $self->OpenAlignDocs($self->{StreamOptions}); } elsif (defined $self->{StreamHeader}->{fromDoc}){ $self->OpenAlignDocs($self->{StreamHeader}); $self->{StreamOptions}->{fromDoc}=$self->{StreamHeader}->{fromDoc}; $self->{StreamOptions}->{toDoc}=$self->{StreamHeader}->{toDoc}; } elsif (defined $self->{StreamOptions}->{DocRoot}->{fromDoc}){ $self->OpenAlignDocs($self->{StreamOptions}->{DocRoot}); $self->{StreamOptions}->{fromDoc}= $self->{StreamOptions}->{DocRoot}->{fromDoc}; $self->{StreamOptions}->{toDoc}= $self->{StreamOptions}->{DocRoot}->{toDoc}; } elsif (defined $self->{StreamHeader}->{DocRoot}->{fromDoc}){ $self->OpenAlignDocs($self->{StreamHeader}->{DocRoot}); $self->{StreamOptions}->{fromDoc}= $self->{StreamHeader}->{DocRoot}->{fromDoc}; $self->{StreamOptions}->{toDoc}= $self->{StreamHeader}->{DocRoot}->{toDoc}; } if ($self->{AccessMode} ne 'read'){ if (($self->{fromDoc} ne 'exists') and (not ref($self->{source}))){ $self->{StreamOptions}->{fromDoc}= $self->{StreamOptions}->{file}.'.src'; } if (($self->{toDoc} ne 'exists') and (not ref($self->{target}))){ $self->{StreamOptions}->{toDoc}= $self->{StreamOptions}->{file}.'.trg'; } $self->OpenAlignDocs($self->{StreamOptions}); # write linkGrp-tag only of we skip data headers! # otherwise: linkGrp is expected to be in the data-headers # of data written to cesAlign files!! if ($self->{StreamOptions}->{SkipDataHeader}){ $self->OpenTag('linkGrp', {'targType' => 's', 'fromDoc' => $self->{StreamOptions}->{fromDoc}, 'toDoc' => $self->{StreamOptions}->{toDoc}}); } } if (ref($self->{XmlParser})){ $self->{XmlHandle}->{REMOVESPACES}=$self->option('REMOVESPACES'); $self->{XmlHandle}->{SubTreeRoot}=$self->option('root'); $self->{XmlHandle}->{DocRootTag}=$self->option('DocRootTag'); $self->{XmlHandle}->{DocBodyTag}=$self->option('DocBodyTag'); $self->CompileTagREs; } return 1; } return 0; } sub close{ my $self=shift; if (ref($self->{source})){ $self->{source}->close(); } if (ref($self->{target})){ $self->{target}->close(); } return $self->SUPER::close(); } sub read{ my $self=shift; my $data=shift; $data->init; if (not ref($data->{link})){$data->{link}=Uplug::Data->new;} else{$data->{link}->init;} my $ret=$self->SUPER::read($data->{link},@_); if ($self->NewDocBody){ my $BodyAttr=$self->DocBodyAttr; $self->addheader($BodyAttr); if (not $self->OpenAlignDocs($BodyAttr)){ return 0; } delete $data->{sourceSent}; # delete previous data-objects delete $data->{targetSent}; # (to get new XML::Parser instances) } my $attr=$data->{link}->attribute(); # $data->addChild('align',undef,$attr); $data->addNode('align',$attr); my ($src,$trg); my $xtargets=$data->{link}->attribute('xtargets'); if (defined $xtargets){ ($src,$trg)=split(/\s*\;\s*/,$xtargets); } else{return 0;} my @srcID=split(/\s/,$src); my @trgID=split(/\s/,$trg); #----------- sentences ------------------------------ &ReadSegments($self->{source},$data,\@srcID,'source'); &ReadSegments($self->{target},$data,\@trgID,'target'); $self->Uplug::IO::read($data); # $data->{source}=$data->subData('source'); # $data->{target}=$data->subData('target'); # $data->subTree($data->{source},'source'); # $data->subTree($data->{target},'target'); return $ret; } sub write{ my $self=shift; my ($data)=@_; $self->Uplug::IO::write($data); my $ret; my $source=$data->subData('source'); my $target=$data->subData('target'); # my $source=Uplug::Data::DOM->new; # my $source=Uplug::Data->new; # $data->subTree($source,'source'); # my $target=Uplug::Data->new; # my $target=Uplug::Data::DOM->new; # $data->subTree($target,'target'); if (ref($data->{link})){ $ret=$self->SUPER::write($data->{link}); } elsif (defined $data->attribute('xtargets')){ $ret=$self->SUPER::write($data); } else{ my $tag=$self->option('root'); my $link=Uplug::Data->new($tag); my @src=$source->findNodes('s'); my @srcID=$source->attribute(\@src,'id'); foreach (0..$#src){ if (not defined $srcID[$_]){ $self->{srcID}++; $srcID[$_]=$self->{srcID}; $source->setAttribute($src[$_],'id',$srcID[$_]); } $self->{srcID}=$srcID[$_]; } my @trg=$target->findNodes('s'); my @trgID=$target->attribute(\@trg,'id'); foreach (0..$#trg){ if (not defined $trgID[$_]){ $self->{trgID}++; $trgID[$_]=$self->{trgID}; $target->setAttribute($trg[$_],'id',$trgID[$_]); } $self->{trgID}=$trgID[$_]; } my $s=join(' ',@srcID); my $t=join(' ',@trgID); $link->setAttribute('xtargets',"$s\;$t",$tag); my $id=$data->attribute('id'); if (not defined $id){ $self->{alignID}++; $id=$self->{alignID}; } $link->setAttribute('id',$id); $ret=$self->SUPER::write($link); } if (($self->{fromDoc} ne 'exists') and (ref($self->{source}))){ # $source->moveAttribute('s','seg'); $self->{source}->write($source); } if (($self->{toDoc} ne 'exists') and (ref($self->{target}))){ # $target->moveAttribute('s','seg'); $self->{target}->write($target); } return $ret; } #----------------------------------------------------------------- # select: select data # # * read sequentially through the link-file and # search bitext segments according to the matching pattern # # we cannot use select for searching in the link file because # we need fromDoc and toDoc from the alignment tags .....!!!! sub select{ my $self=shift; my ($data,$pattern,$attr,$operator)=@_; $data->init; if (not ref($data->{link})){$data->{link}=Uplug::Data->new;} else{$data->{link}->init;} my %linkPattern=(); my %srcPattern=(); my %trgPattern=(); if (ref($pattern) eq 'HASH'){ %linkPattern=%{$pattern}; if (ref($pattern->{source}) eq 'HASH'){ %srcPattern=%{$pattern->{source}}; delete $linkPattern{source}; } elsif (defined $pattern->{source}){ $srcPattern{'#text'}=$pattern->{source}; delete $linkPattern{source}; } if (ref($pattern->{target}) eq 'HASH'){ %trgPattern=%{$pattern->{target}}; delete $linkPattern{target}; } elsif (defined $pattern->{target}){ $trgPattern{'#text'}=$pattern->{target}; delete $linkPattern{target}; } } if ($self->{ENDOFSTREAM}){ delete $self->{ENDOFSTREAM}; $self->reopen(); } while ($self->SUPER::read($data->{link})){ print ''; if ($self->NewDocBody){ my $BodyAttr=$self->DocBodyAttr; $self->addheader($BodyAttr); if (not $self->OpenAlignDocs($BodyAttr)){ return 0; } delete $data->{sourceSent}; # delete previous data-objects delete $data->{targetSent}; # (to get new XML::Parser instances) } if (not $data->{link}->matchData(\%linkPattern,$operator)){next;} my $attr=$data->{link}->attribute(); $data->addNode('align',$attr); my ($src,$trg); my $xtargets=$data->{link}->attribute('xtargets'); if (defined $xtargets){ ($src,$trg)=split(/\s*\;\s*/,$xtargets); } else{return 0;} my @srcID=split(/\s/,$src); my @trgID=split(/\s/,$trg); #----------- sentences ------------------------------ # print STDERR "read src sentences ",join(":",@srcID),"\n"; # print STDERR "read trg sentences ",join(":",@trgID),"\n"; &SearchSegments($self->{source},$data,\@srcID,'source'); &SearchSegments($self->{target},$data,\@trgID,'target'); # &ReadSegments($self->{source},$data,\@srcID,'source'); # &ReadSegments($self->{target},$data,\@trgID,'target'); $self->Uplug::IO::read($data); return 1; } $self->{ENDOFSTREAM}=1; return 0; } # end of select #----------------------------------------------------------------- sub FindAlignDocFile{ my $self=shift; my $doc=shift; if ((-s $doc) or (-s "$doc.gz")){return $doc;} # found it --> ok! my $file=$self->files(); # no? my $dir='./'; # check relative to the if ($file=~/^(.*[\\\/])[^\\\/]+$/){$dir=$1;} # align-document if ((-s $dir.$doc) or (-s "$dir$doc.gz")){ # (align-dir/filename) # print STDERR "$doc --> $dir$doc\n"; if ($doc=~/^([\\\/\.]*[^\\\/\.]+)[\\\/]/){ # make symbolic links! eval { symlink ($dir.$1,$1);1 }; # ... relative dir } # ... or else{eval { symlink ($dir.$doc,$doc);1 };} # ... file link return $dir.$doc; # return path+file } my $docfile=$doc; # no? if ($doc=~/^.*[\\\/]([^\\\/]+)$/){$docfile=$1;} # remove path in the if ((-s $dir.$docfile) or (-s "$dir$docfile.gz")){ # filename and check # print STDERR "$doc --> $dir$docfile\n"; if ($doc=~/^([\\\/\.]*[^\\\/\.]+)[\\\/]/){ eval { symlink ($dir.$1,$1);1 }; } else{eval { symlink ($dir.$docfile,$docfile);1 };} return $dir.$docfile; # found it --> return! } # no? --> remove if ($doc=~/^[^\\\/]+[\\\/]([^\\\/].*)$/){$docfile=$1;} # initial directory if ((-s $dir.$docfile) or (-s "$dir$docfile.gz")){ # and check again! # print STDERR "$doc --> $dir$docfile\n"; if ($doc=~/^([\\\/\.]*[^\\\/\.]+)[\\\/]/){ eval { symlink ($dir,$1);1 }; # print STDERR "symlink ($dir,$1)\n"; } else{eval { symlink ($dir.$docfile,$docfile);1 };} # print STDERR "symlink ($dir.$docfile,$docfile)\n"; return $dir.$docfile; } return $doc; } sub OpenAlignDocs{ my $self=shift; my $options=shift; if (ref($options) ne 'HASH'){return 0;} if (($self->{AccessMode} ne 'read') and ((-s $options->{fromDoc}) or (-s "$options->{fromDoc}.gz") or ($self->{StreamOptions}->{SkipSrcFile}))){ $self->{fromDoc}='exists'; } else{ $options->{fromDoc}=$self->FindAlignDocFile($options->{fromDoc}); my %stream=('file' => $options->{fromDoc}, 'format' => 'XML', 'root' => 's'); ## make subtree index files (DBM hash) for the source file ## # if ($self->option('MAKESUBTREEINDEX')){ # commented out -> always! $stream{MAKESUBTREEINDEX} = 1; # } if ($self->{AccessMode} ne 'read'){$stream{DocRootTag}='document';} $self->{source}=Uplug::IO::Any->new(\%stream); if (not $self->{source}->open($self->{AccessMode},\%stream)){ return 0; } } if (($self->{AccessMode} ne 'read') and ((-s $options->{toDoc}) or (-s "$options->{toDoc}.gz") or ($self->{StreamOptions}->{SkipTrgFile}))){ $self->{toDoc}='exists'; } else{ $options->{toDoc}=$self->FindAlignDocFile($options->{toDoc}); my %stream=('file' => $options->{toDoc}, 'format' => 'XML', 'root' => 's'); ## make subtree index files (DBM hash) for the source file ## # if ($self->option('MAKESUBTREEINDEX')){ # commented out -> always! $stream{MAKESUBTREEINDEX} = 1; # } if ($self->{AccessMode} ne 'read'){$stream{DocRootTag}='document';} $self->{target}=Uplug::IO::Any->new(\%stream); if (not $self->{target}->open($self->{AccessMode},\%stream)){ return 0; } } return 1; } #------------------------------------------------------------------------- # ReadSegments: read segments from the aligned XML-files (sentences) # # * read sequentially through the data and add sentences with the # requested IDs # * if the ID's don't match --> use the select function for IO::XML # (this is probably also just a sequential read -> might be a problem!) sub ReadSegments{ my ($stream,$data,$IDs,$lang)=@_; if (not ref($stream)){return;} if (not @{$IDs}){return;} #-------------------------------------------------------------- if (not ref($data->{$lang})){ $data->{$lang}=Uplug::Data::Lang->new(); # a new language object } if (ref($data->{$lang.'Sent'}) ne 'ARRAY'){ # the array of data objects $data->{$lang.'Sent'}=[]; # (one for each sentence) } #-------------------------------------------------------------- my $parent=$data->addNode($lang); # set root node = $lang $data->{$lang}->setRoot($parent); # set root node of sub-lang-data my $count = 0; # make a new data object if (not ref($data->{$lang.'Sent'}->[$count])){ # for reading the sentence $data->{$lang.'Sent'}->[$count]= Uplug::Data->new(); } my $sent = $data->{$lang.'Sent'}->[$count]; # $sents points to it! while ($stream->read($sent)){ # read sequentially through the data my $id=$sent->attribute('id'); # get the sentence ID if (not grep ($_ eq $id,@{$IDs})){ # if s-ID is not in the requested: foreach (0..$#{$IDs}){ # use the select-function if (not ref($data->{$lang.'Sent'}->[$_])){ $data->{$lang.'Sent'}->[$_]=Uplug::Data->new(); } my $sent = $data->{$lang.'Sent'}->[$_]; if ($stream->select($sent,{id => $IDs->[$_]})){ my $node=$sent->root(); $data->addNode($parent,$node); } } return; } my $node=$sent->root(); # otherwise: add the sentence $data->addNode($parent,$node); # and continue to read if necessary if ($id eq $IDs->[-1]){last;} $count ++; # still more sentences! if (not ref($data->{$lang.'Sent'}->[$count])){ # make a new data- $data->{$lang.'Sent'}->[$count]= # object if necessary Uplug::Data->new(); } $sent = $data->{$lang.'Sent'}->[$count]; # let $sent point to it } } #------------------------------------------------------------------------- # SearchSegments: search sentences in the aligned XML files # # * similar to ReadSegments but uses the select function in IO::XML # as standard (does not call data->read at all!) # * 'select' is just reading sequentially through the data at the moment # and could actually be used even for ReadSegments (??!) sub SearchSegments{ my ($stream,$data,$IDs,$lang)=@_; if (not ref($stream)){return;} if (not @{$IDs}){return;} #-------------------------------------------------------------- if (not ref($data->{$lang})){ $data->{$lang}=Uplug::Data::Lang->new(); # a new language object } if (ref($data->{$lang.'Sent'}) ne 'ARRAY'){ # the array of data objects $data->{$lang.'Sent'}=[]; # (one for each sentence) } #-------------------------------------------------------------- my $parent=$data->addNode($lang); # set root node = $lang $data->{$lang}->setRoot($parent); # set root node of sub-lang-data foreach (0..$#{$IDs}){ if (not ref($data->{$lang.'Sent'}->[$_])){ $data->{$lang.'Sent'}->[$_]=Uplug::Data->new(); } my $sent = $data->{$lang.'Sent'}->[$_]; if ($stream->select($sent,{id => $IDs->[$_]})){ my $node=$sent->root(); $data->addNode($parent,$node); } } } #------------------------------------------------------------------------- # ReadSegments: old version of ReadSegments # # * reads sentences only if the IDs match OR the current ID is # LOWER than the FIRST sentence ID in the list of requested onces # * advantage: does not use the select function from IO::XML which is # right just a sequential search through the file! this may cause the # program to read through the whole file without finding anything, and # this is slow # (this problem appears if there is an requested ID which is LOWER than # the one at the current file position, or if the requested ID does not # exist in the file) # * use this one instead of the one above by removing the 'Old' in the # sub-name sub ReadSegmentsOld{ my ($stream,$data,$IDs,$lang)=@_; if (not ref($stream)){return;} if (not @{$IDs}){return;} my $i=0; #-------------------------------------------------------------- if (not ref($data->{$lang})){ $data->{$lang}=Uplug::Data::Lang->new(); # a new language object } if (not ref($data->{$lang.'Sent'})){ $data->{$lang.'Sent'}=Uplug::Data->new(); # a new object for reading } #-------------------------------------------------------------- my $parent=$data->addNode($lang); # set root node = $lang $data->{$lang}->setRoot($parent); # set root node of sub-lang-data my $sent=$data->{$lang.'Sent'}; # sentences will be read into $sent my @start=split(/\./,$IDs->[0]); # split start ID (can be like 's3.2.5.2') map(s/[^0-9]//,@start); # delete non-digits my $fail=0; while ($stream->read($sent)){ # read sequentially through the data my $id=$sent->attribute('id'); # get the sentence ID if (not grep ($_ eq $id,@{$IDs})){ # if s-ID is not in the requested my @nr=split(/\./,$id); # split into ID-levels map(s/[^0-9]//,@nr); # delete non-digits foreach my $l (0..$#start){ # compare each ID level if ($nr[$l]<$start[$l]){ # if the current ID is lower: last; # OK! continue reading! (we don't } # have to check deeper levels!) if ($nr[$l]>$start[$l]){ # if larger than the start-ID $fail++;last; # --> fail and stop } } if ($fail>1){last;} # allow to fail once (why ?!?!) # $stream->close; # we could also re-open: takes too # $stream->open('read'); # much time! ... ignore it for now! next; } my $node=$sent->root(); $data->addNode($parent,$node); if ($id eq $IDs->[-1]){last;} $i++; } }