#!/local/bin/perl -w
#
# this is a sentence aligner based on Phillip Koehn's implementation used
# for aligning the EuroParl corpus
# http://people.csail.mit.edu/koehn/publications/europarl/
# (and his aligner is based on Gale&Chruch's length-based alignment approach)
#
# USAGE: sentalign [OPTIONS] srcfile trgfile > alignfile
#
# OPTIONS
#
#  -h 'hard-tag-re' .......... regular expression to match hard boundary tags
#
#

use strict;
use vars qw($opt_h);
use Getopt::Std;
use XML::Parser;


getopts('h:');

my $SENT_TAG = 's';
my $HARD_TAG = $opt_h;        # hard tag is a regular expression!



my $src_file = shift(@ARGV);
my $trg_file = shift(@ARGV);



my $src_parser = new XML::Parser(Handlers => {Start => \&XmlStart,
					      End => \&XmlEnd,
					      Default => \&XmlChar,
					  },);


my $trg_parser = new XML::Parser(Handlers => {Start => \&XmlStart,
					      End => \&XmlEnd,
					      Default => \&XmlChar,
					  },);


open SRC,"<$src_file" || die "cannot open $src_file!\n";
open TRG,"<$trg_file" || die "cannot open $trg_file!\n";

my $src_ph = $src_parser->parse_start;
my $trg_ph = $trg_parser->parse_start;


ParseBitext(*SRC,*TRG,$src_ph,$trg_ph);

#
# if same number of levels
#    foreach level
#       align at that level
#          adjust segmentation and go to next level
#
#



# &print_xmlheader($src_file,$trg_file);

# my ($src_len,$trg_len,$src_id,$trg_id);
# my $LINK_ID=0;





#    my @links = &sentence_align($src_len,$trg_len,$src_id,$trg_id);
#    print_alignments(\@links);


close SRC;
close TRG;



&print_xmlfooter();


sub print_xmlheader{
    my ($src_file,$trg_file) = @_;

    print '<?xml version="1.0" encoding="utf-8"?>'."\n";
    print '<!DOCTYPE cesAlign PUBLIC "-//CES//DTD XML cesAlign//EN" "">'."\n";
    print '<cesAlign toDoc="'.$trg_file;
    print '" version="1.0" fromDoc="'.$src_file;
    print '">'."\n";
    print ' <linkGrp targType="s" toDoc="'.$trg_file;
    print '" fromDoc="'.$src_file.'">'."\n";
}

sub print_xmlfooter{
    print "</linkGrp>\n";
    print "</cesAlign>\n";
}














sub ParseBitext{
    my ($SFH,$TFH,$src_parser,$trg_parser) = @_;

    delete $src_parser->{LEVEL};    # current level in XML tree
    delete $trg_parser->{LEVEL};

    delete $src_parser->{START};    # start positions of each tag at each level
    delete $trg_parser->{START};

    delete $src_parser->{END};      # end positions of each tag at each level
    delete $trg_parser->{END};

    delete $src_parser->{IDS};      # end positions of each tag at each level
    delete $trg_parser->{IDS};

    delete $src_parser->{CHILDREN};
    delete $trg_parser->{CHILDREN};

    delete $src_parser->{SENT_OPEN}; # flag to indicate that we 
    delete $trg_parser->{SENT_OPEN}; # are in a sentence

    while (my $line = <$SFH>){
	$src_parser->parse_more($line);
    }

    while (my $line = <$TFH>){
	$trg_parser->parse_more($line);
    }
}





sub XmlStart{
    my ($p,$e,%attr) = @_;

    $p->{LEVEL}=0 unless defined $p->{LEVEL};
    $p->{POSITION}=0 unless defined $p->{POSITION};

    ## above sentence level

    if (not $p->{SENT_OPEN}){
	$p->{TAGS}->[$p->{LEVEL}]->{$e}++;
	$p->{TAGS}->[$p->{LEVEL}]->{__ALL__}++;
	push (@{$p->{START}->[$p->{LEVEL}]},$p->{POSITION});
	push (@{$p->{IDS}->[$p->{LEVEL}]},$attr{id});

	if ($p->{LEVEL} > 0){
	    my $parent = $#{$p->{START}->[$p->{LEVEL}-1]};
	    push(@{$p->{CHILDREN}->[$p->{LEVEL}-1]->[$parent]},
		 $#{$p->{START}->[$p->{LEVEL}]});
	}

	$p->{LEVEL}++;
	if ($e eq $SENT_TAG){
	    $p->{SENT_OPEN} = 1;
	}
    }
}


sub XmlEnd{
    my $p=shift;
    my $e=shift;
    if ($e eq $SENT_TAG){
	delete $p->{SENT_OPEN};
    }

    if (not $p->{SENT_OPEN}){
	$p->{LEVEL}--;
	push (@{$p->{END}->[$p->{LEVEL}]},$p->{POSITION});
    }
}

sub XmlChar{
    my $p=shift;
    my $e=shift;

    $p->{POSITION} += length($e);

#    if ($p->{SENT_OPEN}){
#	$p->{SENT_LENGTHS}->[-1] += length($e);
#    }

}







sub print_alignments{
    my $LINKS = shift;
    foreach my $i (0..$#{$LINKS}){
	print '<link id="SL'.$LINK_ID.'" xtargets="';
	print join(' ',@{$$LINKS[$i]{src}});
	print ';';
	print join(' ',@{$$LINKS[$i]{trg}});
	print "\" />\n";
	$LINK_ID++;
    }
}







# this is a vanilla implementation of church and gale
sub sentence_align {
  my ($LEN1,$LEN2,$IDS1,$IDS2) = @_;

  # parameters
  my %PRIOR;
  $PRIOR{1}{1} = 0.89;
  $PRIOR{1}{0} = 0.01/2;
  $PRIOR{0}{1} = 0.01/2;
  $PRIOR{2}{1} = 0.089/2;
  $PRIOR{1}{2} = 0.089/2;
#  $PRIOR{2}{2} = 0.011;
  

  # dynamic programming
  my (@COST,@BACK);
  $COST[0][0] = 0;
  for(my $i1=0;$i1<=$#{$LEN1};$i1++) {
    for(my $i2=0;$i2<=$#{$LEN2};$i2++) {
      next if $i1 + $i2 == 0;
      $COST[$i1][$i2] = 1e10;
      foreach my $d1 (keys %PRIOR) {
	next if $d1>$i1;
	foreach my $d2 (keys %{$PRIOR{$d1}}) {
	  next if $d2>$i2;
	  my $cost = $COST[$i1-$d1][$i2-$d2] - log($PRIOR{$d1}{$d2}) +  
	    &match($$LEN1[$i1]-$$LEN1[$i1-$d1], $$LEN2[$i2]-$$LEN2[$i2-$d2]);
#	  print "($i1->".($i1-$d1).",$i2->".($i2-$d2).") [".($LEN1[$i1]-$LEN1[$i1-$d1]).",".($LEN2[$i2]-$LEN2[$i2-$d2])."] = $COST[$i1-$d1][$i2-$d2] - ".log($PRIOR{$d1}{$d2})." + ".&match($LEN1[$i1]-$LEN1[$i1-$d1], $LEN2[$i2]-$LEN2[$i2-$d2])." = $cost\n";
	  if ($cost < $COST[$i1][$i2]) {
	    $COST[$i1][$i2] = $cost;
	    @{$BACK[$i1][$i2]} = ($i1-$d1,$i2-$d2);
	  }
	}
      }
#      print $COST[$i1][$i2]."($i1-$BACK[$i1][$i2][0],$i2-$BACK[$i1][$i2][1]) ";
    }
#    print "\n";
  }


  # back tracking
  my %NEXT=();
  my @LINKS=();

  my $i1 = $#{$LEN1};
  my $i2 = $#{$LEN2};

  while($i1>0 || $i2>0) {
#    print "back $i1 $i2\n";
      @{$NEXT{$BACK[$i1][$i2][0]}{$BACK[$i1][$i2][1]}} = ($i1,$i2);
      ($i1,$i2) = ($BACK[$i1][$i2][0],$BACK[$i1][$i2][1]);
  }
  while($i1<$#{$LEN1} || $i2<$#{$LEN2}) {
      push(@LINKS,{});
#     print "fwd $i1 $i2\n";
#      print '<link id="SL'.$LINK_ID.'" xtargets="';
      for(my $i=$i1;$i<$NEXT{$i1}{$i2}[0];$i++) {
	  my $sid = $$IDS1{$$LEN1[$i]};
	  push(@{$LINKS[-1]{src}},$sid);
#	  print " " unless $i == $i1;
#	  print $$IDS1[$i];
      }
#      print ";";
      for(my $i=$i2;$i<$NEXT{$i1}{$i2}[1];$i++) {
	  my $sid = $$IDS2{$$LEN2[$i]};
	  push(@{$LINKS[-1]{trg}},$sid);
#	  push(@{$LINKS[-1]{trg}},$$LEN2[$i]);
#	  print " " unless $i == $i2;
#	  print $$IDS2[$i];
      }
#      print "\" />\n";
      ($i1,$i2) = @{$NEXT{$i1}{$i2}};
  }  

  return @LINKS;
}



sub match {
  my ($len1,$len2) = @_;
  my $c = 1;
  my $s2 = 6.8;

  if ($len1==0 && $len2==0) { return 0; }
  my $mean = ($len1 + $len2/$c) / 2;
  my $z = ($c * $len1 - $len2)/sqrt($s2 * $mean);
  if ($z < 0) { $z = -$z; }
  my $pd = 2 * (1 - &pnorm($z));
  if ($pd>0) { return -log($pd); }
  return 25;
}

sub pnorm {
  my ($z) = @_;
  my $t = 1/(1 + 0.2316419 * $z);
  return 1 - 0.3989423 * exp(-$z * $z / 2) *
    ((((1.330274429 * $t 
	- 1.821255978) * $t 
       + 1.781477937) * $t 
      - 0.356563782) * $t
     + 0.319381530) * $t;
}
