/* William A. Gale and Kenneth W. Church, "A Program for Aligning Sentence in Bilingual Corpora" in Susan Armstong ed. "Using Large Corpora", MIT Press, 1994, p91-102. with Michael D. Riley The following code is the core of align. It is a C language program that inputs two files, with one token (word) per line. The text files contain a number of delimiter tokens: "hard" and "soft". The hard regions (e.g. paragraphs) may not be changed, and there must be equal numbers of them in the two input files. The soft regions (e.g. sentences) may be deleted (1-0), inserted (0-1), contracted (2-1), expanded (1-2) or merged (2-2) as necessary so that the output ends up with the same number of soft regions. The program generates two output files. The two output files contain an equal number of soft regions, each on a line. If the -v command line option is included, each soft region is preceded by its probability score. */ #include #include #include #include #include #include #include #include #include /* usage: align_regions -D '.PARA' -d '.End of Sentence' file1 file2 outputs two files: file1.al & file2.al hard regions are delimited by the -D arg soft regions are delimited by the -d arg */ #define dist(x,y) distances[(x)*((ny)+1)+(y)] #define pathx(x,y) path_x[(x)*((ny)+1)+(y)] #define pathy(x,y) path_y[(x)*((ny)+1)+(y)] #define MAX_FILENAME 256 #define BIG_DISTANCE 2500 /* 1999-06-16/PS: I don't know understand why ET added this line and I got a conflict with the value in /usr/include/values.h. I guess it was need on AIX for some reason. */ #ifdef AIX /* added by ET 960405 */ #define MAXINT (~(1L << (BITS(long) - 1))) #endif /* Dynamic Programming Optimization */ struct alignment { int x1; int y1; int x2; int y2; int d; }; char *hard_delimiter=NULL; /* -D arg */ char *soft_delimiter=NULL; /* -d arg */ int verbose=0; /* -v arg */ /* utility functions */ char *readchars(),**readlines(),**substrings(); void err(); /* added by ET 960405 */ void bcopy(); /* seq_align() by Mike Riley x and y are sequences of objects, represented as non-zero ints, to be aligned. dist_func(x1,y1,x2,y2) is a distance function of 4 args: dist_func(x1,y1, 0, 0) gives cost of substitution of x1 by y1. dist_func(x1, 0, 0, 0) gives cost of deletion of x1. dist_func( 0,y1, 0, 0) gives cost of insertion of y1. dist_func(x1,y1,x2, 0) gives cost of contraction of (x1,x2) to y1. dist_func(x1,y1, 0,y2) gives cost of expansion of x1 to (y1,y2). dist_func(x1,y1,x2,y2) gives cost to match (x1,x2) to (y1,y2). align is the alignment, with (align[i].x1,align[i].x2) aligned with (align[i].y1,align[i].y2). Zero in align[].x1 and align[].y1 correspond to insertion and deletion respectively. Non-zero in align[].x2 and align[].y2 correspond to contraction and expansion, respectively. align[].d gives the distance for that pairing. The function returns the length of the alignment. Function call in main: seq_align(len1,len2,number_of_soft_regions1,number_of_soft_regions2, two_side_distance,&align); */ int seq_align(x,y,nx,ny,dist_funct,align) int *x,*y,nx,ny; int (*dist_funct)(); struct alignment **align; { int *distances,*path_x,*path_y,n; int i,j,oi,oj,di,dj,d1,d2,d3,d4,d5,d6,dmin; struct alignment *ralign; distances=(int *) malloc((nx+1)*(ny+1)*sizeof(int)); path_x=(int *) malloc((nx+1)*(ny+1)*sizeof(int)); path_y=(int *) malloc((nx+1)*(ny+1)*sizeof(int)); ralign=(struct alignment *) malloc((nx+ny)*sizeof(struct alignment)); printf("seq_align: x:%d y:%d nx:%d ny:%d \n",x[0],y[0],nx,ny);fflush(stdout); for (j=0;j<=ny;j++) { for (i=0;i<=nx;i++) { printf("***seq_align: %d %d %d %d\n",i,j,x[i],y[j]);fflush(stdout); /* substitution */ d1=i>0&&j>0 ? dist(i-1,j-1)+(*dist_funct)(x[i-1],y[j-1],0,0):MAXINT; /* deletion */ d2=i>0 ? dist(i-1,j)+(*dist_funct)(x[i-1],0,0,0):MAXINT; /* insertion */ d3=j>0 ? dist(i,j-1)+(*dist_funct)(0,y[j-1],0,0):MAXINT; /* contraction */ d4=i>1&&j>0 ? dist(i-2,j-1)+(*dist_funct)(x[i-2],y[j-1],x[i-1],0):MAXINT; /* expansion */ d5=i>0&&j>1 ? dist(i-1,j-2)+(*dist_funct)(x[i-1],y[j-2],0,y[j-1]):MAXINT; /* melding */ d6=i>1&&j>1 ? dist(i-2,j-2)+(*dist_funct)(x[i-2],y[j-2],x[i-1],y[j-1]) : MAXINT; printf("seq_align: "); if (d10||j>0;i=oi,j=oj) { oi=pathx(i,j); oj=pathy(i,j); di=i-oi; dj=j-oj; if (di==1&&dj==1) { /* substitution */ ralign[n].x1=x[i-1]; ralign[n].y1=y[j-1]; ralign[n].x2=0; ralign[n].y2=0; ralign[n++].d=dist(i,j)-dist(i-1,j-1); } else if (di==1&&dj==0) { /* deletion */ ralign[n].x1=x[i-1]; ralign[n].y1=0; ralign[n].x2=0; ralign[n].y2=0; ralign[n++].d=dist(i,j)-dist(i-1,j-1); } else if (di==0&&dj==1) { /* insertion */ ralign[n].x1=0; ralign[n].y1=y[j-1]; ralign[n].x2=0; ralign[n].y2=0; ralign[n++].d=dist(i,j)-dist(i-1,j-1); } else if (dj==1) { /* contraction */ ralign[n].x1=x[i-2]; ralign[n].y1=y[j-1]; ralign[n].x2=x[i-1]; ralign[n].y2=0; ralign[n++].d=dist(i,j)-dist(i-1,j-1); } else if (di==1) { /* expansion */ ralign[n].x1=x[i-1]; ralign[n].y1=y[j-2]; ralign[n].x2=0; ralign[n].y2=y[j-1]; ralign[n++].d=dist(i,j)-dist(i-1,j-1); } else /* (di==2&&dj==2) */ { /* melding */ ralign[n].x1=x[i-2]; ralign[n].y1=y[j-2]; ralign[n].x2=x[i-1]; ralign[n].y2=y[j-1]; ralign[n++].d=dist(i,j)-dist(i-1,j-1); } } *align=(struct alignment *) malloc(n*sizeof(struct alignment)); for (i=0;i0) return((int)(-100*log(pd))); else return(BIG_DISTANCE); } int two_side_distance(x1,y1,x2,y2) int x1,y1,x2,y2; { int penalty21=230; /* -100*log[prob of 2-1 match] / [prob of 1-1 match]) */ int penalty22=440; /* -100*log[prob of 2-2 match] / [prob of 1-1 match]) */ int penalty01=450; /* -100*log[prob of 0-1 match] / [prob of 1-1 match]) */ /*printf("two_side_distance: %d %d %d %d\n",x1,y1,x2,y2);fflush(stdout);*/ if (x2==0&&y2==0) if (x1==0) return(match(x1,y1)+penalty01); /* insertion */ else if (y1==0) return(match(x1,y1)+penalty01); /* deletion */ else return(match(x1,y1)); /* substitution */ else if (x2==0) return(match(x1,y1+y2)+penalty21); /* expansion */ else if (y2==0) return(match(x1+x2,y1)+penalty21); /* contraction */ else return(match(x1+x2,y1+y2)+penalty22); /* melding */ } /* Functions for Manipulating Regions */ struct region { char **lines; int length; }; void print_region(fd,region,score) int score; FILE *fd; struct region *region; { char **lines,**end; lines=region->lines; end=lines+region->length; for (;lineslines; end=lines+region->length; result=end-lines; for (;lineslines; printf("find_sub_regions:2\n");fflush(stdout); end=lines+region->length; for (l=lines;lx1,a->x2, a->y1, a->y2, a->d); */ if (a->x2>0) ix++; else if (a->x1==0) ix--; if (a->y2>0) iy++; else if (a->y1==0) iy--; if (a->x2 > 0 && a->x1 == 0 && a->y1 == 0 && a->y2 == 0) iy++; else if (a->y2 > 0 && a->y1 == 0 && a->x1 == 0 && a->x2 == 0) ix++; else if (a->x1==0&&a->y1==0&&a->x2==0&&a->y2==0) { ix++; iy++; } ix++; iy++; fprintf(stderr, "ix=%d prevx=%d; iy=%d prevy=%d\n",ix, prevx, iy, prevy); if (verbose) { fprintf(out1,".Score %d\n",a->d); fprintf(out2,".Score %d\n",a->d); } for (;prevxd); fprintf(out1,"%s %d.%d\n",soft_delimiter,hardCounter,softCounter); for (;prevyd); fprintf(out2,"%s %d.%d\n",soft_delimiter,hardCounter,softCounter); softCounter++; } fprintf(out1,"%s\n",hard_delimiter); fprintf(out2,"%s\n",hard_delimiter); free(align); free(soft_regions1); free(soft_regions2); free(len1); free(len2); hardCounter++; } /* added by ET 960405 */ exit(0); } /* Utility functions */ void err(msg) char *msg; { fprintf(stderr,"**ERROR**: %s\n",msg); exit(2); } /* return the contents of the file as a string and stuff the length of this string into len_ptr */ char *readchars(filename,len_ptr) char *filename; int *len_ptr; { FILE *fd; char *result; struct stat stat_buf; fd=fopen(filename,"r"); if (fd==NULL) err("open failed"); if (fstat(fileno(fd),&stat_buf)==-1) err("stat failed"); *len_ptr=stat_buf.st_size; result=malloc(*len_ptr); if (result==NULL) err("malloc3 failed\n"); if (fread(result,sizeof(char),*len_ptr,fd)!=*len_ptr) err("fread failed"); if (fclose(fd)==-1) err("fclose failed"); return(result); } /* split string into a number of substrings delimited by a delimiter chararcter return an array of substrings stuff the length of this arry into len_ptr */ char **substrings(string,end,delimiter,len_ptr) char *string,*end,delimiter; int *len_ptr; { char *s,**result; int i=0; while (string