/*

EGYPT Toolkit for Statistical Machine Translation
Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John
Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston,
MA 02111-1307, USA.

*/
/* --------------------------------------------------------------------------*
 *                                                                           *
 * Module : getSentence                                                      *
 *                                                                           *
 * Method Definitions File: getSentence.cc                                   *
 *                                                                           *
 * Objective: Defines classes and methods for handling I/O for the parallel  *
 *            corpus.                                                        *
 *****************************************************************************/

#include "getSentence.h"
#include <iostream>
#include <strstream>
#include "Parameter.h"
#include "errno.h"

int PrintedTooLong=0;

/* -------------- Method Definitions for Class sentenceHandler --------------*/

GLOBAL_PARAMETER(double,ManlexMAX_MULTIPLICITY,"manlexMAX_MULTIPLICITY","",PARLEV_EM,20.0);
GLOBAL_PARAMETER(double,Manlexfactor1,"manlexfactor1","",PARLEV_EM,0.0);
GLOBAL_PARAMETER(double,Manlexfactor2,"manlexfactor2","",PARLEV_EM,0.0);

sentenceHandler::sentenceHandler(const char* filename, vcbList* elist,
                                 vcbList* flist) : realCount(0)
  // This is the class constructor; it also initializes the sentence-pair
  // sequential number (count) to zero.
{
  readflag = false;
  allInMemory = false;
  inputFilename = filename;
  inputFile = new ifstream(filename);
  pair_no = 0;
  if(!(*inputFile)){
    cerr << "\nERROR:(a) Cannot open " << filename;
    exit(1);
  }
  currentSentence = 0;
  totalPairs1 = 0;
  totalPairs2 = 0;
  pair_no = 0;
  noSentInBuffer = 0;
  Buffer.clear();
  bool isNegative = false;
  if (elist && flist){
    cout << "Calculating vocabulary frequencies from corpus " << filename << '\n';
    sentPair s;
    while (getNextSentence(s, elist, flist)){
      totalPairs1++;
      totalPairs2 += s.realCount;
      // NOTE: this value might change during training for words from the
      // manual dictionary, yet this is ignored!
      if( s.noOcc < 0 )
        isNegative = true;
    }
  }
  if( isNegative ){
    cerr << "WARNING: the corpus contains negative occurrence frequencies => these are interpreted as entries of a manual dictionary.\n";
    realCount = new Vector<double>(totalPairs1, 1.0);
  }
  else
    realCount = 0;
}
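/* Illustration (not part of the original source): a minimal sketch of how a
   trainer might drive this class, assuming the vocabulary lists are already
   loaded and "corpus.snt" is a hypothetical token-id corpus file:

     vcbList eList, fList;                         // pre-loaded vocabularies
     sentenceHandler corpus("corpus.snt", &eList, &fList);
     sentPair pair;
     while (corpus.getNextSentence(pair, &eList, &fList))
       ;                                           // one pass over all pairs
     corpus.rewind();                              // prepare the next pass
*/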
void sentenceHandler::rewind()
{
  currentSentence = 0;
  readflag = false;
  if (!allInMemory ||
      !(Buffer.size() >= 1 && Buffer[currentSentence].sentenceNo == 1)){
    // check whether the buffer does not already hold the first chunk of pairs
    if (Buffer.size() > 0)
      cerr << ' ' << Buffer[currentSentence].sentenceNo << '\n';
    // totalPairs = 0;
    pair_no = 0;
    noSentInBuffer = 0;
    Buffer.clear();
  }
  if (!allInMemory){
    delete inputFile;
    inputFile = new ifstream(inputFilename);
    if(!(*inputFile)){
      cerr << "\nERROR:(b) Cannot open " << inputFilename << " " << (int)errno;
    }
  }
}

bool sentenceHandler::getNextSentence(sentPair& sent, vcbList* elist, vcbList* flist)
{
  sentPair s;
  if (readflag){
    cerr << "Attempting to read from the end of the corpus, rewinding\n";
    rewind();
    return(false);
  }
  if (currentSentence >= noSentInBuffer){
    if (allInMemory)
      return(false);
    /* no more sentences in buffer */
    noSentInBuffer = 0;
    currentSentence = 0;
    Buffer.clear();
    cout << "Reading more sentence pairs into memory ... \n";
    while((noSentInBuffer < TRAIN_BUFFER_SIZE) && readNextSentence(s)){
      if ((s.fSent.size()-1) > (MAX_FERTILITY-1) * (s.eSent.size()-1)){
        cerr << "WARNING: The following sentence pair has a source/target sentence length ratio larger than\n"
             << "the maximum allowed limit for a source word fertility\n"
             << " source length = " << s.eSent.size()-1
             << " target length = " << s.fSent.size()-1
             << " ratio " << double(s.fSent.size()-1)/(s.eSent.size()-1)
             << " fertility limit : " << MAX_FERTILITY-1 << '\n';
        cerr << "Shortening sentence \n";
        cerr << s;
        s.eSent.resize(min(s.eSent.size(), s.fSent.size()));
        s.fSent.resize(min(s.eSent.size(), s.fSent.size()));
      }
      Buffer.push_back(s);
      if (elist && flist){
        if ((*elist).size() > 0)
          for (WordIndex i = 0; i < s.eSent.size(); i++){
            if (s.eSent[i] >= (*elist).uniqTokens()){
              if( PrintedTooLong++ < 100 )
                cerr << "ERROR: source word " << s.eSent[i] << " is not in the vocabulary list \n";
              exit(-1);
            }
            (*elist).incFreq(s.eSent[i], s.realCount);
          }
        if ((*flist).size() > 0)
          for (WordIndex j = 1; j < s.fSent.size(); j++){
            if (s.fSent[j] >= (*flist).uniqTokens()){
              cerr << "ERROR: target word " << s.fSent[j] << " is not in the vocabulary list \n";
              exit(-1);
            }
            (*flist).incFreq(s.fSent[j], s.realCount);
          }
      }
      noSentInBuffer++;
    }
    if (inputFile->eof()){
      allInMemory = (Buffer.size() >= 1 &&
                     Buffer[currentSentence].sentenceNo == 1);
      if (allInMemory)
        cout << "Corpus fits in memory, corpus has: " << Buffer.size()
             << " sentence pairs.\n";
    }
  }
  if(noSentInBuffer <= 0){
    //cerr << "# sent in buffer " << noSentInBuffer << '\n';
    readflag = true;
    return(false);
  }
  sent = Buffer[currentSentence++];
  if( sent.noOcc < 0 && realCount ){
    if( Manlexfactor1 && sent.noOcc == -1.0 )
      sent.realCount = Manlexfactor1;
    else if( Manlexfactor2 && sent.noOcc == -2.0 )
      sent.realCount = Manlexfactor2;
    else
      sent.realCount = (*realCount)[sent.getSentenceNo()-1];
  }
  return true;
}

bool sentenceHandler::readNextSentence(sentPair& sent)
  /* This method reads in a new pair of sentences; each pair is read from the
     corpus file as a line triple.  The first line is the number of times this
     pair occurred in the corpus, the second line is the source sentence, and
     the third is the target sentence.  The sentences are represented by
     space-separated positive integer token ids. */
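  /* Illustration (not part of the original source): a corpus entry in this
     format could look like

         1
         10 25 3
         7 81 4 2

     i.e. a pair occurring once, a three-token source sentence, and a
     four-token target sentence, all given as vocabulary token ids. */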
{
  string line;
  bool fail(false);

  sent.clear();
  if (getline(*inputFile, line)){
    istrstream buffer(line.c_str());
    buffer >> sent.noOcc;
    if( sent.noOcc < 0 ){
      if( realCount ){
        if( Manlexfactor1 && sent.noOcc == -1.0 )
          sent.realCount = Manlexfactor1;
        else if( Manlexfactor2 && sent.noOcc == -2.0 )
          sent.realCount = Manlexfactor2;
        else {
          sent.realCount = (*realCount)[pair_no];
        }
      }
      else
        sent.realCount = 1.0;
    }
    else
      sent.realCount = sent.noOcc;
  }
  else {
    fail = true;
  }
  if (getline(*inputFile, line)){
    istrstream buffer(line.c_str());
    WordIndex w; // w is a local variable for a token id
    sent.eSent.push_back(0); // each source sentence is assumed to have
                             // a null word (id 0) at its beginning
    while(buffer >> w){ // read the source sentence, word by word
      if (sent.eSent.size() < MAX_SENTENCE_LENGTH)
        sent.eSent.push_back(w);
      else {
        if( PrintedTooLong++ < 100 )
          cerr << "{WARNING:(a)truncated sentence " << pair_no << "}";
        break;
      }
    }
  }
  else {
    fail = true;
  }
  if (getline(*inputFile, line)){
    istrstream buffer(line.c_str());
    WordIndex w; // w is a local variable for a token id
    sent.fSent.push_back(0); // 0 is inserted for program uniformity
    while(buffer >> w){ // read the target sentence, word by word
      if (sent.fSent.size() < MAX_SENTENCE_LENGTH)
        sent.fSent.push_back(w);
      else {
        if( PrintedTooLong++ < 100 )
          cerr << "{WARNING:(b)truncated sentence " << pair_no << "}";
        break;
      }
    }
  }
  else {
    fail = true;
  }
  if (fail){
    sent.eSent.clear();
    sent.fSent.clear();
    sent.sentenceNo = 0;
    sent.noOcc = 0;
    sent.realCount = 0;
    return(false);
  }
  if( sent.eSent.size() == 1 || sent.fSent.size() == 1 )
    cerr << "ERROR: Forbidden zero sentence length " << sent.sentenceNo << endl;
  sent.sentenceNo = ++pair_no;
  if(pair_no % 100000 == 0)
    cout << "[sent:" << sent.sentenceNo << "]" << '\n';
  return true;
}

double optimize_lambda(Vector<double>& vd)
{
  Vector<double> l;
  // coarse grid search over lambda in [1, ManlexMAX_MULTIPLICITY) with step
  // 0.33, recording how far each candidate is from the target mean of 1.0
  for(double lambda = 1.0; lambda < ManlexMAX_MULTIPLICITY; lambda += 0.33){
    double prod = 0.0;
    for(unsigned int i = 0; i < vd.size(); ++i)
      prod += vd[i]*exp(lambda*vd[i])/(exp(lambda*vd[i])-1.0);
    l.push_back(fabs(prod/vd.size()-1.0));
  }
  double lam = double(min_element(l.begin(), l.end())-l.begin())*0.33+1.0;
  if( lam < 1.0 ){
    cerr << "ERROR: lambda is smaller than one: " << lam << endl;
    for(unsigned int i = 0; i < vd.size(); ++i)
      cerr << vd[i] << ' ';
    cerr << endl;
  }
  return lam;
}

void sentenceHandler::setProbOfSentence(const sentPair& s, double d)
{
  if( realCount == 0 )
    return;
  if( s.noOcc <= 0 ){
    double ed = exp(d);
    // a new block of manual-dictionary entries starts: flush the pending one
    if( oldPairs.size() > 0 && (oldPairs.back().get_eSent() != s.get_eSent()
        || oldPairs.back().getSentenceNo() >= s.getSentenceNo()) ){
      double lambda = optimize_lambda(oldProbs);
      for(unsigned int i = 0; i < oldPairs.size(); ++i){
        if( oldProbs[i] < 1e-5 )
          (*realCount)[oldPairs[i].getSentenceNo()-1] = 1.0;
        else
          (*realCount)[oldPairs[i].getSentenceNo()-1] = lambda*oldProbs[i];
      }
      oldPairs.clear();
      oldProbs.clear();
    }
    oldPairs.push_back(s);
    oldProbs.push_back(ed);
  }
}
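/* Illustration (not part of the original source): a minimal driver for
   optimize_lambda(), which grid-searches lambda in
   [1, ManlexMAX_MULTIPLICITY) with step 0.33 and picks the value for which
   the average of vd[i]*exp(lambda*vd[i])/(exp(lambda*vd[i])-1) is closest
   to 1.  The macro name GETSENTENCE_DEMO and the sample values are
   hypothetical; compile with -DGETSENTENCE_DEMO to try it in isolation. */
#ifdef GETSENTENCE_DEMO
int main()
{
  Vector<double> probs; // pretend per-pair probabilities exp(d)
  probs.push_back(0.50);
  probs.push_back(0.25);
  probs.push_back(0.125);
  cout << "optimal lambda: " << optimize_lambda(probs) << '\n';
  return 0;
}
#endif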