/*

EGYPT Toolkit for Statistical Machine Translation
Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight,
John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith,
and David Yarowsky.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.

*/
#include <sstream>  // assumed standard header: the original <...> name was not preserved
#include <ctime>    // time(), difftime(), ctime()
#include <cstring>  // strlen()
#include <cstdlib>  // atoi()
#include "getSentence.h"
#include "TTables.h"
#include "model1.h"
#include "model2.h"
#include "model3.h"
#include "hmm.h"
#include "file_spec.h"
#include "defs.h"
#include "vocab.h"
#include "Perplexity.h"
#include "Dictionary.h"
#include "utility.h"
#include "Parameter.h"
#include "myassert.h"
#include "D4Tables.h"
#include "D5Tables.h"
#include "transpair_model4.h"
#include "transpair_model5.h"

#define ITER_M2 0
#define ITER_MH 5

GLOBAL_PARAMETER3(int,Model1_Iterations,"Model1_Iterations","NO. ITERATIONS MODEL 1","m1","number of iterations for Model 1",PARLEV_ITER,5);
GLOBAL_PARAMETER3(int,Model2_Iterations,"Model2_Iterations","NO. ITERATIONS MODEL 2","m2","number of iterations for Model 2",PARLEV_ITER,ITER_M2);
GLOBAL_PARAMETER3(int,HMM_Iterations,"HMM_Iterations","mh","number of iterations for HMM alignment model","mh",PARLEV_ITER,ITER_MH);
GLOBAL_PARAMETER3(int,Model3_Iterations,"Model3_Iterations","NO. ITERATIONS MODEL 3","m3","number of iterations for Model 3",PARLEV_ITER,5);
GLOBAL_PARAMETER3(int,Model4_Iterations,"Model4_Iterations","NO. ITERATIONS MODEL 4","m4","number of iterations for Model 4",PARLEV_ITER,5);
GLOBAL_PARAMETER3(int,Model5_Iterations,"Model5_Iterations","NO. ITERATIONS MODEL 5","m5","number of iterations for Model 5",PARLEV_ITER,0);
GLOBAL_PARAMETER3(int,Model6_Iterations,"Model6_Iterations","NO. ITERATIONS MODEL 6","m6","number of iterations for Model 6",PARLEV_ITER,0);

GLOBAL_PARAMETER(float,PROB_SMOOTH,"probSmooth","probability smoothing (floor) value",PARLEV_OPTHEUR,1e-7);
GLOBAL_PARAMETER(float,MINCOUNTINCREASE,"minCountIncrease","minimal count increase",PARLEV_OPTHEUR,1e-7);

GLOBAL_PARAMETER2(int,Transfer_Dump_Freq,"TRANSFER DUMP FREQUENCY","t2to3","output: dump of transfer from Model 2 to 3",PARLEV_OUTPUT,0);
GLOBAL_PARAMETER2(bool,Verbose,"verbose","v","0: not verbose; 1: verbose",PARLEV_OUTPUT,0);
GLOBAL_PARAMETER(bool,Log,"log","0: no logfile; 1: logfile",PARLEV_OUTPUT,0);

GLOBAL_PARAMETER(double,P0,"p0","fixed value for parameter p_0 in IBM-3/4 (if negative, it is determined in training)",PARLEV_EM,-1.0);
GLOBAL_PARAMETER(double,M5P0,"m5p0","fixed value for parameter p_0 in IBM-5 (if negative, it is determined in training)",PARLEV_EM,-1.0);
GLOBAL_PARAMETER3(bool,Peg,"pegging","p","DO PEGGING? (Y/N)","0: no pegging; 1: do pegging",PARLEV_EM,0);
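// Every value registered through the GLOBAL_PARAMETER* macros above can be
// overridden on the command line (or via a .gizacfg configuration file)
// under the short or long name given in the macro.  A minimal sketch of an
// invocation, with hypothetical file names, that overrides the per-model
// iteration counts:
//
//   GIZA++ -S source.vcb -T target.vcb -C corpus.snt \
//          -m1 5 -mh 5 -m3 3 -m4 3 -m5 0
//
// i.e. 5 Model-1 iterations, 5 HMM iterations, then 3 iterations each of
// Model 3 and Model 4, skipping Model 5 -- the usual GIZA++ pipeline.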
GLOBAL_PARAMETER(short,OldADBACKOFF,"adbackoff","",-1,0);
GLOBAL_PARAMETER2(unsigned int,MAX_SENTENCE_LENGTH,"ml","MAX SENTENCE LENGTH","maximum sentence length",0,MAX_SENTENCE_LENGTH_ALLOWED);

GLOBAL_PARAMETER(short,DeficientDistortionForEmptyWord,"DeficientDistortionForEmptyWord","0: IBM-3/IBM-4 as described in (Brown et al. 1993); 1: distortion model of empty word is deficient; 2: distortion model of empty word is deficient (differently); setting this parameter also helps to avoid aligning too many words with the empty word during IBM-3 and IBM-4 training",PARLEV_MODELS,0);

short OutputInAachenFormat=0;
bool Transfer=TRANSFER;
bool Transfer2to3=0;
short NoEmptyWord=0;
bool FEWDUMPS=0;

GLOBAL_PARAMETER(bool,ONLYALDUMPS,"ONLYALDUMPS","1: do not write any files",PARLEV_OUTPUT,0);
GLOBAL_PARAMETER(short,CompactAlignmentFormat,"CompactAlignmentFormat","0: detailed alignment format, 1: compact alignment format",PARLEV_OUTPUT,0);
GLOBAL_PARAMETER2(bool,NODUMPS,"NODUMPS","NO FILE DUMPS? (Y/N)","1: do not write any files",PARLEV_OUTPUT,0);

GLOBAL_PARAMETER(WordIndex,MAX_FERTILITY,"MAX_FERTILITY","maximal fertility for fertility models",PARLEV_EM,10);

// One map per sentence pair: (position, position) -> 'S' (sure) or 'P' (possible) link.
Vector<map< pair<int,int>,char > > ReferenceAlignment;

bool useDict = false;
string CoocurrenceFile;
string Prefix, LogFilename, OPath, Usage,
       SourceVocabFilename, TargetVocabFilename, CorpusFilename,
       TestCorpusFilename, t_Filename, a_Filename, p0_Filename,
       d_Filename, n_Filename, dictionary_Filename;
ofstream logmsg;

// Convert a non-negative integer into its decimal string representation.
const string str2Num(int n)
{
  string number = "";
  do {
    number.insert((size_t)0, 1, (char)(n % 10 + '0'));
  } while ((n /= 10) > 0);
  return number;
}

double LAMBDA=1.09;
sentenceHandler *testCorpus=0, *corpus=0;
Perplexity trainPerp, testPerp, trainViterbiPerp, testViterbiPerp;
string ReadTablePrefix;
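// Four running perplexities are tracked: {train, test} x {EM, Viterbi}.
// LAMBDA is the normalization constant used when reporting perplexities for
// IBM-1/IBM-2/HMM; StartTraining() below recomputes it from the corpus as
//
//   LAMBDA = (# target tokens) / ((# source tokens) - (# sentence pairs))
//
// (this reading is taken from the computation in StartTraining(); the
// initializer 1.09 is just a default that training overwrites).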
void printGIZAPars(ostream&out)
{
  out << "general parameters:\n"
         "-------------------\n";
  printPars(out,getGlobalParSet(),0);
  out << '\n';

  out << "No. of iterations:\n"
         "-------------------\n";
  printPars(out,getGlobalParSet(),PARLEV_ITER);
  out << '\n';

  out << "parameters for various heuristics in GIZA++ for efficient training:\n"
         "--------------------------------------------------------------------\n";
  printPars(out,getGlobalParSet(),PARLEV_OPTHEUR);
  out << '\n';

  out << "parameters describing the type and amount of output:\n"
         "-----------------------------------------------------\n";
  printPars(out,getGlobalParSet(),PARLEV_OUTPUT);
  out << '\n';

  out << "parameters describing input files:\n"
         "----------------------------------\n";
  printPars(out,getGlobalParSet(),PARLEV_INPUT);
  out << '\n';

  out << "smoothing parameters:\n"
         "---------------------\n";
  printPars(out,getGlobalParSet(),PARLEV_SMOOTH);
  out << '\n';

  out << "parameters modifying the models:\n"
         "--------------------------------\n";
  printPars(out,getGlobalParSet(),PARLEV_MODELS);
  out << '\n';

  out << "parameters modifying the EM-algorithm:\n"
         "--------------------------------------\n";
  printPars(out,getGlobalParSet(),PARLEV_EM);
  out << '\n';
}

// Strip the directory part from a file name; returns a pointer into fullpath.
const char* stripPath(const char* fullpath)
{
  const char *ptr = fullpath + strlen(fullpath) - 1;
  while (ptr && ptr > fullpath && *ptr != '/')
    { ptr--; }
  if (*ptr == '/')
    return ptr + 1;
  else
    return ptr;
}
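// printDecoderConfigFile() below writes a <prefix>.Decoder.config file for
// the Rewrite Decoder.  A sketch of the key lines it emits (the actual names
// depend on the -o prefix; "prefix" here is hypothetical):
//
//   TTable        = prefix.t3.final
//   InverseTTable = prefix.ti.final
//   NTable        = prefix.n3.final
//   D3Table       = prefix.d3.final
//   D4Table       = prefix.D4.final
//   PZero         = prefix.p0_3.final
//   FZeroWords    = prefix.fe0_3.final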
void printDecoderConfigFile()
{
  string decoder_config_file = Prefix + ".Decoder.config";
  cerr << "writing decoder configuration file to " << decoder_config_file.c_str() << '\n';
  ofstream decoder(decoder_config_file.c_str());
  if (!decoder) {
    cerr << "\nCannot write to " << decoder_config_file << '\n';
    exit(1);
  }
  decoder << "# Template for Configuration File for the Rewrite Decoder\n# Syntax:\n"
          << "#   <Variable> = <value>\n#   '#' is the comment character\n"
          << "#================================================================\n"
          << "#================================================================\n"
          << "# LANGUAGE MODEL FILE\n# The full path and file name of the language model file:\n";
  decoder << "LanguageModelFile =\n";
  decoder << "#================================================================\n"
          << "#================================================================\n"
          << "# TRANSLATION MODEL FILES\n# The directory where the translation model tables as created\n"
          << "# by Giza are located:\n#\n"
          << "# Notes: - All translation model \"source\" files are assumed to be in\n"
          << "#          TM_RawDataDir, the binaries will be put in TM_BinDataDir\n"
          << "#\n#        - Attention: RELATIVE PATH NAMES DO NOT WORK!!!\n"
          << "#\n#        - Absolute paths (file name starts with /) will override\n"
          << "#          the default directory.\n\n";
  // strip the file-prefix part and keep only the path name in Prefix
  string path = Prefix.substr(0, Prefix.find_last_of("/") + 1);
  if (path == "")
    path = ".";
  decoder << "TM_RawDataDir = " << path << '\n';
  decoder << "TM_BinDataDir = " << path << '\n' << '\n';
  decoder << "# file names of the TM tables\n# Notes:\n"
          << "# 1. TTable and InversTTable are expected to use word IDs not\n"
          << "#    strings (Giza produces both, whereby the *.actual.* files\n"
          << "#    use strings and are THE WRONG CHOICE.\n"
          << "# 2. FZeroWords, on the other hand, is a simple list of strings\n"
          << "#    with one word per line. This file is typically edited\n"
          << "#    manually. However, the one listed here is generated by GIZA\n\n";

  int lastmodel;
  if (Model5_Iterations > 0)
    lastmodel = 5;
  else if (Model4_Iterations > 0)
    lastmodel = 4;
  else if (Model3_Iterations > 0)
    lastmodel = 3;
  else if (Model2_Iterations > 0)
    lastmodel = 2;
  else
    lastmodel = 1;
  string lastModelName = str2Num(lastmodel);

  string p = Prefix + ".t" + /*lastModelName*/"3" + ".final";
  decoder << "TTable = " << stripPath(p.c_str()) << '\n';
  p = Prefix + ".ti.final";
  decoder << "InverseTTable = " << stripPath(p.c_str()) << '\n';
  p = Prefix + ".n" + /*lastModelName*/"3" + ".final";
  decoder << "NTable = " << stripPath(p.c_str()) << '\n';
  p = Prefix + ".d" + /*lastModelName*/"3" + ".final";
  decoder << "D3Table = " << stripPath(p.c_str()) << '\n';
  p = Prefix + ".D4.final";
  decoder << "D4Table = " << stripPath(p.c_str()) << '\n';
  p = Prefix + ".p0_" + /*lastModelName*/"3" + ".final";
  decoder << "PZero = " << stripPath(p.c_str()) << '\n';
  decoder << "Source.vcb = " << SourceVocabFilename << '\n';
  decoder << "Target.vcb = " << TargetVocabFilename << '\n';
  decoder << "Source.classes = " << SourceVocabFilename + ".classes" << '\n';
  decoder << "Target.classes = " << TargetVocabFilename + ".classes" << '\n';
  p = Prefix + ".fe0_" + /*lastModelName*/"3" + ".final";
  decoder << "FZeroWords = " << stripPath(p.c_str()) << '\n';
}

// Read one sentence's worth of reference-alignment links into s.  Returns 1
// while more input follows, 0 at end of input or on a sentence-number mismatch.
int readNextSent(istream&is, map< pair<int,int>,char >&s, int&number)
{
  string x;
  if (!(is >> x))
    return 0;
  if (x == "SENT:")
    is >> x;
  int n = atoi(x.c_str());
  if (number == -1)
    number = n;
  else if (number != n) {
    cerr << "ERROR: readNextSent: DIFFERENT NUMBERS: " << number << " " << n << '\n';
    return 0;
  }
  int nS, nP, nO;
  nS = nP = nO = 0;
  while (is >> x) {
    if (x == "SENT:")
      return 1;
    int n1, n2;
    is >> n1 >> n2;
    map< pair<int,int>,char >::const_iterator i = s.find(pair<int,int>(n1, n2));
    if (i == s.end() || i->second == 'P')   // sure ('S') links take precedence over possible ('P') ones
      s[pair<int,int>(n1, n2)] = x[0];
    massert(x[0]=='S' || x[0]=='P');
    nS += (x[0] == 'S');
    nP += (x[0] == 'P');
    nO += (!(x[0] == 'S' || x[0] == 'P'));
  }
  return 1;
}

// Clear the per-sentence link map; always true so it can sit in a loop condition.
bool emptySent(map< pair<int,int>,char >&x)
{
  x = map< pair<int,int>,char >();
  return 1;
}

void ReadAlignment(const string&x, Vector<map< pair<int,int>,char > >&a)
{
  ifstream infile(x.c_str());
  a.clear();
  map< pair<int,int>,char > sent;
  int number = 0;
  while (emptySent(sent) && readNextSent(infile, sent, number)) {
    if (int(a.size()) != number)
      cerr << "ERROR: ReadAlignment: " << a.size() << " " << number << '\n';
    a.push_back(sent);
    number++;
  }
  cout << "Read: " << a.size() << " sentences in reference alignment." << '\n';
}
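// The reference-alignment file parsed above is a sequence of records of the
// form (layout inferred from readNextSent(); the numbers are illustrative):
//
//   SENT: 0
//   S 0 0
//   S 1 2
//   P 2 1
//   SENT: 1
//   ...
//
// Each link line is a label followed by two zero-based word positions,
// where 'S' marks a sure link and 'P' a possible link.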
void initGlobals(void)
{
  NODUMPS = false;
  Prefix = Get_File_Spec();
  LogFilename = Prefix + ".log";
  MAX_SENTENCE_LENGTH = MAX_SENTENCE_LENGTH_ALLOWED;
}

void convert(const map< pair<int,int>,char >&reference, alignment&x)
{
  int l = x.get_l();
  int m = x.get_m();
  for (map< pair<int,int>,char >::const_iterator i = reference.begin(); i != reference.end(); ++i) {
    if (i->first.first + 1 > int(m)) {
      cerr << "ERROR: m too big: " << i->first.first << " " << i->first.second+1 << " " << l << " " << m << " is wrong.\n";
      continue;
    }
    if (i->first.second + 1 > int(l)) {
      cerr << "ERROR: l too big: " << i->first.first << " " << i->first.second+1 << " " << l << " " << m << " is wrong.\n";
      continue;
    }
    if (x(i->first.first + 1) != 0)
      cerr << "ERROR: position " << i->first.first+1 << " already set\n";
    x.set(i->first.first + 1, i->first.second + 1);
  }
}

// Compare a Viterbi alignment against the reference for one sentence pair;
// accumulates spurious links (toomuch) and missed sure links (missing).
double ErrorsInAlignment(const map< pair<int,int>,char >&reference, const Vector<WordIndex>&test, int l,
                         int&missing, int&toomuch, int&eventsMissing, int&eventsToomuch, int pair_no)
{
  int err = 0;
  for (unsigned int j = 1; j < test.size(); j++) {
    if (test[j] > 0) {
      map< pair<int,int>,char >::const_iterator i = reference.find(make_pair(test[j]-1, j-1));
      if (i == reference.end()) {
        toomuch++;
        err++;
      } else if (!(i->second == 'S' || i->second == 'P'))
        cerr << "ERROR: wrong symbol in reference alignment '" << i->second << ' ' << int(i->second) << " no:" << pair_no << "'\n";
      eventsToomuch++;
    }
  }
  for (map< pair<int,int>,char >::const_iterator i = reference.begin(); i != reference.end(); ++i) {
    if (i->second == 'S') {
      unsigned int J = i->first.second + 1;
      unsigned int I = i->first.first + 1;
      if (int(J) >= int(test.size()) || int(I) > int(l) || int(J) < 1 || int(I) < 1)
        cerr << "ERROR: alignment outside of range in reference alignment " << J << " " << test.size()
             << " (" << I << " " << l << ") no:" << pair_no << '\n';
      else {
        if (test[J] != I) {
          missing++;
          err++;
        }
      }
      eventsMissing++;
    }
  }
  if (Verbose)
    cout << err << " errors in sentence\n";
  if (eventsToomuch + eventsMissing)
    return (toomuch + missing) / double(eventsToomuch + eventsMissing);  // real-valued error rate
  else
    return 1.0;
}

vcbList *globeTrainVcbList, *globfTrainVcbList;
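// Per sentence, ErrorsInAlignment() above returns
//
//   (missing + toomuch) / (eventsMissing + eventsToomuch)
//
// where eventsToomuch counts links proposed by the model, eventsMissing
// counts sure ('S') links in the reference, toomuch counts proposed links
// absent from the reference, and missing counts sure links the model failed
// to reproduce.  This is the same flavour of measure as the usual alignment
// error rate (AER), though not literally the published AER formula -- a
// reading of the code above, not a claim from the original documentation.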
double StartTraining(int&result)
{
  double errors = 0.0;
  vcbList eTrainVcbList, fTrainVcbList;
  globeTrainVcbList = &eTrainVcbList;
  globfTrainVcbList = &fTrainVcbList;

  string repFilename = Prefix + ".gizacfg";
  ofstream of2(repFilename.c_str());
  writeParameters(of2, getGlobalParSet(), -1);

  cout << "reading vocabulary files\n";
  eTrainVcbList.setName(SourceVocabFilename.c_str());
  fTrainVcbList.setName(TargetVocabFilename.c_str());
  eTrainVcbList.readVocabList();
  fTrainVcbList.readVocabList();
  cout << "Source vocabulary list has " << eTrainVcbList.uniqTokens() << " unique tokens\n";
  cout << "Target vocabulary list has " << fTrainVcbList.uniqTokens() << " unique tokens\n";

  vcbList eTestVcbList(eTrainVcbList);
  vcbList fTestVcbList(fTrainVcbList);

  corpus = new sentenceHandler(CorpusFilename.c_str(), &eTrainVcbList, &fTrainVcbList);

  if (TestCorpusFilename == "NONE")
    TestCorpusFilename = "";
  if (TestCorpusFilename != "") {
    cout << "Test corpus will be read from: " << TestCorpusFilename << '\n';
    testCorpus = new sentenceHandler(TestCorpusFilename.c_str(), &eTestVcbList, &fTestVcbList);
    cout << " Test total # sentence pairs : " << (*testCorpus).getTotalNoPairs1()
         << " weighted: " << (*testCorpus).getTotalNoPairs2() << '\n';
    cout << "Size of the source portion of test corpus: " << eTestVcbList.totalVocab() << " tokens\n";
    cout << "Size of the target portion of test corpus: " << fTestVcbList.totalVocab() << " tokens\n";
    cout << "In source portion of the test corpus, only " << eTestVcbList.uniqTokensInCorpus() << " unique tokens appeared\n";
    cout << "In target portion of the test corpus, only " << fTestVcbList.uniqTokensInCorpus() << " unique tokens appeared\n";
    cout << "ratio (target/source) : " << double(fTestVcbList.totalVocab()) / eTestVcbList.totalVocab() << '\n';
  }

  cout << " Train total # sentence pairs (weighted): " << corpus->getTotalNoPairs2() << '\n';
  cout << "Size of source portion of the training corpus: " << eTrainVcbList.totalVocab() - corpus->getTotalNoPairs2() << " tokens\n";
  cout << "Size of the target portion of the training corpus: " << fTrainVcbList.totalVocab() << " tokens\n";
  cout << "In source portion of the training corpus, only " << eTrainVcbList.uniqTokensInCorpus() << " unique tokens appeared\n";
  cout << "In target portion of the training corpus, only " << fTrainVcbList.uniqTokensInCorpus() << " unique tokens appeared\n";
  cout << "lambda for PP calculation in IBM-1,IBM-2,HMM: "
       << double(fTrainVcbList.totalVocab()) << "/("
       << eTrainVcbList.totalVocab() << "-" << corpus->getTotalNoPairs2() << ")";
  LAMBDA = double(fTrainVcbList.totalVocab()) / (eTrainVcbList.totalVocab() - corpus->getTotalNoPairs2());
  cout << " = " << LAMBDA << '\n';

  // load dictionary
  Dictionary *dictionary;
  if (useDict)
    dictionary = new Dictionary(dictionary_Filename.c_str());
  else
    dictionary = new Dictionary("");

  int minIter = 0;
#ifdef BINARY_SEARCH_FOR_TTABLE
  if (CoocurrenceFile.length() == 0) {
    cerr << "ERROR: NO COOCURRENCE FILE GIVEN!\n";
    abort();
  }
  tmodel<COUNT,PROB> tTable(CoocurrenceFile);
#else
  tmodel<COUNT,PROB> tTable;
#endif

  model1 m1(CorpusFilename.c_str(), eTrainVcbList, fTrainVcbList, tTable,
            trainPerp, *corpus, &testPerp, testCorpus,
            trainViterbiPerp, &testViterbiPerp);
  amodel<PROB> aTable(false);
  amodel<COUNT> aCountTable(false);
  model2 m2(m1, aTable, aCountTable);
  hmm h(m2);
  model3 m3(m2);

  if (ReadTablePrefix.length()) {
    // read previously trained model tables instead of training
    string number = "final";
    string tfile, afile, nfile, dfile, d4file, p0file;  // d5file
    tfile  = ReadTablePrefix + ".t3."   + number;
    afile  = ReadTablePrefix + ".a3."   + number;
    nfile  = ReadTablePrefix + ".n3."   + number;
    dfile  = ReadTablePrefix + ".d3."   + number;
    d4file = ReadTablePrefix + ".d4."   + number;
    // d5file = ReadTablePrefix + ".d5." + number;
    p0file = ReadTablePrefix + ".p0_3." + number;
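    // The six files above follow GIZA++'s dump naming scheme
    // (<prefix>.<table><model>.<iteration>); "final" selects the tables
    // written after the last iteration.  Illustrative contents, assuming the
    // tables were produced by a previous run:
    //
    //   <prefix>.t3.final    lexical translation probabilities t(f|e)
    //   <prefix>.a3.final    alignment probabilities a(i|j,l,m)
    //   <prefix>.n3.final    fertility probabilities n(phi|e)
    //   <prefix>.d3.final    IBM-3 distortion probabilities d(j|i,l,m)
    //   <prefix>.d4.final    IBM-4 distortion tables
    //   <prefix>.p0_3.final  the empty-word probability p0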
    tTable.readProbTable(tfile.c_str());
    aTable.readTable(afile.c_str());
    m3.dTable.readTable(dfile.c_str());
    m3.nTable.readNTable(nfile.c_str());
    sentPair sent;
    double p0;
    ifstream p0f(p0file.c_str());
    p0f >> p0;
    d4model d4m(MAX_SENTENCE_LENGTH);
    d4m.makeWordClasses(m1.Elist, m1.Flist, SourceVocabFilename + ".classes", TargetVocabFilename + ".classes");
    d4m.readProbTable(d4file.c_str());
    // d5model d5m(d4m);
    // d5m.makeWordClasses(m1.Elist, m1.Flist, SourceVocabFilename + ".classes", TargetVocabFilename + ".classes");
    // d5m.readProbTable(d5file.c_str());
    makeSetCommand("model4smoothfactor", "0.0", getGlobalParSet(), 2);
    // makeSetCommand("model5smoothfactor", "0.0", getGlobalParSet(), 2);
    if (corpus || testCorpus) {
      sentenceHandler *x = corpus;
      if (x == 0)
        x = testCorpus;
      cout << "Text corpus exists.\n";
      x->rewind();
      while (x && x->getNextSentence(sent)) {
        Vector<WordIndex>& es = sent.eSent;
        Vector<WordIndex>& fs = sent.fSent;
        int l = es.size() - 1;
        int m = fs.size() - 1;
        transpair_model4 tm4(es, fs, m1.tTable, m2.aTable, m3.dTable, m3.nTable, 1-p0, p0, &d4m);
        alignment al(l, m);
        cout << "I use the alignment " << sent.sentenceNo - 1 << '\n';
        // convert(ReferenceAlignment[sent.sentenceNo-1], al);
        transpair_model3 tm3(es, fs, m1.tTable, m2.aTable, m3.dTable, m3.nTable, 1-p0, p0, 0);
        double p = tm3.prob_of_target_and_alignment_given_source(al, 1);
        cout << "Sentence " << sent.sentenceNo << " has IBM-3 prob " << p << '\n';
        p = tm4.prob_of_target_and_alignment_given_source(al, 3, 1);
        cout << "Sentence " << sent.sentenceNo << " has IBM-4 prob " << p << '\n';
        // transpair_model5 tm5(es, fs, m1.tTable, m2.aTable, m3.dTable, m3.nTable, 1-p0, p0, &d5m);
        // p = tm5.prob_of_target_and_alignment_given_source(al, 3, 1);
        // cout << "Sentence " << sent.sentenceNo << " has IBM-5 prob " << p << '\n';
      }
    } else {
      cout << "No corpus exists.\n";
    }
  } else {
    // normal training: Model 1 -> (Model 2 / HMM) -> transfer -> Models 3-6
    bool seedModel1 = false;
    if (Model1_Iterations > 0) {
      if (t_Filename != "NONE" && t_Filename != "") {
        seedModel1 = true;
        m1.load_table(t_Filename.c_str());
      }
      minIter = m1.em_with_tricks(Model1_Iterations, seedModel1, *dictionary, useDict);
      errors = m1.errorsAL();
    }
    {
      if (Model2_Iterations > 0) {
        m2.initialize_table_uniformly(*corpus);
        minIter = m2.em_with_tricks(Model2_Iterations);
        errors = m2.errorsAL();
      }
      if (HMM_Iterations > 0) {
        cout << "NOTE: I am doing iterations with the HMM model!\n";
        h.makeWordClasses(m1.Elist, m1.Flist, SourceVocabFilename + ".classes", TargetVocabFilename + ".classes");
        h.initialize_table_uniformly(*corpus);
        minIter = h.em_with_tricks(HMM_Iterations);
        errors = h.errorsAL();
      }
      if (Transfer2to3 || HMM_Iterations == 0) {
        if (HMM_Iterations > 0)
          cout << "WARNING: transfer is not needed, as results are overwritten by transfer from HMM.\n";
        string test_alignfile = Prefix + ".tst.A2to3";
        if (testCorpus)
          m2.em_loop(testPerp, *testCorpus, Transfer_Dump_Freq == 1 && !NODUMPS, test_alignfile.c_str(), testViterbiPerp, true);
        if (testCorpus)
          cout << "\nTransfer: TEST CROSS-ENTROPY " << testPerp.cross_entropy()
               << " PERPLEXITY " << testPerp.perplexity() << "\n\n";
        if (Transfer == TRANSFER_SIMPLE)
          m3.transferSimple(*corpus, Transfer_Dump_Freq == 1 && !NODUMPS, trainPerp, trainViterbiPerp);
        else
          m3.transfer(*corpus, Transfer_Dump_Freq == 1 && !NODUMPS, trainPerp, trainViterbiPerp);
        errors = m3.errorsAL();
      }
      if (HMM_Iterations > 0)
        m3.setHMM(&h);
      if (Model3_Iterations > 0 || Model4_Iterations > 0 || Model5_Iterations || Model6_Iterations) {
        minIter = m3.viterbi(Model3_Iterations, Model4_Iterations, Model5_Iterations, Model6_Iterations);
        errors = m3.errorsAL();
      }
      if (FEWDUMPS || !NODUMPS) {
        printAllTables(eTrainVcbList, eTestVcbList, fTrainVcbList, fTestVcbList, m1);
      }
    }
  }
  result = minIter;
  return errors;
}
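// Training pipeline as implemented in StartTraining() above: Model 1 EM
// first (optionally seeded from an existing t-table when t_Filename is set);
// then Model 2 and/or the HMM alignment model; then a transfer step that
// initializes the fertility models from the best Model-2/HMM alignments;
// finally m3.viterbi() runs the approximate EM iterations for Models 3-6.
// Each stage records its alignment error rate via errorsAL().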
int main(int argc, char* argv[])
{
#ifdef BINARY_SEARCH_FOR_TTABLE
  getGlobalParSet().insert(new Parameter<string>("CoocurrenceFile",ParameterChangedFlag,"",CoocurrenceFile,PARLEV_SPECIAL));
#endif
  getGlobalParSet().insert(new Parameter<string>("ReadTablePrefix",ParameterChangedFlag,"optimized",ReadTablePrefix,-1));
  getGlobalParSet().insert(new Parameter<string>("S",ParameterChangedFlag,"source vocabulary file name",SourceVocabFilename,PARLEV_INPUT));
  getGlobalParSet().insert(new Parameter<string>("SOURCE VOCABULARY FILE",ParameterChangedFlag,"source vocabulary file name",SourceVocabFilename,-1));
  getGlobalParSet().insert(new Parameter<string>("T",ParameterChangedFlag,"target vocabulary file name",TargetVocabFilename,PARLEV_INPUT));
  getGlobalParSet().insert(new Parameter<string>("TARGET VOCABULARY FILE",ParameterChangedFlag,"target vocabulary file name",TargetVocabFilename,-1));
  getGlobalParSet().insert(new Parameter<string>("C",ParameterChangedFlag,"training corpus file name",CorpusFilename,PARLEV_INPUT));
  getGlobalParSet().insert(new Parameter<string>("CORPUS FILE",ParameterChangedFlag,"training corpus file name",CorpusFilename,-1));
  getGlobalParSet().insert(new Parameter<string>("TC",ParameterChangedFlag,"test corpus file name",TestCorpusFilename,PARLEV_INPUT));
  getGlobalParSet().insert(new Parameter<string>("TEST CORPUS FILE",ParameterChangedFlag,"test corpus file name",TestCorpusFilename,-1));
  getGlobalParSet().insert(new Parameter<string>("d",ParameterChangedFlag,"dictionary file name",dictionary_Filename,PARLEV_INPUT));
  getGlobalParSet().insert(new Parameter<string>("DICTIONARY",ParameterChangedFlag,"dictionary file name",dictionary_Filename,-1));
  getGlobalParSet().insert(new Parameter<string>("l",ParameterChangedFlag,"log file name",LogFilename,PARLEV_OUTPUT));
  getGlobalParSet().insert(new Parameter<string>("LOG FILE",ParameterChangedFlag,"log file name",LogFilename,-1));
  getGlobalParSet().insert(new Parameter<string>("o",ParameterChangedFlag,"output file prefix",Prefix,PARLEV_OUTPUT));
  getGlobalParSet().insert(new Parameter<string>("OUTPUT FILE PREFIX",ParameterChangedFlag,"output file prefix",Prefix,-1));
  getGlobalParSet().insert(new Parameter<string>("OUTPUT PATH",ParameterChangedFlag,"output path",OPath,PARLEV_OUTPUT));

  time_t st1, fn;
  st1 = time(NULL);  // starting time

  string temp(argv[0]);
  Usage = temp + " [options]\n";
  if (argc < 2) {
    printHelp();
    exit(1);
  }
  initGlobals();
  parseArguments(argc, argv);
  if (Log)
    logmsg.open(LogFilename.c_str(), ios::out);
  printGIZAPars(cout);

  int a = -1;
  double errors = 0.0;
  if (OldADBACKOFF != 0)
    cerr << "WARNING: Parameter -adBackOff no longer exists; use CompactADTable instead.\n";
  if (MAX_SENTENCE_LENGTH > MAX_SENTENCE_LENGTH_ALLOWED)
    cerr << "ERROR: MAX_SENTENCE_LENGTH is too big " << MAX_SENTENCE_LENGTH << " > " << MAX_SENTENCE_LENGTH_ALLOWED << '\n';
  errors = StartTraining(a);

  fn = time(NULL);  // finish time
  cout << '\n' << "Entire Training took: " << difftime(fn, st1) << " seconds\n";
  cout << "Program Finished at: " << ctime(&fn) << '\n';
  cout << "==========================================================\n";
  return 0;
}
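// Input format note (standard GIZA++ conventions; the contents below are
// illustrative).  The -S/-T vocabulary files map integer word IDs to
// surface forms and frequencies, one entry per line:
//
//   1 the 17035
//   2 house 1297
//
// and the -C corpus .snt file stores each sentence pair as three lines:
// an occurrence count (weight), the source sentence as word IDs, and the
// target sentence as word IDs:
//
//   1
//   1 2
//   9 12 41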