/*

EGYPT Toolkit for Statistical Machine Translation
Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight,
John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith,
and David Yarowsky.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.

*/
#include <iostream>
#include <fstream>
#include <set>

#include "defs.h"
#include "vocab.h"
#include "Perplexity.h"
#include "getSentence.h"
#include "TTables.h"
#include "Globals.h"
#include "Parameter.h"

void printHelp(void)
{
  cerr << "Usage:\n\n" << Usage << '\n';
  cerr << "Options (these override parameters set in the config file):\n\n";
  cerr << "\t--v\t\tPrint verbose messages (warning: these are neither very descriptive nor systematic).\n";
  cerr << "\t--NODUMPS\tDo not write any files to disk (this overrides the dump frequency options).\n";
  cerr << "\t--h[elp]\tPrint this help.\n";
  cerr << "\t--p\t\tUse pegging when generating alignments for Model3 training (default: no pegging).\n";
  cerr << "\t--st\t\tUse a fixed distribution for the fertility parameters when transferring from Model 2 to Model 3 (default: complicated estimation).\n";
  printGIZAPars(cout);
}

void generatePerplexityReport(const Perplexity& trainperp,
                              const Perplexity& testperp,
                              const Perplexity& trainVperp,
                              const Perplexity& testVperp,
                              ostream& of, int trainsize, int testsize,
                              bool)
{
  unsigned int i, m;
  unsigned int m1 = max(trainperp.size(), testperp.size());
  unsigned int m2 = max(trainVperp.size(), testVperp.size());
  m = max(m1, m2);
  // One row per iteration: corpus sizes, iteration number, model id, then
  // training, test, training-Viterbi, and test-Viterbi perplexity ("N/A"
  // where no value was recorded for that iteration).
  of << "#trnsz\ttstsz\titer\tmodel\ttrn-pp\t\ttest-pp\t\ttrn-vit-pp\t\ttst-vit-pp\n";
  for (i = 0; i < m; i++) {
    of << trainsize << '\t' << testsize << '\t' << i << '\t'
       << trainperp.get_modelid(i) << '\t';
    if (i < trainperp.size())
      of << trainperp.perplexity(i) << '\t';
    else
      of << "N/A\t\t";
    if (i < testperp.size())
      of << testperp.perplexity(i) << '\t';
    else
      of << "N/A\t\t";
    if (i < trainVperp.size())
      of << trainVperp.perplexity(i) << '\t';
    else
      of << "N/A\t";
    if (i < testVperp.size())
      of << testVperp.perplexity(i) << '\n';
    else
      of << "N/A\n";
  }
}

void printSentencePair(Vector<WordIndex>& es,
                       Vector<WordIndex>& fs,
                       ostream& of)
  // Writes a sentence pair to the given output stream, one sentence per
  // line.  It writes token ids, not the actual tokens.
{
  WordIndex i, j, l, m;
  l = es.size() - 1;
  m = fs.size() - 1;
  of << "Source sentence length : " << l << " , target : " << m << "\n";
  for (i = 1; i <= l; i++)
    of << es[i] << ' ';
  of << "\n";
  for (j = 1; j <= m; j++)
    of << fs[j] << ' ';
  of << "\n";
}

extern short CompactAlignmentFormat;

void printAlignToFile(const Vector<WordIndex>& es,
                      const Vector<WordIndex>& fs,
                      const Vector<WordEntry>& evlist,
                      const Vector<WordEntry>& fvlist,
                      ostream& of2,
                      const Vector<WordIndex>& viterbi_alignment,
                      int pair_no, double alignment_score)
  // Prints the given alignment to the alignments file (given its output
  // stream) in a format recognizable by the draw-alignment tool; each line
  // triple is one sentence pair:
  //   # sentence caption
  //   target_word_1 target_word_2 ..... target_word_m
  //   source_word_1 ({ x y z }) source_word_2 ({ }) .. source_word_n ({ w })
  // where x, y, z, and w are positions of the target words that each source
  // word is connected to.
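  // For example, a hypothetical two-word pair ("the house" / "das Haus",
  // aligned 1-1 and 2-2; the score below is made up, and position 0 of the
  // source sentence is the NULL word) would be written as:
  //   # Sentence pair (1) source length 2 target length 2 alignment score : 0.000271
  //   das Haus
  //   NULL ({ }) the ({ 1 }) house ({ 2 })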
{
  WordIndex l, m;
  Vector<Vector<WordIndex> > translations(es.size()); // each source (English) word
                                                      // has a vector of zero or
                                                      // more translations.
  l = es.size() - 1;
  m = fs.size() - 1;
  if (CompactAlignmentFormat) {
    // Compact format: one zero-based "source target" index pair per aligned
    // target word; words aligned to NULL (position 0) are skipped.
    for (WordIndex j = 1; j <= m; j++)
      if (viterbi_alignment[j])
        of2 << viterbi_alignment[j] - 1 << ' ' << j - 1 << ' ';
    of2 << '\n';
  }
  else {
    of2 << "# Sentence pair (" << pair_no << ") source length " << l
        << " target length " << m << " alignment score : "
        << alignment_score << '\n';
    for (WordIndex j = 1; j <= m; j++) {
      of2 << fvlist[fs[j]].word << " ";
      translations[viterbi_alignment[j]].push_back(j);
    }
    of2 << '\n';
    for (WordIndex i = 0; i <= l; i++) {
      of2 << evlist[es[i]].word << " ({ ";
      for (WordIndex j = 0; j < translations[i].size(); j++)
        of2 << translations[i][j] << " ";
      of2 << "}) ";
    }
    of2 << '\n';
  }
}

void printOverlapReport(const tmodel<COUNT, PROB>& tTable,
                        sentenceHandler& testHandler, vcbList& trainEList,
                        vcbList& trainFList, vcbList& testEList,
                        vcbList& testFList)
{
  set<pair<WordIndex, WordIndex> > testCoocur;
  sentPair s;
  /*
  string unseenCoocurFile = Prefix + ".tst.unseen.cooc";
  ofstream of_unseenCoocur(unseenCoocurFile.c_str());

  string seenCoocurFile = Prefix + ".tst.seen.cooc";
  ofstream of_seenCoocur(seenCoocurFile.c_str());
  */
  testHandler.rewind();
  int seen_coocur = 0, unseen_coocur = 0, srcUnk = 0, trgUnk = 0;
  // Collect every distinct source/target word-id pair that cooccurs in some
  // test sentence pair (position 0, the NULL word, is skipped).
  while (testHandler.getNextSentence(s)) {
    for (WordIndex i = 1; i < s.eSent.size(); i++)
      for (WordIndex j = 1; j < s.fSent.size(); j++)
        testCoocur.insert(pair<WordIndex, WordIndex>(s.eSent[i], s.fSent[j]));
  }
  // A pair whose translation probability exceeds the smoothing floor
  // PROB_SMOOTH was actually seen during training.
  set<pair<WordIndex, WordIndex> >::const_iterator i;
  for (i = testCoocur.begin(); i != testCoocur.end(); ++i) {
    if (tTable.getProb((*i).first, (*i).second) > PROB_SMOOTH) {
      seen_coocur++;
      // of_seenCoocur << (*i).first << ' ' << (*i).second << '\n';
    }
    else {
      unseen_coocur++;
      // of_unseenCoocur << (*i).first << ' ' << (*i).second << '\n';
    }
  }

  // Target-side test vocabulary entries that never occur in the training
  // corpus.
  string trgUnkFile = Prefix + ".tst.trg.unk";
  ofstream of_trgUnk(trgUnkFile.c_str());
  for (WordIndex i = 0; i < testFList.getVocabList().size() && i < testFList.uniqTokens(); i++)
    if (testFList.getVocabList()[i].freq > 0 && trainFList.getVocabList()[i].freq <= 0) {
      of_trgUnk << i << ' ' << testFList.getVocabList()[i].word << ' '
                << testFList.getVocabList()[i].freq << '\n';
      trgUnk++;
    }

  // Source-side test vocabulary entries that never occur in the training
  // corpus.
  string srcUnkFile = Prefix + ".tst.src.unk";
  ofstream of_srcUnk(srcUnkFile.c_str());
  for (WordIndex j = 0; j < testEList.getVocabList().size() && j < testEList.uniqTokens(); j++)
    if (testEList.getVocabList()[j].freq > 0 && trainEList.getVocabList()[j].freq <= 0) {
      srcUnk++;
      of_srcUnk << j << ' ' << testEList.getVocabList()[j].word << ' '
                << testEList.getVocabList()[j].freq << '\n';
    }

  string summaryFile = Prefix + ".tst.stats";
  ofstream of_summary(summaryFile.c_str());
  of_summary << "\t\t STATISTICS ABOUT TEST CORPUS\n\n";
  of_summary << "source unique tokens: " << testEList.uniqTokens() << '\n';
  of_summary << "target unique tokens: " << testFList.uniqTokens() << '\n';
  of_summary << "unique unseen source tokens: " << srcUnk << '\n';
  of_summary << "unique unseen target tokens: " << trgUnk << '\n';
  of_summary << "cooccurrences not found in the final t table: " << unseen_coocur << '\n';
  of_summary << "cooccurrences found in the final t table: " << seen_coocur << '\n';
}
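
/*
  For reference, the <Prefix>.tst.stats summary written above looks like this
  (the counts here are hypothetical; only the layout is taken from the code):

		 STATISTICS ABOUT TEST CORPUS

  source unique tokens: 4212
  target unique tokens: 4738
  unique unseen source tokens: 317
  unique unseen target tokens: 352
  cooccurrences not found in the final t table: 48710
  cooccurrences found in the final t table: 112904
*/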