// Overview (written from the text visible in this file only).
//
// Purpose: out-of-class definitions of the tmodel member functions
// printCountTable, printProbTable, printProbTableInverse, normalizeTable and
// readProbTable, together with two global parameters declared through the
// GLOBAL_PARAMETER macros: PROB_CUTOFF (1e-7) and COUNTINCREASE_CUTOFF (1e-6).
//
// Two alternative sets of definitions are selected by the preprocessor symbol
// BINARY_SEARCH_FOR_TTABLE:
//   - symbol defined: a variant that walks `lexmat`; in this variant
//     printCountTable, printProbTableInverse and readProbTable have empty
//     bodies.
//   - symbol not defined: a variant that iterates the `ef` hash_map, whose
//     keys are (first, second) id pairs and whose values carry `count` and
//     `prob` members.
//
// What the `ef`-based variant does, as far as the visible statements show:
//   - printCountTable: for each entry whose count exceeds
//     COUNTINCREASE_CUTOFF, writes "count e f prob" to `filename`; when
//     `actual` is true the evlist/fvlist `.word` values are written instead of
//     the numeric ids. (The older inline comment lists only three columns, but
//     the stream statement also writes `prob`.)
//   - printProbTable: writes "e f prob" for every entry, with the same
//     `actual` switch.
//   - printProbTableInverse: first accumulates total[f] += evlist[e].freq *
//     prob over all entries, then writes "f e p_inv" with
//     p_inv = prob * evlist[e].freq / total[f]. Values outside [0, 1.0001]
//     are counted in `no_errors` and reported on cerr.
//   - normalizeTable: sums counts per first-id into `total`, then for each
//     entry computes p = count / total[first] (0 when the total is not
//     positive). Entries with p <= PROB_CUTOFF are removed through erase().
//     For kept entries: when iter > 0, p is stored in `count`, `prob` is
//     zeroed, and the function calls itself with iter-1; when iter == 0, p is
//     stored in `prob` and `count` is zeroed.
//   - readProbTable: reads whitespace-separated "src_id trg_id prob" triples
//     and calls insert(src_id, trg_id, 0.0, prob) for each; if the file cannot
//     be opened it prints an error on cerr and returns without inserting.
//
// NOTE(review): this copy of the file looks damaged. The original line breaks
// are gone and several angle-bracketed spans appear to be missing (for example
// "template void", "const Vector&", "hash_map >::const_iterator" and the
// truncated loop header "for(unsigned int i=0;isize();++j)"). As a result the
// "//" comments below now extend over code that presumably used to sit on
// following lines, and at least one block comment is no longer terminated
// where it presumably was. Presumably an extraction artifact - TODO: restore
// from a pristine copy before making any code change; the statements below
// are left exactly as found.
/* EGYPT Toolkit for Statistical Machine Translation Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "TTables.h" #include "Parameter.h" GLOBAL_PARAMETER(float,PROB_CUTOFF,"PROB CUTOFF","Probability cutoff threshold for lexicon probabilities",PARLEV_OPTHEUR,1e-7); GLOBAL_PARAMETER2(float, COUNTINCREASE_CUTOFF,"COUNTINCREASE CUTOFF","countCutoff","Counts increment cutoff threshold",PARLEV_OPTHEUR,1e-6); #ifdef BINARY_SEARCH_FOR_TTABLE template void tmodel::printCountTable(const char *, const Vector&, const Vector&, const bool) const { } void tmodel::printProbTable(const char *filename, const Vector& evlist, const Vector& fvlist, const bool actual) const { ofstream of(filename); /* for(unsigned int i=0;isize();++j) { const CPPair&x=(*lexmat[i])[j].second; WordIndex e=i,f=(*lexmat[i])[j].first; if( x.prob>PROB_SMOOTH ) if( actual ) of << evlist[e].word << ' ' << fvlist[f].word << ' ' << x.prob << '\n'; else of << e << ' ' << f << ' ' << x.prob << '\n'; } } } template void tmodel::printProbTableInverse(const char *, const Vector&, const Vector&, const double, const double, const bool ) const { } template void tmodel::normalizeTable(const vcbList&, const vcbList&, int) { for(unsigned int 
i=0;isize(); for(unsigned int j=0;j void tmodel::readProbTable(const char *){ } template class tmodel ; #else /* ------------------ Method Definiotns for Class tmodel --------------------*/ # template void tmodel::printCountTable(const char *filename, const Vector& evlist, const Vector& fvlist, const bool actual) const // this function dumps the t table. Each line is of the following format: // // c(target_word/source_word) source_word target_word { ofstream of(filename); typename hash_map >::const_iterator i; for(i = ef.begin(); i != ef.end();++i){ if ( ((*i).second).count > COUNTINCREASE_CUTOFF) if (actual) of << ((*i).second).count << ' ' << evlist[ ((*i).first).first ].word << ' ' << fvlist[((*i).first).second].word << ' ' << (*i).second.prob << '\n'; else of << ((*i).second).count << ' ' << ((*i).first).first << ' ' << ((*i).first).second << ' ' << (*i).second.prob << '\n'; } } void tmodel::printProbTable(const char *filename, const Vector& evlist, const Vector& fvlist, const bool actual) const // this function dumps the t table. Each line is of the following format: // // source_word target_word p(target_word/source_word) { ofstream of(filename); hash_map >::const_iterator i; for(i = ef.begin(); i != ef.end();++i) if( actual ) of << evlist[((*i).first).first].word << ' ' << fvlist[((*i).first).second].word << ' ' << (*i).second.prob << '\n'; else of << ((*i).first).first << ' ' << ((*i).first).second << ' ' << (*i).second.prob << '\n'; } template void tmodel::printProbTableInverse(const char *filename, const Vector& evlist, const Vector& fvlist, const double, const double, const bool actual) const // this function dumps the inverse t table. 
Each line is of the format: // // target_word_id source_word_id p(source_word/target_word) // // if flag "actual " is true then print actual word entries instead of // token ids { cerr << "Dumping the t table inverse to file: " << filename << '\n'; ofstream of(filename); typename hash_map >::const_iterator i; PROB p_inv = 0 ; // static const PROB ratio(double(fTotal)/eTotal); WordIndex e, f ; int no_errors(0); vector total(fvlist.size(),PROB(0)) ; // Sum over all e of P(f/e) * p(e) - needed for normalization for(i = ef.begin(); i != ef.end(); i++){ e = ((*i).first).first ; f = ((*i).first).second ; total[f] += (PROB) evlist[e].freq * ((*i).second.prob); //add P(f/ei) * F(ei) } for(i = ef.begin(); i != ef.end(); i++){ e = ((*i).first).first ; f = ((*i).first).second ; p_inv = ((*i).second.prob) * (PROB) evlist[e].freq / total[f] ; if (p_inv > 1.0001 || p_inv < 0){ no_errors++; if (no_errors <= 10){ cerr << "printProbTableInverse(): Error - P("< >::const_iterator i; PROB p_inv = 0 ; static const PROB ratio(double(fTotal)/eTotal); WordIndex e, f ; for(i = ef.begin(); i != ef.end(); i++){ e = ((*i).first).first ; f = ((*i).first).second ; p_inv = ((*i).second.prob) * ratio * (PROB) evlist[e].freq / (PROB) fvlist[f].freq ; if (actual) of << fvlist[f].word << ' ' << evlist[e].word << ' ' << p_inv << '\n'; else of << f << ' ' << e << ' ' << p_inv << '\n'; } } */ template void tmodel::normalizeTable(const vcbList&engl, const vcbList&french, int iter) // normalize conditional probability P(fj/ei): // i.e. make sure that Sum over all j of P(fj/e) = 1 // this method reads the counts portion of the table and normalize into // the probability portion. Then the counts are cleared (i.e. zeroed) // if the resulting probability of an entry is below a threshold, then // remove it . 
{ if( iter==2 ) { total2.resize(engl.uniqTokens());for(unsigned int i=0;i total(engl.uniqTokens(),0.0); //Vector nFrench(engl.uniqTokens(), 0); //Vector nEng(french.uniqTokens(), 0); typename hash_map >::const_iterator i; for(i = ef.begin(); i != ef.end(); i++){ // for all possible source words e if( iter==2 ) total2[((*i).first).first] += (*i).second.count; total[((*i).first).first] += (*i).second.count; nFrench[((*i).first).first]++; nEng[((*i).first).second]++; } for(unsigned int k=0;k >::iterator j, k; PROB p ; int nParams=0; for(j = ef.begin(); j != ef.end(); ){ k = j; k++ ; if( (total[((*j).first).first])>0.0 ) p = ((((*j).second).count) /(total[((*j).first).first])) ; else p= 0.0; if (p > PROB_CUTOFF) { if( iter>0 ) { ((*j).second).prob = 0 ; ((*j).second).count = p ; } else { ((*j).second).prob = p ; ((*j).second).count = 0 ; } nParams++; } else { erase(((*j).first).first, ((*j).first).second); } j = k ; } if( iter>0 ) return normalizeTable(engl, french, iter-1); else { } } template void tmodel::readProbTable(const char *filename){ /* This function reads the t table from a file. Each line is of the format: source_word_id target_word_id p(target_word|source_word) This is the inverse operation of the printTable function. NAS, 7/11/99 */ ifstream inf(filename); cerr << "Reading t prob. table from " << filename << "\n"; if(!inf){ cerr << "\nERROR: Cannot open " << filename << "\n"; return; } WordIndex src_id, trg_id; PROB prob; int nEntry=0; while( inf >> src_id >> trg_id >> prob){ insert(src_id, trg_id, 0.0, prob); nEntry++; } cerr << "Read " << nEntry << " entries in prob. table.\n"; } template class tmodel ; /* ---------------- End of Method Definitions of class tmodel ---------------*/ #endif