// NOTE(review): this copy looks extraction-damaged. Every span that sat between
// a less-than sign and a later greater-than sign is missing (include targets,
// template arguments, and the code between a loop condition and the next
// closing angle bracket), and the original line breaks are gone. The code text
// is left exactly as found; the notes here and further down describe only what
// is still readable. Restore the file from a pristine copy before building.
//
// Readable pieces, in file order:
//  * strdup (only when NeXT is defined): returns a malloc-ed copy of its
//    argument, made with strlen and strcpy.
//  * writeClasses(katOfWord, problem, to): only the signature survives; its
//    body and the header of the next routine are lost.
//  * Category-file reader (header lost): opens fname and exits with status 1 if
//    that fails. It sets p->initLike entries to -1 (loop bound partly lost),
//    then reads lines with getline and splits each with mysplit into a word and
//    a category label. Labels 1 and 0 are pre-mapped to 1 and 0. When a word
//    found by binary_locate carries a label whose translation is still -1, the
//    label gets the next number from maxCat, which starts at 2. A word that
//    already has a category, or that is not found, only produces a warning.
//    The routine exits with status 1 if maxCat exceeds p->katFreq.nKats, puts
//    the word $ (if present) into category 0, and returns 1 if any word still
//    has initLike equal to -1, otherwise 0.
//  * makeKategProblem(cTbl, setVokabular, maxClass, ...): copies the vocabulary
//    set into a heap-allocated array sVok, creates a new KategProblem of that
//    size, points k->words at sVok, and walks the bigram table once, counting
//    entries per first word (after) and per second word (before). Entries with
//    an empty second word, or with $ on both sides, are skipped. The rest of
//    the body, including the return statement, is lost.
//  * N-gram file reader (header lost): reads triples (count, word, word) from
//    file. It prints a message and returns 0 when a word is empty or the count
//    is 0; otherwise it stores the count in cTbl and adds both words to the
//    vocabulary. After the loop it delegates to makeKategProblem.
//  * fromKModel(str, ...): opens the text file str (returns 0 if that fails),
//    reads it line by line, extracts whitespace-separated tokens from each
//    line, renames a literal $ token to mkcls-mapped-dollar-symbol-$, and
//    counts adjacent token pairs in cTbl. The token $ is used as the
//    predecessor of the first token and is appended after every line. It then
//    delegates to makeKategProblem.
//  * KategProblemSetParameters(p): picks one of two fixed parameter sets for
//    TAOptimization, RRTOptimization and GDAOptimization, depending on whether
//    p.katwahl() equals K_BEST.
//  * makRandom(...): allocates a KategProblem with new and is declared to
//    return a reference; most of the body is lost.
//  * _izrOptimization(...) and the routine after it (header lost): see the
//    notes placed next to them further down.
/* Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och mkcls - a program for making word classes . This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "KategProblemTest.h" #include "ProblemTest.h" #include "HCOptimization.h" #include "TAOptimization.h" #include "RRTOptimization.h" #include "GDAOptimization.h" #include #include #include #include typedef pair PSS; #define NEW_SENTENCE_END "mkcls-mapped-dollar-symbol-$" #ifdef NeXT char *strdup(char *a) { char *p = (char *)malloc(strlen(a)+1); strcpy(p,a); return p; } #endif void writeClasses(Array &katOfWord,KategProblem &problem,ostream &to) { for(int i=0;i translation(-1); int maxCat=2; ifstream in(fname); if(!in) { cerr << "Error: File '" << fname << "' cannot be opened.\n"; exit(1); } for(int i=0;iwordFreq.nWords;i++) (p->initLike)[i]= -1; translation["1"]=1; translation["0"]=0; string s; while( getline(in,s) ) { string str,categ; mysplit(s,str,categ); int i=p->words->binary_locate(str); if(i>=0 && (*(p->words))[i]==str ) { if( translation[categ]==-1 ) translation[categ]=maxCat++; int cat=translation[categ]; if( (p->initLike)[i]!= -1 ) cerr << "Warning: Word '" << ((*(p->words))[i])<< "' is already in a category.\n"; (p->initLike)[i]=cat; } else cerr << "Warning: Word '" << str << "' " << i << " is not in training corpus.\n"; } if( verboseMode ) cout << "We have " << maxCat << " read 
non-empty categories" " (with words from the corpus).\n"; if(maxCat>p->katFreq.nKats) { cerr << "Error: Not enough categories reserved (only " << p->katFreq.nKats << ", but i need " << maxCat << ").\n"; exit(1); } int i=p->words->binary_locate("$"); if( i>=0 && (*(p->words))[i]=="$" ) (p->initLike)[i]=0; else if( verboseMode ) cerr << "Warning: No '$' in vocabulary!\n"; int errors=0; for(i=0;iwordFreq.nWords;i++) if((p->initLike)[i]== -1 ) { if( verb ) cerr << "Error: I don't know the category of word " << i << " (" << (*(p->words))[i] << ") " << ".\n"; errors=1; } return errors; } KategProblem *makeKategProblem(const leda_h_array&cTbl,const leda_set&setVokabular, int maxClass,int initialisierung, int auswertung,int nachbarschaft,int minWordFrequency) { int nwrd=0; leda_array&sVok = *new leda_array(setVokabular.size()); string s; unsigned int ctr=0; forall_set(leda_set,s,setVokabular) { if( verboseMode>2 ) cout << "mkcls:Wort " << ctr << " " << s << endl; sVok[ctr++]=s; } for(unsigned int z=0;z2 ) cout << "*****Vocabulary: " << sVok; unsigned int vokSize=sVok.size(); massert(vokSize==ctr); massert(vokSize==setVokabular.size()); if(verboseMode) {cout << "Size of vocabulary: " << vokSize << "\n";cout.flush();} KategProblem *k = new KategProblem(vokSize,maxClass,initialisierung, auswertung,nachbarschaft,minWordFrequency); KategProblemWBC &w=k->wordFreq; k->words=&sVok; Array after(vokSize,0); Array before(vokSize,0); nwrd=0; { PSS s; forall_defined_h2(PSS,FreqType,s,cTbl) { const string&ss1=s.first; const string&ss2=s.second; if( ss2.length()&&(ss1!="$" || ss2!="$") ) { int i1=sVok.binary_search(ss1); int i2=sVok.binary_search(ss2); iassert( sVok[i1] == ss1 );iassert( sVok[i2] == ss2 ); after[i1]++; before[i2]++; } if( verboseMode&&((nwrd++)%10000==0) ) {cout<<"Statistiken-1 " << nwrd<< ". 
\r";cout.flush();} } } for(unsigned int i=0;i2 ) cout << "BIGRAMM-HAEUF: " << ss1 << ":" << i1 << " " << ss2 << ":" << i2 << " " << p << endl; } if( verboseMode&&((nwrd++)%10000==0) ) {cout<<"Statistiken-2 " < setVokabular; leda_h_array cTbl; double c=0; if( verboseMode )cout << "NGRFILE: " << str << endl; string s1,s2; while(file >> c >> s1 >> s2) { if( s1.length()==0||s2.length()==0 ) { cerr << "ERROR: strings are zero: " << s1.length() <<" " << s1 <<" " << s2.length()<<" " << s2 << endl; return 0; } if( c==0 ) { cerr << "Count ist 0 " << s1 << " " << s2 << endl; return 0; } cTbl[pair(s1,s2)]=(FreqType)c; setVokabular.insert(s1); setVokabular.insert(s2); if( verboseMode>1 ) cout << "R: " << s1 << " " << s2 << " " << c << endl; c=0; } return makeKategProblem(cTbl,setVokabular,maxClass,initialisierung,auswertung,nachbarschaft,minWordFrequency); } KategProblem *fromKModel(const char *str,int maxClass,int initialisierung, int auswertung,int nachbarschaft,int minWordFrequency) { string oldText,text,line; ifstream f(str); if( !f ) { cerr << "ERROR: can not open file " << str << ".\n"; return 0; } leda_set setVokabular; leda_h_array cTbl(0); oldText="$"; while(1) { getline(f,line); if(f.fail() && !f.bad() && !f.eof()) { cerr << "WARNING: strange characters in stream (getline) " << endl;f.clear(); } if(!f)break; istrstream f2(line.c_str()); while( 1 ) { f2 >> text; if(f2.fail() && !f2.bad() && !f2.eof()) { cerr << "WARNING: strange characters in stream (>>) !\n"; f2.clear(ios::failbit); } if(!f2){break;} if( text == "$" ) text = "mkcls-mapped-dollar-symbol-$"; if( !setVokabular.member(text) )setVokabular.insert(text); cTbl[pair(oldText,text)]++; oldText=text; } text="$"; if( !setVokabular.member(text) )setVokabular.insert(text); cTbl[pair(oldText,text)]++; oldText=text; } return makeKategProblem(cTbl,setVokabular,maxClass,initialisierung,auswertung,nachbarschaft,minWordFrequency); } void KategProblemSetParameters(KategProblem &p) { if( p.katwahl()==K_BEST ) { 
// KategProblemSetParameters, K_BEST branch: annealing rates 0.7 (TAOptimization)
// and 0.95 (RRTOptimization), alpha 0.05 (GDAOptimization). The else branch
// uses 0.4, 0.6 and 0.0125. Both branches report the choice on cout when
// verboseMode is set.
// Further along this line: makRandom (body mostly lost) and the first half of
// _izrOptimization. That half asserts anzprob > 1 and mindestAnzahl <= 1 for
// probs[0], then repeats a do-while pass that assigns each word a merged index
// in newWords from the set durchschnitt (how that set is combined across
// problems is lost - presumably an intersection, confirm against a clean copy).
// useAnzprob is decremented while the number of merged indices stays at or
// above minimumNumberOfWords, which is max(1, int(nWords*0.95)).
TAOptimization::defaultAnnRate=0.7; RRTOptimization::defaultAnnRate=0.95; GDAOptimization::defaultAlpha=0.05; if( verboseMode ) cout << "Parameter-setting like W-DET-BEST\n"; } else { TAOptimization::defaultAnnRate=0.4; RRTOptimization::defaultAnnRate=0.6; GDAOptimization::defaultAlpha=0.0125; if( verboseMode ) cout << "Parameter-setting like W-DET-DET\n"; } } KategProblem &makRandom(int ANZ_WORD,int ANZ_CLS,int initValue, int auswertung,int nachbarschaft,float relInit) { KategProblem &k= *new KategProblem(ANZ_WORD,ANZ_CLS,initValue,auswertung,nachbarschaft); KategProblemWBC &w=k.wordFreq; Array after(ANZ_WORD,0); Array before(ANZ_WORD,0); Array twoD(ANZ_WORD); int i; for(i=0;i &_izrOptimization(Array &probs, int anzprob,double timeForOneRed,double maxClock,Array &katOfWord, int anzIter,int verfahren) { massert(anzprob>1); massert(probs[0]->wordFreq.mindestAnzahl<=1); KategProblem *p0=probs[0]; int nWords=p0->wordFreq.nWords; int nKats=p0->katFreq.nKats; int minimumNumberOfWords = max(1,int(nWords*0.95)); int indexOfDurchschnitt; Array newWords(nWords); int useAnzprob=anzprob; do { int w,k; indexOfDurchschnitt=0; for(w=0;wwordFreq.nWords==nWords); probs[k]->makeKats(); } for(w=0;w durchschnitt=(*p0->kats)[p0->katOfWord(w)]; for(k=1;kkats)[probs[k]->katOfWord(w)]; int _anzInDurchschnitt=0; int nr=0; forall_set(leda_set,nr,durchschnitt) { _anzInDurchschnitt++; newWords[nr]=indexOfDurchschnitt; } if( verboseMode && _anzInDurchschnitt>1 && anzIter==0 ) { cout << "- ("; forall_set(leda_set,nr,durchschnitt) { cout << p0->getString(nr); if( p0->wordFreq.n1(nr)==1 ) cout << "* "; else cout << " "; } cout << ")\n"; } for(k=0;kkats)[probs[k]->katOfWord(w)]; } indexOfDurchschnitt++; } } if(indexOfDurchschnitt>=minimumNumberOfWords) { if(useAnzprob==1) { cout << "useAnzProb==1 => mysterious.\n"; break; } useAnzprob--; } } while(indexOfDurchschnitt>=minimumNumberOfWords); Array &neu=*new Array(MAX_MULTIPLE*anzprob,(KategProblem *)0); 
// _izrOptimization, second half. The first useAnzprob entries of probs are
// sorted with compareProblem. A loop (header partly lost) then builds reduced
// problems whose word frequencies are added via newWords; each is initialised
// either from an existing problem (initLike, _initialize(5), then
// HCOptimization) or from scratch (_initialize(1), then solveProblem).
// The function returns neu when indexOfDurchschnitt <= nKats or when a nonzero
// maxClock has been passed; otherwise it recurses with anzIter+1.
// NOTE(review): massert(useAnzprob<=probs.size()) runs after the qsort call it
// would guard - confirm the intended order.
// NOTE(review): neu and newKatOfWord are allocated with new; no delete is
// visible in this copy - confirm ownership against a clean copy.
qsort(probs.getPointerToData(),useAnzprob,sizeof(KategProblem *),compareProblem); massert(useAnzprob<=probs.size()); double startTime=clockSec(); int i, numberOfNew; for(numberOfNew=0; (clockSec()-startTimeinitialisierung,p0->auswertung,p0->nachbarschaft); for(w=0;wwordFreq.setAfterWords(w,5); p->wordFreq.setBeforeWords(w,5); } for(w=0;w &after=p0->wordFreq.after[w]; int size=after.size(); for(i=0;iwordFreq.addFreq(newWords[w],newWords[after[i].w],after[i].n); } p->wordFreq.testFull(1); p->wordFreq.set_h_of_words(p0->wordFreq.get_h_of_words()); double w1=0.0,w2=0.0; if(numberOfNewinitLike)[newWords[i]]=probs[numberOfNew]->katOfWord(i); p->_initialize(5); HCOptimization hc(*p,-1); if(verboseMode) { w1=p->nicevalue(); cout << "from old category system:" << w1 << endl; } hc.minimize(-1); if(verboseMode) { w2=p->nicevalue(); if(w2_initialize(1); double mean; StatVar end,laufzeit,start; solveProblem(0,*p,1,-1,verfahren,mean,end,laufzeit,start); w2=p->value(); if(verboseMode) cout << "new category system: " << w2 << " (" << p->nicevalue() << ") Zeit: " << clockSec() << "\n"; } } int p; for(p=0;pvalue() << " " << neu[0]->nicevalue() << " (" << numberOfNew-anzprob << ")" << "time: " << clockSec() << endl; if( indexOfDurchschnitt<=nKats || (clockSec()>maxClock&&maxClock) ) { if( clockSec()>maxClock&&maxClock ) cout << "STOP (time limit: " << (clockSec()-maxClock) << " s)\n"; for(i=0;ikatOfWord(newWords[i]); return neu; } else { Array &newKatOfWord= *(new Array(neu[0]->wordFreq.nWords,-1)); Array &erg=_izrOptimization(neu,anzprob,timeForOneRed, maxClock,newKatOfWord, anzIter+1,verfahren); for(i=0;i katOfWord(p.wordFreq.nWords,-1); int startN; if( clockForOneRed<=0 ) startN=firstN; else startN=1000; Array probs(startN); double val1=0.0,val2=0.0; double endTime=-1; double startTime=clockSec(); int i; for(i=0;i=firstN-1 && (startTime+clockForOneRed>clockSec() || i==999) ) break; } if( endTime<0 ) endTime=clockSec(); massert(i>=firstN); 
// Final routine, second half (its header is lost). The i problems collected so
// far are sorted with compareProblem. If clockForOneRed was not positive it is
// set to the measured endTime-startTime. _izrOptimization is then called with
// katOfWord, which afterwards becomes the initLike of a copy of p made with
// makeEqualProblem. That copy is initialised with _initialize(5), improved
// with HCOptimization, its cost is printed, it is dumped to PrintBestTo when
// that pointer is set, and it is returned to the caller.
// NOTE(review): massert(i<=probs.size()) follows the qsort call it would guard.
// NOTE(review): the array reference returned by _izrOptimization is discarded
// here - confirm whether it should be released.
qsort(probs.getPointerToData(),i,sizeof(KategProblem *),compareProblem); massert(i<=probs.size()); if( clockForOneRed<=0 ) { clockForOneRed=endTime-startTime; if( verboseMode ) cout << "time for one reduction: " << clockForOneRed << endl; } _izrOptimization(probs,minN,clockForOneRed,maxClock,katOfWord,0,verfahren); KategProblem *n=(KategProblem *)(p.makeEqualProblem()); n->initLike= katOfWord; n->_initialize(5); if( verboseMode ) val1=n->value(); HCOptimization hc(*n,-1); hc.minimize(-1); val2=n->value(); if( verboseMode ) cout << "last improvement: " << val2-val1 << "\n"; cout << "final costs: " << val2 << " " << n->nicevalue() << endl; if(PrintBestTo) n->dumpOn(*PrintBestTo); return n; }