/************************************************************************* * * * (C) Copyright 2004. Media Research Centre at the * * Sociology and Communications Department of the * * Budapest University of Technology and Economics. * * * * Developed by Daniel Varga. * * * *************************************************************************/ #ifndef __HUNGLISH_ALIGNMENT_TRAILPOSTPROCESSORS_H #define __HUNGLISH_ALIGNMENT_TRAILPOSTPROCESSORS_H #include "alignment.h" namespace Hunglish { // Helper class that calculates scores of holes. class TrailScores { public: TrailScores( const Trail& trail_, const AlignMatrix& dynMatrix_ ); // The score of the jth segmentum. The bigger the better. double operator()( int j ) const; private: const Trail& trail; const AlignMatrix& dynMatrix; }; class SentenceList; // Helper class that calculates scores of segmentums. class TrailScoresInterval { public: TrailScoresInterval( const Trail& trail_, const AlignMatrix& dynMatrix_, const SentenceList& huSentenceList_, const SentenceList& enSentenceList_ ); // The average score of the jth segmentum. The bigger the better. // Division is by the maximum of the Hungarian and English intervals. // This is a somewhat arbritary decision, and goes very badly with the // scoring of the knight's moves. But we really have no better choice. // // Also, the method applies some very ugly hacks to avoid the effect of // paragraph-delimiters. It strips both intervals of

s, and // modifies the dynMatrix-based score assuming that all

s got paired. // except surplus

s. double scoreSegmentum( const Rundle& start, const Rundle& end ) const; // The score of a segment identified by its index. double operator()( int j ) const; // The score of a union of segments identified by its start and end rundles' index. // Both these methods rely on scoreSegmentum(): // This means an important thing: the score only depends // on the start and end rundle, not the rundles in between. double operator()( int j, int k ) const; private: const Trail& trail; const AlignMatrix& dynMatrix; const SentenceList& huSentenceList; const SentenceList& enSentenceList; }; // Helper class that calculates scores of one-to-one holes. class BisentenceListScores { public: BisentenceListScores( const BisentenceList& bisentenceList_, const AlignMatrix& dynMatrix_ ); // The score of the jth bisentence. The bigger the better. double operator()( int j ) const; private: const BisentenceList& bisentenceList; const AlignMatrix& dynMatrix; }; void removeRundles( Trail& trail, const std::set& rundlesToKill ); // In cautious mode, auto-aligned rundles are thrown away if // their left or right neighbour holes are not one-to-one. // From the point of view of the resultant bisentences: // In cautious mode, one-to-one bisentences are thrown away if // they have left or right neighbours which are not one-to-one. // This of course dramatically improves precision while slightly degrading recall. void cautiouslyFilterTrail( Trail& bestTrail ); void spaceOutBySentenceLength( Trail& bestTrail, const SentenceList& huSentenceListPretty, const SentenceList& enSentenceList ); // The function gets a nonconst reference to bestTrail. // On the other hand, it gets a const reference to bestTrail, through trailScoresInterval. // Therefore, the function may only modify bestTrail after it finished reading trailScoresInterval. void postprocessTrailStart( Trail& bestTrail, const TrailScoresInterval& trailScoresInterval, const double& qualityThreshold ); // The function gets a nonconst reference to bestTrail. // On the other hand, it gets a const reference to bestTrail, through trailScoresInterval. // Therefore, the function may only modify bestTrail after it finished reading trailScoresInterval. void postprocessTrailStartAndEnd( Trail& bestTrail, const TrailScoresInterval& trailScoresInterval, double qualityThreshold ); // The function gets a nonconst reference to bestTrail. // On the other hand, it gets a const reference to bestTrail, through trailScoresInterval. // Therefore, the function may only modify bestTrail after it finished reading trailScoresInterval. void postprocessTrail( Trail& bestTrail, const TrailScoresInterval& trailScoresInterval, double qualityThreshold ); // Throws away rundles which are predominantly surrounded by not-one-to-one holes. void postprocessTrailByTopology( Trail& bestTrail, double qualityThreshold ); // Only collect bisentences with score at least qualityThreshold. void trailToBisentenceList( const Trail& bestTrail, const TrailScores& trailScores, double qualityThreshold, BisentenceList& bisentenceList ); // This is basically incorrect. // Here we use the score of the right-hand segment to decide about the rundle. // // The function gets a nonconst reference to bestTrail. // On the other hand, it gets a const reference to bestTrail, through trailScoresInterval. // Therefore, the function may only modify bestTrail after it finished reading trailScoresInterval. void filterTrailByQuality( Trail& trail, const TrailScoresInterval& trailScoresInterval, const double& qualityThreshold ); void filterBisentenceListByQuality( BisentenceList& bisentenceList, const AlignMatrix& dynMatrix, const double& qualityThreshold ); } // namespace Hunglish #endif // #define __HUNGLISH_ALIGNMENT_TRAILPOSTPROCESSORS_H