00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #ifndef RATNGS_H
00021 #define RATNGS_H
00022
00023 #include <assert.h>
00024
00025 #include "clst.h"
00026 #include "genericvector.h"
00027 #include "notdll.h"
00028 #include "unichar.h"
00029 #include "unicharset.h"
00030 #include "werd.h"
00031
00032 class BLOB_CHOICE: public ELIST_LINK
00033 {
00034 public:
00035 BLOB_CHOICE() {
00036 unichar_id_ = INVALID_UNICHAR_ID;
00037 fontinfo_id_ = -1;
00038 fontinfo_id2_ = -1;
00039 rating_ = MAX_FLOAT32;
00040 certainty_ = -MAX_FLOAT32;
00041 script_id_ = -1;
00042 language_model_state_ = NULL;
00043 min_xheight_ = 0;
00044 max_xheight_ = 0;
00045 adapted_ = false;
00046 }
00047 BLOB_CHOICE(UNICHAR_ID src_unichar_id,
00048 float src_rating,
00049 float src_cert,
00050 inT16 src_fontinfo_id,
00051 inT16 src_fontinfo_id2,
00052 int script_id,
00053 inT16 min_xheight,
00054 inT16 max_xheight,
00055 bool adapted);
00056 BLOB_CHOICE(const BLOB_CHOICE &other);
00057 ~BLOB_CHOICE() {}
00058
00059 UNICHAR_ID unichar_id() const {
00060 return unichar_id_;
00061 }
00062 float rating() const {
00063 return rating_;
00064 }
00065 float certainty() const {
00066 return certainty_;
00067 }
00068 inT16 fontinfo_id() const {
00069 return fontinfo_id_;
00070 }
00071 inT16 fontinfo_id2() const {
00072 return fontinfo_id2_;
00073 }
00074 int script_id() const {
00075 return script_id_;
00076 }
00077 void *language_model_state() {
00078 return language_model_state_;
00079 }
00080 inT16 xgap_before() const {
00081 return xgap_before_;
00082 }
00083 inT16 xgap_after() const {
00084 return xgap_after_;
00085 }
00086 inT16 min_xheight() const {
00087 return min_xheight_;
00088 }
00089 inT16 max_xheight() const {
00090 return max_xheight_;
00091 }
00092 bool adapted() const {
00093 return adapted_;
00094 }
00095
00096 void set_unichar_id(UNICHAR_ID newunichar_id) {
00097 unichar_id_ = newunichar_id;
00098 }
00099 void set_rating(float newrat) {
00100 rating_ = newrat;
00101 }
00102 void set_certainty(float newrat) {
00103 certainty_ = newrat;
00104 }
00105 void set_fontinfo_id(inT16 newfont) {
00106 fontinfo_id_ = newfont;
00107 }
00108 void set_fontinfo_id2(inT16 newfont) {
00109 fontinfo_id2_ = newfont;
00110 }
00111 void set_script(int newscript_id) {
00112 script_id_ = newscript_id;
00113 }
00114 void set_language_model_state(void *language_model_state) {
00115 language_model_state_ = language_model_state;
00116 }
00117 void set_xgap_before(inT16 gap) {
00118 xgap_before_ = gap;
00119 }
00120 void set_xgap_after(inT16 gap) {
00121 xgap_after_ = gap;
00122 }
00123 void set_adapted(bool adapted) {
00124 adapted_ = adapted;
00125 }
00126 static BLOB_CHOICE* deep_copy(const BLOB_CHOICE* src) {
00127 BLOB_CHOICE* choice = new BLOB_CHOICE;
00128 *choice = *src;
00129 return choice;
00130 }
00131 void print(const UNICHARSET *unicharset) {
00132 tprintf("r%.2f c%.2f : %d %s", rating_, certainty_, unichar_id_,
00133 (unicharset == NULL) ? "" :
00134 unicharset->debug_str(unichar_id_).string());
00135 }
00136
00137 private:
00138 UNICHAR_ID unichar_id_;
00139 inT16 fontinfo_id_;
00140 inT16 fontinfo_id2_;
00141 float rating_;
00142 float certainty_;
00143 int script_id_;
00144
00145
00146
00147
00148 void *language_model_state_;
00149 inT16 xgap_before_;
00150 inT16 xgap_after_;
00151
00152 inT16 min_xheight_;
00153 inT16 max_xheight_;
00154 bool adapted_;
00155 };
00156
00157
00158 ELISTIZEH (BLOB_CHOICE) CLISTIZEH (BLOB_CHOICE_LIST)
00159
00160
// Permuter codes. WERD_CHOICE stores one of these in its uinT8 permuter_
// field, with NO_PERM as the initial value.
// NOTE(review): the names suggest which permuter/dictionary produced a word;
// confirm the exact meaning of each code against the code that assigns them.
// Numeric values are spelled out explicitly and match the implicit
// enumeration order.
enum PermuterType {
  NO_PERM           = 0,
  PUNC_PERM         = 1,
  TOP_CHOICE_PERM   = 2,
  LOWER_CASE_PERM   = 3,
  UPPER_CASE_PERM   = 4,
  NGRAM_PERM        = 5,
  NUMBER_PERM       = 6,
  USER_PATTERN_PERM = 7,
  SYSTEM_DAWG_PERM  = 8,
  DOC_DAWG_PERM     = 9,
  USER_DAWG_PERM    = 10,
  FREQ_DAWG_PERM    = 11,
  COMPOUND_PERM     = 12
};
00176
00177 class WERD_CHOICE {
00178 public:
00179 static const float kBadRating;
00180
00181 WERD_CHOICE(const UNICHARSET *unicharset)
00182 : unicharset_(unicharset) { this->init(8); }
00183 WERD_CHOICE(const UNICHARSET *unicharset, int reserved)
00184 : unicharset_(unicharset) { this->init(reserved); }
00185 WERD_CHOICE(const char *src_string,
00186 const char *src_lengths,
00187 float src_rating,
00188 float src_certainty,
00189 uinT8 src_permuter,
00190 const UNICHARSET &unicharset)
00191 : unicharset_(&unicharset) {
00192 this->init(src_string, src_lengths, src_rating,
00193 src_certainty, src_permuter);
00194 }
00195 WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset);
00196 WERD_CHOICE(const WERD_CHOICE &word) : unicharset_(word.unicharset_) {
00197 this->init(word.length());
00198 this->operator=(word);
00199 }
00200 ~WERD_CHOICE();
00201
00202 const UNICHARSET *unicharset() const {
00203 return unicharset_;
00204 }
00205 inline int length() const {
00206 return length_;
00207 }
00208 inline const UNICHAR_ID *unichar_ids() const {
00209 return unichar_ids_;
00210 }
00211 inline const UNICHAR_ID unichar_id(int index) const {
00212 assert(index < length_);
00213 return unichar_ids_[index];
00214 }
00215 inline const char *fragment_lengths() const {
00216 return fragment_lengths_;
00217 }
00218 inline const char fragment_length(int index) const {
00219 assert(index < length_);
00220 return fragment_lengths_[index];
00221 }
00222 inline float rating() const {
00223 return rating_;
00224 }
00225 inline float certainty() const {
00226 return certainty_;
00227 }
00228 inline uinT8 permuter() const {
00229 return permuter_;
00230 }
00231 const char *permuter_name() const;
00232 inline bool fragment_mark() const {
00233 return fragment_mark_;
00234 }
00235 inline BLOB_CHOICE_LIST_CLIST* blob_choices() {
00236 return blob_choices_;
00237 }
00238 inline void set_unichar_id(UNICHAR_ID unichar_id, int index) {
00239 assert(index < length_);
00240 unichar_ids_[index] = unichar_id;
00241 }
00242 inline void set_fragment_length(char flen, int index) {
00243 assert(index < length_);
00244 fragment_lengths_[index] = flen;
00245 }
00246 inline void set_rating(float new_val) {
00247 rating_ = new_val;
00248 }
00249 inline void set_certainty(float new_val) {
00250 certainty_ = new_val;
00251 }
00252 inline void set_permuter(uinT8 perm) {
00253 permuter_ = perm;
00254 }
00255 inline void set_fragment_mark(bool new_fragment_mark) {
00256 fragment_mark_ = new_fragment_mark;
00257 }
00258
00259
00260
00261 inline void set_length(int len) {
00262 ASSERT_HOST(reserved_ >= len);
00263 length_ = len;
00264 }
00265 void set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices);
00266
00268 inline void double_the_size() {
00269 if (reserved_ > 0) {
00270 unichar_ids_ = GenericVector<UNICHAR_ID>::double_the_size_memcpy(
00271 reserved_, unichar_ids_);
00272 fragment_lengths_ = GenericVector<char>::double_the_size_memcpy(
00273 reserved_, fragment_lengths_);
00274 reserved_ *= 2;
00275 } else {
00276 unichar_ids_ = new UNICHAR_ID[1];
00277 fragment_lengths_ = new char[1];
00278 reserved_ = 1;
00279 }
00280 }
00281
00284 inline void init(int reserved) {
00285 reserved_ = reserved;
00286 if (reserved > 0) {
00287 unichar_ids_ = new UNICHAR_ID[reserved];
00288 fragment_lengths_ = new char[reserved];
00289 } else {
00290 unichar_ids_ = NULL;
00291 fragment_lengths_ = NULL;
00292 }
00293 length_ = 0;
00294 rating_ = 0.0;
00295 certainty_ = MAX_FLOAT32;
00296 permuter_ = NO_PERM;
00297 fragment_mark_ = false;
00298 blob_choices_ = NULL;
00299 unichars_in_script_order_ = false;
00300 unichar_string_ = "";
00301 unichar_lengths_ = "";
00302 }
00303
00309 void init(const char *src_string, const char *src_lengths,
00310 float src_rating, float src_certainty,
00311 uinT8 src_permuter);
00312
00314 inline void make_bad() {
00315 length_ = 0;
00316 rating_ = kBadRating;
00317 certainty_ = -MAX_FLOAT32;
00318 fragment_mark_ = false;
00319 unichar_string_ = "";
00320 unichar_lengths_ = "";
00321 }
00322
00326 inline void append_unichar_id_space_allocated(
00327 UNICHAR_ID unichar_id, char fragment_length,
00328 float rating, float certainty) {
00329 assert(reserved_ > length_);
00330 length_++;
00331 this->set_unichar_id(unichar_id, fragment_length,
00332 rating, certainty, length_-1);
00333 }
00334
00335 void append_unichar_id(UNICHAR_ID unichar_id, char fragment_length,
00336 float rating, float certainty);
00337
00338 inline void set_unichar_id(UNICHAR_ID unichar_id, char fragment_length,
00339 float rating, float certainty, int index) {
00340 assert(index < length_);
00341 unichar_ids_[index] = unichar_id;
00342 fragment_lengths_[index] = fragment_length;
00343 rating_ += rating;
00344 if (certainty < certainty_) {
00345 certainty_ = certainty;
00346 }
00347 }
00348
00349 bool contains_unichar_id(UNICHAR_ID unichar_id) const;
00350 void remove_unichar_ids(int index, int num);
00351 inline void remove_last_unichar_id() { --length_; }
00352 inline void remove_unichar_id(int index) {
00353 this->remove_unichar_ids(index, 1);
00354 }
00355 bool has_rtl_unichar_id() const;
00356 void reverse_and_mirror_unichar_ids();
00357
00358
00359
00360
00361 void punct_stripped(int *start_core, int *end_core) const;
00362
00363
00364
00365 WERD_CHOICE shallow_copy(int start, int end) const;
00366
00367 void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const;
00368 const STRING debug_string() const {
00369 STRING word_str;
00370 for (int i = 0; i < length_; ++i) {
00371 word_str += unicharset_->debug_str(unichar_ids_[i]);
00372 word_str += " ";
00373 }
00374 return word_str;
00375 }
00379 void populate_unichars() {
00380 this->string_and_lengths(&unichar_string_, &unichar_lengths_);
00381 }
00382
00385 void depopulate_unichars() {
00386 unichar_string_ = "";
00387 unichar_lengths_ = "";
00388 }
00389
00390
00391
00392
00393 bool set_unichars_in_script_order(bool in_script_order) {
00394 return unichars_in_script_order_ = in_script_order;
00395 }
00396
00397 bool unichars_in_script_order() const {
00398 return unichars_in_script_order_;
00399 }
00400
00403 const STRING &unichar_string() const {
00404 assert(unichar_string_.length() <= 0 ||
00405 unichar_string_.length() >= length_);
00406 return unichar_string_;
00407 }
00408
00411 const STRING &unichar_lengths() const {
00412 assert(unichar_lengths_.length() <= 0 ||
00413 unichar_lengths_.length() == length_);
00414 return unichar_lengths_;
00415 }
00416 const void print() const { this->print(""); }
00417 const void print(const char *msg) const;
00418
00419 WERD_CHOICE& operator+= (
00420 const WERD_CHOICE & second);
00421
00422 WERD_CHOICE& operator= (const WERD_CHOICE& source);
00423
00424 private:
00425 const UNICHARSET *unicharset_;
00426 UNICHAR_ID *unichar_ids_;
00427 char *fragment_lengths_;
00428 int reserved_;
00429 int length_;
00430 float rating_;
00431 float certainty_;
00432 uinT8 permuter_;
00433 bool fragment_mark_;
00434
00435
00436 BLOB_CHOICE_LIST_CLIST *blob_choices_;
00437
00438
00439
00440
00441
00442 bool unichars_in_script_order_;
00443
00444
00445
00446 STRING unichar_string_;
00447 STRING unichar_lengths_;
00448
00449 bool unichar_info_present;
00450
00451 private:
00452 void delete_blob_choices();
00453 };
00454
00455
00456 ELISTIZEH (WERD_CHOICE)
00457 typedef GenericVector<BLOB_CHOICE_LIST *> BLOB_CHOICE_LIST_VECTOR;
00458 typedef GenericVector<WERD_CHOICE_LIST *> WERD_CHOICE_LIST_VECTOR;
00459
00460
00461
00462 bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1,
00463 const WERD_CHOICE &word2);
00464
00465
00466 void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings);
00467 void print_ratings_list(
00468 const char *msg,
00469 BLOB_CHOICE_LIST *ratings,
00470 const UNICHARSET ¤t_unicharset
00471
00472 );
00473 void print_ratings_info(
00474 FILE *fp,
00475 BLOB_CHOICE_LIST *ratings,
00476 const UNICHARSET ¤t_unicharset
00477
00478 );
00479 void print_char_choices_list(
00480 const char *msg,
00481 const BLOB_CHOICE_LIST_VECTOR &char_choices,
00482 const UNICHARSET ¤t_unicharset,
00483 BOOL8 detailed
00484 );
00485 void print_word_alternates_list(
00486 WERD_CHOICE *word,
00487 GenericVector<WERD_CHOICE *> *alternates,
00488 bool needs_populate_unichars);
00489
00490 #endif