#ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_H_
#define TESSERACT_WORDREC_LANGUAGE_MODEL_H_

#include "associate.h"
#include "dawg.h"
#include "dict.h"
#include "fontinfo.h"
#include "intproto.h"
#include "matrix.h"
#include "oldheap.h"
#include "params.h"
#include "pageres.h"
namespace tesseract {
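
// Used for expressing various language model flags.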
typedef unsigned char LanguageModelFlagsType;
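
// Tracks how consistent the characters along a search path are with respect
// to punctuation, character case, script, character type (alpha/digit/other),
// spacing and font. The counts accumulated here are turned into penalties
// by LanguageModel::ComputeConsistencyAdjustment().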
struct LanguageModelConsistencyInfo {
  LanguageModelConsistencyInfo()
    : punc_ref(NO_EDGE), num_punc(0), invalid_punc(false),
      num_non_first_upper(0), num_lower(0),
      script_id(0), inconsistent_script(false),
      num_alphas(0), num_digits(0), num_other(0),
      num_inconsistent_spaces(0), inconsistent_font(false) {}
  inline int NumInconsistentPunc() const {
    return invalid_punc ? num_punc : 0;
  }
  inline int NumInconsistentCase() const {
    return (num_non_first_upper > num_lower) ? num_lower : num_non_first_upper;
  }
  inline int NumInconsistentChartype() const {
    return (NumInconsistentPunc() + num_other +
            ((num_alphas > num_digits) ? num_digits : num_alphas));
  }
  inline bool Consistent() const {
    return (NumInconsistentPunc() == 0 && NumInconsistentCase() == 0 &&
            NumInconsistentChartype() == 0 && !inconsistent_script);
  }
  inline int NumInconsistentSpaces() const {
    return num_inconsistent_spaces;
  }

  EDGE_REF punc_ref;
  int num_punc;
  bool invalid_punc;
  int num_non_first_upper;
  int num_lower;
  int script_id;
  bool inconsistent_script;
  int num_alphas;
  int num_digits;
  int num_other;
  int num_inconsistent_spaces;
  bool inconsistent_font;
};
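
// Bookkeeping for the dictionary (dawg) component of the language model:
// the dawgs that are still active along this search path, the accumulated
// dawg constraints, and the permuter type of the path. The vectors passed to
// the constructor are deep-copied and owned by this struct.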
struct LanguageModelDawgInfo {
  LanguageModelDawgInfo(DawgInfoVector *a, DawgInfoVector *c,
                        PermuterType pt) : permuter(pt) {
    active_dawgs = new DawgInfoVector(*a);
    constraints = new DawgInfoVector(*c);
  }
  ~LanguageModelDawgInfo() {
    delete active_dawgs;
    delete constraints;
  }
  DawgInfoVector *active_dawgs;
  DawgInfoVector *constraints;
  PermuterType permuter;
};
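
// Bookkeeping for the character ngram component of the language model:
// the context string accumulated along the path, its length in unichar
// steps, whether the ngram model would have pruned this path, and the
// ngram probability and cost assigned to the path.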
struct LanguageModelNgramInfo {
  LanguageModelNgramInfo(const char *c, int l, bool p, float np, float nc)
    : context(c), context_unichar_step_len(l), pruned(p), ngram_prob(np),
      ngram_cost(nc) {}
  // Character context accumulated along the path so far.
  STRING context;
  // Length of context in unichar steps (see UNICHAR::utf8_step()); should be
  // at most the order of the character ngram model used.
  int context_unichar_step_len;
  // True if the ngram model alone would have pruned this path; such paths are
  // still explored when they are justified by other components of the model
  // (e.g. a dictionary match or a top choice).
  bool pruned;
  // Probability score assigned to the path by the character ngram model alone
  // (filled in by ComputeNgramCost()).
  float ngram_prob;
  // Cost of the path that combines the classifier scores with the ngram
  // model scores (computed by ComputeNgramCost()).
  float ngram_cost;
};
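
// One path explored by the Viterbi search, ending at a particular
// BLOB_CHOICE. Each entry points back to its parent entry and accumulates
// the ratings, certainties, consistency counts and dawg/ngram state that
// are needed to score and prune the path.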
struct ViterbiStateEntry : public ELIST_LINK {
  ViterbiStateEntry(BLOB_CHOICE *pb, ViterbiStateEntry *pe,
                    BLOB_CHOICE *b, float c, float ol,
                    const LanguageModelConsistencyInfo &ci,
                    const AssociateStats &as,
                    LanguageModelFlagsType tcf,
                    LanguageModelDawgInfo *d, LanguageModelNgramInfo *n)
    : cost(c), parent_b(pb), parent_vse(pe), ratings_sum(b->rating()),
      min_certainty(b->certainty()), adapted(b->adapted()), length(1),
      outline_length(ol), consistency_info(ci), associate_stats(as),
      top_choice_flags(tcf), dawg_info(d), ngram_info(n), updated(true) {
    if (pe != NULL) {
      ratings_sum += pe->ratings_sum;
      if (pe->min_certainty < min_certainty) {
        min_certainty = pe->min_certainty;
      }
      adapted += pe->adapted;
      length += pe->length;
      outline_length += pe->outline_length;
    }
  }
  ~ViterbiStateEntry() {
    delete dawg_info;
    delete ngram_info;
  }

  // Comparator for sorting ViterbiStateEntry_LISTs in non-decreasing order
  // of costs.
  static int Compare(const void *e1, const void *e2) {
    const ViterbiStateEntry *ve1 =
        *reinterpret_cast<const ViterbiStateEntry * const *>(e1);
    const ViterbiStateEntry *ve2 =
        *reinterpret_cast<const ViterbiStateEntry * const *>(e2);
    return (ve1->cost < ve2->cost) ? -1 : 1;
  }
  // A path is also considered consistent if it is case-consistent and is
  // found in a dawg.
  inline bool Consistent() const {
    if (dawg_info != NULL && consistency_info.NumInconsistentCase() == 0) {
      return true;
    }
    return consistency_info.Consistent();
  }

  // Adjusted cost of the path (the quantity minimized by the search).
  float cost;

  // Pointers to the parent BLOB_CHOICE and ViterbiStateEntry (not owned).
  BLOB_CHOICE *parent_b;
  ViterbiStateEntry *parent_vse;

  // Information accumulated over the characters on the path.
  float ratings_sum;      // sum of the ratings of the characters on the path
  float min_certainty;    // minimum certainty on the path
  int adapted;            // number of characters from adapted templates
  int length;             // number of characters on the path
  float outline_length;   // estimated length of the outlines on the path
  LanguageModelConsistencyInfo consistency_info;  // path consistency counts
  AssociateStats associate_stats;  // character width/gap statistics

  // Flags marking which top choice criteria (smallest rating, lower/upper
  // case, etc.) this entry satisfies.
  LanguageModelFlagsType top_choice_flags;

  // State of the dawg component of the language model (owned, may be NULL).
  LanguageModelDawgInfo *dawg_info;

  // State of the ngram component of the language model (owned, may be NULL).
  LanguageModelNgramInfo *ngram_info;

  // True if the entry was just created or updated.
  bool updated;
};

ELISTIZEH(ViterbiStateEntry);
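
// Language model state recorded for a cell (col, row) of the ratings matrix:
// the list of ViterbiStateEntries for the best paths ending at that cell,
// together with counters and a cost bound used for pruning the list.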
struct LanguageModelState {
  LanguageModelState(int col, int row) : contained_in_col(col),
    contained_in_row(row), viterbi_state_entries_prunable_length(0),
    viterbi_state_entries_length(0),
    viterbi_state_entries_prunable_max_cost(MAX_FLOAT32) {}
  ~LanguageModelState() {}

  // Coordinates of the cell in the ratings matrix that this state belongs to.
  int contained_in_col;
  int contained_in_row;

  // Viterbi state entries for the paths ending at this cell.
  ViterbiStateEntry_LIST viterbi_state_entries;
  // Number of prunable entries (those for which PrunablePath() is true) in
  // the list above.
  int viterbi_state_entries_prunable_length;
  // Total number of entries in viterbi_state_entries.
  int viterbi_state_entries_length;
  // Cost bound used when deciding whether to keep new prunable entries.
  float viterbi_state_entries_prunable_max_cost;
};
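
// Bundle of pointers to the structures (owned by the caller) that record the
// best choice found so far; updated by LanguageModel::UpdateBestChoice().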
struct BestChoiceBundle {
  BestChoiceBundle(STATE *s, WERD_CHOICE *bc, WERD_CHOICE *rc,
                   BLOB_CHOICE_LIST_VECTOR *bcc)
    : best_state(s), best_choice(bc), raw_choice(rc),
      best_char_choices(bcc), updated(false), best_vse(NULL), best_b(NULL) {}

  STATE *best_state;
  WERD_CHOICE *best_choice;
  WERD_CHOICE *raw_choice;
  BLOB_CHOICE_LIST_VECTOR *best_char_choices;
  bool updated;
  DANGERR fixpt;
  ViterbiStateEntry *best_vse;
  BLOB_CHOICE *best_b;
};
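
// Summary of the best path recorded for a column of the ratings matrix: its
// average cost and the ViterbiStateEntry and BLOB_CHOICE that identify it;
// used by the pain point generation heuristics.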
struct BestPathByColumn {
  float avg_cost;
  ViterbiStateEntry *best_vse;
  BLOB_CHOICE *best_b;
};
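
// Contains the data structures and functions needed to represent and use
// knowledge about the language during the segmentation search.
//
// Rough intended call sequence (a sketch only; the actual driver is the
// segmentation search in wordrec, and several details are elided here):
//
//   LanguageModel lang_model(fontinfo_table, dict);
//   lang_model.InitForWord(prev_word, fixed_pitch, best_choice_cert,
//                          max_char_wh_ratio, rating_cert_scale,
//                          pain_points, chunks_record,
//                          blamer_bundle, debug_blamer);
//   // For each (col, row) cell of the ratings matrix whose neighborhood
//   // changed, update the language model state and collect pain points.
//   lang_model.UpdateState(changed, col, row, curr_list, parent_list,
//                          pain_points, best_path_by_column, chunks_record,
//                          best_choice_bundle, blamer_bundle);
//   lang_model.CleanUp();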
class LanguageModel {
 public:
  static const float kInitialPainPointPriorityAdjustment;
  static const float kDefaultPainPointPriorityAdjustment;
  static const float kBestChoicePainPointPriorityAdjustment;
  static const float kCriticalPainPointPriorityAdjustment;

  static const float kMaxAvgNgramCost;

  static const int kMinFixedLengthDawgLength;

  static const float kLooseMaxCharWhRatio;

  static const LanguageModelFlagsType kSmallestRatingFlag = 0x1;
  static const LanguageModelFlagsType kLowerCaseFlag = 0x2;
  static const LanguageModelFlagsType kUpperCaseFlag = 0x4;
  static const LanguageModelFlagsType kConsistentFlag = 0x8;
  static const LanguageModelFlagsType kDawgFlag = 0x10;
  static const LanguageModelFlagsType kNgramFlag = 0x20;
  static const LanguageModelFlagsType kJustClassifiedFlag = 0x80;
  static const LanguageModelFlagsType kAllChangedFlag = 0xff;

  LanguageModel(const UnicityTable<FontInfo> *fontinfo_table, Dict *dict);
  ~LanguageModel();
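
  // Prepares the language model for processing the given word: records the
  // context from the previous word, sets up the initial dawg and constraint
  // vectors, and stores the parameters that will be used while this word is
  // being searched.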
  void InitForWord(const WERD_CHOICE *prev_word,
                   bool fixed_pitch, float best_choice_cert,
                   float max_char_wh_ratio, float rating_cert_scale,
                   HEAP *pain_points, CHUNKS_RECORD *chunks_record,
                   BlamerBundle *blamer_bundle, bool debug_blamer);

  // Frees the data structures allocated while processing the current word.
  void CleanUp();

  // Deletes the language model state attached to each BLOB_CHOICE in the
  // given list (if any) and sets it to NULL.
  void DeleteState(BLOB_CHOICE_LIST *choices);
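
  // Combines the Viterbi state entries recorded for parent_list with each of
  // the choices in curr_list, adds the resulting ViterbiStateEntries to the
  // state of curr_list, generates pain points, and updates the best choice if
  // a better one is found. The changed argument describes what changed in the
  // neighborhood of (curr_col, curr_row); the return value describes what was
  // updated in curr_list.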
  LanguageModelFlagsType UpdateState(
      LanguageModelFlagsType changed,
      int curr_col, int curr_row,
      BLOB_CHOICE_LIST *curr_list,
      BLOB_CHOICE_LIST *parent_list,
      HEAP *pain_points,
      BestPathByColumn *best_path_by_column[],
      CHUNKS_RECORD *chunks_record,
      BestChoiceBundle *best_choice_bundle,
      BlamerBundle *blamer_bundle);
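
  // Pain point generation: the functions below push new (col, row) entries
  // onto the pain_points heap so that the corresponding cells of the ratings
  // matrix get classified and the search can explore paths through them.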
  // Generates ngram-model-related pain points for the given column.
  void GenerateNgramModelPainPointsFromColumn(int col, int row,
                                              HEAP *pain_points,
                                              CHUNKS_RECORD *chunks_record);

  // Generates pain points along the problematic path recorded for the given
  // column (see BestPathByColumn).
  void GenerateProblematicPathPainPointsFromColumn(
      int col, int row, float best_choice_cert,
      HEAP *pain_points, BestPathByColumn *best_path_by_column[],
      CHUNKS_RECORD *chunks_record);

  // Generates pain points for the non-empty rows of the given column.
  void GeneratePainPointsFromColumn(
      int col,
      const GenericVector<int> &non_empty_rows,
      float best_choice_cert,
      HEAP *pain_points,
      BestPathByColumn *best_path_by_column[],
      CHUNKS_RECORD *chunks_record);

  // Generates pain points from the current best choice.
  void GeneratePainPointsFromBestChoice(
      HEAP *pain_points,
      CHUNKS_RECORD *chunks_record,
      BestChoiceBundle *best_choice_bundle);

  // Pushes a pain point for the cell (col, row) onto the pain_points heap if
  // it passes the sanity checks; returns true if a pain point was generated.
  bool GeneratePainPoint(int col, int row, bool ok_to_extend,
                         float priority_adjustment,
                         float worst_piece_cert,
                         bool fragmented,
                         float best_choice_cert,
                         float max_char_wh_ratio,
                         BLOB_CHOICE *parent_b,
                         ViterbiStateEntry *parent_vse,
                         CHUNKS_RECORD *chunks_record,
                         HEAP *pain_points);
  // Returns true if an acceptable best choice was found while processing the
  // current word.
  inline bool AcceptableChoiceFound() { return acceptable_choice_found_; }

  // Fills *cert with the worst certainty over the sub-pieces (col, row-1) and
  // (col+1, row) of the piece at (col, row) in the ratings matrix, and sets
  // *fragmented if the top choices of either sub-piece are character
  // fragments. Certainties are expected to be negative, hence the assertion.
  inline void GetWorstPieceCertainty(int col, int row, MATRIX *ratings,
                                     float *cert, bool *fragmented) {
    *cert = 0.0f;
    *fragmented = false;
    if (row > 0) {
      GetPieceCertainty(ratings->get(col, row-1), cert, fragmented);
    }
    if (col+1 < ratings->dimension()) {
      GetPieceCertainty(ratings->get(col+1, row), cert, fragmented);
    }
    ASSERT_HOST(*cert < 0.0f);
  }
  // Returns an estimate of the outline length of the given BLOB_CHOICE,
  // derived from its rating and certainty scaled by rating_cert_scale_.
  inline float ComputeOutlineLength(BLOB_CHOICE *b) {
    return rating_cert_scale_ * b->rating() / b->certainty();
  }

 protected:
  // Converts a certainty into a positive score that is larger for more
  // certain results: either a sigmoid of the certainty normalized by
  // dict_->certainty_scale, or simply -1/certainty.
  inline float CertaintyScore(float cert) {
    if (language_model_use_sigmoidal_certainty) {
      // Normalize the certainty (assumed to lie in [-certainty_scale, 0])
      // and map it through a sigmoid.
      cert = -cert / dict_->certainty_scale;
      return 1.0f / (1.0f + exp(10.0f * cert));
    } else {
      return (-1.0f / cert);
    }
  }
  // Returns true if the given unichar is neither an alpha nor a digit and the
  // cell (col, row) is in the middle of the word (i.e. the word neither
  // starts in this column nor ends in this row).
  inline bool NonAlphaOrDigitMiddle(int col, int row, int dimension,
                                    UNICHAR_ID unichar_id) {
    return (!dict_->getUnicharset().get_isalpha(unichar_id) &&
            !dict_->getUnicharset().get_isdigit(unichar_id) &&
            col > 0 && row+1 < dimension);
  }

  // Returns true if the given choice is a character fragment.
  inline bool IsFragment(BLOB_CHOICE *b) {
    return dict_->getUnicharset().get_fragment(b->unichar_id());
  }

  // Returns true if the script with the given id is the Han script.
  inline bool IsHan(int script_id) {
    return ((dict_->getUnicharset().han_sid() !=
             dict_->getUnicharset().null_sid()) &&
            (script_id == dict_->getUnicharset().han_sid()));
  }

  // Finds the first non-fragment choice in the given list, updates *cert with
  // its certainty if it is worse than the current value, and sets *fragmented
  // if fragment choices were skipped over.
  inline void GetPieceCertainty(BLOB_CHOICE_LIST *blist,
                                float *cert, bool *fragmented) {
    if (blist == NOT_CLASSIFIED || blist->empty()) return;
    BLOB_CHOICE_IT bit(blist);
    while (!bit.at_last() && IsFragment(bit.data())) {
      *fragmented = true;
      bit.forward();
    }
    // At this point the iterator is either at a non-fragment choice or at
    // the last element of the list.
    ASSERT_HOST(!IsFragment(bit.data()));
    if (bit.data()->certainty() < *cert) *cert = bit.data()->certainty();
  }

  // Returns the penalty for the given number of problems: 0 if there are
  // none, and penalty plus a fixed increment for each problem beyond the
  // first otherwise.
  inline float ComputeAdjustment(int num_problems, float penalty) {
    if (num_problems == 0) return 0.0f;
    if (num_problems == 1) return penalty;
    return (penalty + (language_model_penalty_increment *
                       static_cast<float>(num_problems-1)));
  }
  // Returns the cost adjustment for the inconsistencies recorded in
  // consistency_info. For dictionary words (dawg_info != NULL) only case
  // inconsistency is penalized; otherwise penalties for punctuation, case,
  // character type, spacing, script and font inconsistencies are summed.
  inline float ComputeConsistencyAdjustment(
      const LanguageModelDawgInfo *dawg_info,
      const LanguageModelConsistencyInfo &consistency_info) {
    if (dawg_info != NULL) {
      return ComputeAdjustment(consistency_info.NumInconsistentCase(),
                               language_model_penalty_case);
    }
    return (ComputeAdjustment(consistency_info.NumInconsistentPunc(),
                              language_model_penalty_punc) +
            ComputeAdjustment(consistency_info.NumInconsistentCase(),
                              language_model_penalty_case) +
            ComputeAdjustment(consistency_info.NumInconsistentChartype(),
                              language_model_penalty_chartype) +
            ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
                              language_model_penalty_spacing) +
            (consistency_info.inconsistent_script ?
             language_model_penalty_script : 0.0f) +
            (consistency_info.inconsistent_font ?
             language_model_penalty_font : 0.0f));
  }

  // Returns the given ratings_sum scaled up by the consistency adjustment.
  inline float ComputeConsistencyAdjustedRatingsSum(
      float ratings_sum,
      const LanguageModelDawgInfo *dawg_info,
      const LanguageModelConsistencyInfo &consistency_info) {
    return (ratings_sum * (1.0f + ComputeConsistencyAdjustment(
        dawg_info, consistency_info)));
  }
  // Computes the cost of the path taking into account the ratings of the
  // character choices, the dawg and ngram components of the language model,
  // and the consistency and width/gap statistics of the path.
  float ComputeAdjustedPathCost(
      float ratings_sum, int length, float dawg_score,
      const LanguageModelDawgInfo *dawg_info,
      const LanguageModelNgramInfo *ngram_info,
      const LanguageModelConsistencyInfo &consistency_info,
      const AssociateStats &associate_stats,
      ViterbiStateEntry *parent_vse);

  // Returns true if the path represented by the given ViterbiStateEntry
  // looks problematic (e.g. inconsistent or poorly rated) and therefore
  // deserves extra pain points.
  bool ProblematicPath(const ViterbiStateEntry &vse,
                       UNICHAR_ID unichar_id, bool word_end);

  // Finds the first lower-case and the first upper-case BLOB_CHOICE in
  // curr_list and returns them in *first_lower and *first_upper.
  void GetTopChoiceLowerUpper(LanguageModelFlagsType changed,
                              BLOB_CHOICE_LIST *curr_list,
                              BLOB_CHOICE **first_lower,
                              BLOB_CHOICE **first_upper);
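
  // Creates a new ViterbiStateEntry for the given BLOB_CHOICE b with parent
  // (parent_b, parent_vse), fills in its dawg, ngram and consistency
  // information, computes its adjusted cost, and adds it to the Viterbi list
  // of the current cell if it survives pruning. Returns flags describing
  // what was updated.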
  LanguageModelFlagsType AddViterbiStateEntry(
      LanguageModelFlagsType top_choice_flags,
      float denom,
      bool word_end,
      int curr_col, int curr_row,
      BLOB_CHOICE *b,
      BLOB_CHOICE *parent_b,
      ViterbiStateEntry *parent_vse,
      HEAP *pain_points,
      BestPathByColumn *best_path_by_column[],
      CHUNKS_RECORD *chunks_record,
      BestChoiceBundle *best_choice_bundle,
      BlamerBundle *blamer_bundle);

  // Prints debug information about the given ViterbiStateEntry.
  void PrintViterbiStateEntry(const char *msg,
                              ViterbiStateEntry *vse,
                              BLOB_CHOICE *b,
                              CHUNKS_RECORD *chunks_record);

  // Updates the top choice flags for the new entry and records what changed
  // in *changed.
  void GenerateTopChoiceInfo(
      float ratings_sum,
      const LanguageModelDawgInfo *dawg_info,
      const LanguageModelConsistencyInfo &consistency_info,
      const ViterbiStateEntry *parent_vse,
      BLOB_CHOICE *b,
      LanguageModelFlagsType *top_choice_flags,
      LanguageModelFlagsType *changed);
  // Checks whether the path extended with the given BLOB_CHOICE is still in
  // the dawgs recorded for the parent entry. Returns a newly allocated
  // LanguageModelDawgInfo for the extended path, or NULL if the path is not
  // found in any of the dawgs. Sets the appropriate bits in *changed.
  LanguageModelDawgInfo *GenerateDawgInfo(bool word_end, int script_id,
                                          int curr_col, int curr_row,
                                          const BLOB_CHOICE &b,
                                          const ViterbiStateEntry *parent_vse,
                                          LanguageModelFlagsType *changed);

  // Computes the ngram information for the path extended with the given
  // unichar: appends the unichar to the parent context and computes the new
  // ngram probability and cost. Returns a newly allocated
  // LanguageModelNgramInfo and sets the appropriate bits in *changed.
  LanguageModelNgramInfo *GenerateNgramInfo(const char *unichar,
                                            float certainty, float denom,
                                            int curr_col, int curr_row,
                                            const ViterbiStateEntry *parent_vse,
                                            BLOB_CHOICE *parent_b,
                                            LanguageModelFlagsType *changed);

  // Computes the cost of adding the given unichar to the given context
  // according to the character ngram model, normalizing by denom. Updates
  // *unichar_step_len, *found_small_prob and *ngram_prob as side outputs.
  float ComputeNgramCost(const char *unichar, float certainty, float denom,
                         const char *context, int *unichar_step_len,
                         bool *found_small_prob, float *ngram_prob);

  // Computes the normalizing denominator used by the ngram model from the
  // certainties of the choices in the given list.
  float ComputeDenom(BLOB_CHOICE_LIST *curr_list);

  // Fills the consistency counts (punctuation, case, script, character type,
  // spacing, font) for the path obtained by extending the parent entry with
  // the given BLOB_CHOICE.
  void FillConsistencyInfo(
      int curr_col, bool word_end, BLOB_CHOICE *b,
      ViterbiStateEntry *parent_vse, BLOB_CHOICE *parent_b,
      CHUNKS_RECORD *chunks_record,
      LanguageModelConsistencyInfo *consistency_info);
  // If the new path represented by (b, vse) is better than the current best
  // choice, updates the structures in best_choice_bundle and, if needed, the
  // blamer_bundle, and may generate new pain points.
  void UpdateBestChoice(BLOB_CHOICE *b,
                        ViterbiStateEntry *vse,
                        HEAP *pain_points,
                        CHUNKS_RECORD *chunks_record,
                        BestChoiceBundle *best_choice_bundle,
                        BlamerBundle *blamer_bundle);

  // Fills the given features array with raw features extracted from the path
  // represented by the given ViterbiStateEntry.
  void ExtractRawFeaturesFromPath(const ViterbiStateEntry &vse,
                                  float *features);

  // Constructs a WERD_CHOICE by tracing the path backwards from (b, vse)
  // through the parent pointers, filling in best_char_choices, certainties,
  // the dawg score, the segmentation state and (when a blamer_bundle is
  // provided) *truth_path.
  WERD_CHOICE *ConstructWord(BLOB_CHOICE *b,
                             ViterbiStateEntry *vse,
                             CHUNKS_RECORD *chunks_record,
                             BLOB_CHOICE_LIST_VECTOR *best_char_choices,
                             float certainties[],
                             float *dawg_score,
                             STATE *state,
                             BlamerBundle *blamer_bundle,
                             bool *truth_path);
  // Helper used when computing the dawg score of a word with fixed length
  // dawgs: updates the skip/covered counts and the accumulated dawg score as
  // the word is scanned.
  void UpdateCoveredByFixedLengthDawgs(const DawgInfoVector &active_dawgs,
                                       int word_index, int word_length,
                                       int *skip, int *covered,
                                       float *dawg_score,
                                       bool *dawg_score_done);

  // Wrapper around AssociateUtils::ComputeStats().
  inline void ComputeAssociateStats(int col, int row,
                                    float max_char_wh_ratio,
                                    ViterbiStateEntry *parent_vse,
                                    CHUNKS_RECORD *chunks_record,
                                    AssociateStats *associate_stats) {
    AssociateUtils::ComputeStats(
        col, row,
        (parent_vse != NULL) ? &(parent_vse->associate_stats) : NULL,
        (parent_vse != NULL) ? parent_vse->length : 0,
        fixed_pitch_, max_char_wh_ratio,
        (chunks_record->word_res != NULL) ?
            &chunks_record->word_res->denorm : NULL,
        chunks_record, language_model_debug_level, associate_stats);
  }
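
  // Returns true if the given path may be pruned out of the Viterbi list:
  // paths that carry top choice flags, and dictionary paths when no fixed
  // length dawgs are loaded, are never pruned.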
  inline bool PrunablePath(LanguageModelFlagsType top_choice_flags,
                           const LanguageModelDawgInfo *dawg_info) {
    if (top_choice_flags) return false;
    if (dawg_info != NULL &&
        (dawg_info->permuter == SYSTEM_DAWG_PERM ||
         dawg_info->permuter == USER_DAWG_PERM ||
         dawg_info->permuter == FREQ_DAWG_PERM) &&
        dict_->GetMaxFixedLengthDawgIndex() < 0) return false;
    return true;
  }

  // Returns true if the given path is acceptable as a result of the search:
  // it is a dictionary path, a consistent path, or a path that was not
  // pruned by the ngram model.
  inline bool AcceptablePath(const ViterbiStateEntry &vse) {
    return (vse.dawg_info != NULL || vse.Consistent() ||
            (vse.ngram_info != NULL && !vse.ngram_info->pruned));
  }

 public:
  INT_VAR_H(language_model_debug_level, 0, "Language model debug level");
  BOOL_VAR_H(language_model_ngram_on, false,
             "Turn on/off the use of character ngram model");
  INT_VAR_H(language_model_ngram_order, 8,
            "Maximum order of the character ngram model");
  INT_VAR_H(language_model_viterbi_list_max_num_prunable, 10,
            "Maximum number of prunable (those for which PrunablePath() is"
            " true) entries in each viterbi list recorded in BLOB_CHOICEs");
  INT_VAR_H(language_model_viterbi_list_max_size, 500,
            "Maximum size of viterbi lists recorded in BLOB_CHOICEs");
  double_VAR_H(language_model_ngram_small_prob, 0.000001,
               "To avoid overly small denominators use this as the floor"
               " of the probability returned by the ngram model");
  double_VAR_H(language_model_ngram_nonmatch_score, -40.0,
               "Average classifier score of a non-matching unichar");
  BOOL_VAR_H(language_model_ngram_use_only_first_uft8_step, false,
             "Use only the first UTF8 step of the given string"
             " when computing log probabilities");
  double_VAR_H(language_model_ngram_scale_factor, 0.03,
               "Strength of the character ngram model relative to the"
               " character classifier");
  BOOL_VAR_H(language_model_ngram_space_delimited_language, true,
             "Words are delimited by space");

  INT_VAR_H(language_model_min_compound_length, 3,
            "Minimum length of compound words");
  INT_VAR_H(language_model_fixed_length_choices_depth, 3,
            "Depth of blob choice lists to explore"
            " when fixed length dawgs are on");

  double_VAR_H(language_model_penalty_non_freq_dict_word, 0.1,
               "Penalty for words not in the frequent word dictionary");
  double_VAR_H(language_model_penalty_non_dict_word, 0.15,
               "Penalty for non-dictionary words");
  double_VAR_H(language_model_penalty_punc, 0.2,
               "Penalty for inconsistent punctuation");
  double_VAR_H(language_model_penalty_case, 0.1,
               "Penalty for inconsistent case");
  double_VAR_H(language_model_penalty_script, 0.5,
               "Penalty for inconsistent script");
  double_VAR_H(language_model_penalty_chartype, 0.3,
               "Penalty for inconsistent character type");
  double_VAR_H(language_model_penalty_font, 0.00,
               "Penalty for inconsistent font");
  double_VAR_H(language_model_penalty_spacing, 0.05,
               "Penalty for inconsistent spacing");
  double_VAR_H(language_model_penalty_increment, 0.01, "Penalty increment");
  BOOL_VAR_H(language_model_use_sigmoidal_certainty, false,
             "Use sigmoidal score for certainty");
 protected:
  // Temporary DawgArgs re-used when exploring the dawgs during the search.
  DawgArgs *dawg_args_;

  // Pointers to the "updated" flags of the ViterbiStateEntries modified while
  // processing the current word (so that they can be cleared later).
  GenericVector<bool *> updated_flags_;

  // Scale factor used by ComputeOutlineLength() to estimate the outline
  // length of a BLOB_CHOICE from its rating and certainty.
  float rating_cert_scale_;

  // Table of font information (not owned by LanguageModel).
  const UnicityTable<FontInfo> *fontinfo_table_;

  // Dictionary that provides the dawgs and the unicharset (not owned).
  Dict *dict_;

  // True if the word currently being processed is in a fixed pitch block.
  bool fixed_pitch_;

  // Maximum character width-to-height ratio currently allowed.
  float max_char_wh_ratio_;

  // Context from the previous word (used by the ngram model) and its length
  // in unichar steps.
  STRING prev_word_str_;
  int prev_word_unichar_step_len_;

  // Dawg and constraint vectors used to initialize the dawg state of the
  // paths at the beginning of the word.
  DawgInfoVector *beginning_active_dawgs_;
  DawgInfoVector *beginning_constraints_;
  DawgInfoVector *fixed_length_beginning_active_dawgs_;
  DawgInfoVector *empty_dawg_info_vec_;

  float max_penalty_adjust_;

  // Set to true if an acceptable choice was found while processing the
  // current word.
  bool acceptable_choice_found_;

  // Set to true if the correct segmentation (according to the blamer's truth
  // text) was explored during the search.
  bool correct_segmentation_explored_;
};

}  // namespace tesseract

#endif  // TESSERACT_WORDREC_LANGUAGE_MODEL_H_