00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00020
00021 #ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H__
00022 #define TESSERACT_CCMAIN_TESSERACTCLASS_H__
00023
00024 #include "allheaders.h"
00025 #include "control.h"
00026 #include "docqual.h"
00027 #include "devanagari_processing.h"
00028 #include "genericvector.h"
00029 #include "params.h"
00030 #include "ocrclass.h"
00031 #include "textord.h"
00032 #include "wordrec.h"
00033
00034 class PAGE_RES;
00035 class PAGE_RES_IT;
00036 class BLOCK_LIST;
00037 class CharSamp;
00038 class TO_BLOCK_LIST;
00039 class IMAGE;
00040 class WERD_RES;
00041 class ROW;
00042 class TBOX;
00043 class SVMenuNode;
00044 struct Pix;
00045 class WERD_CHOICE;
00046 class WERD;
00047 class BLOB_CHOICE_LIST_CLIST;
00048 struct OSResults;
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093 namespace tesseract {
00094
00095 class ColumnFinder;
00096 class CubeLineObject;
00097 class CubeObject;
00098 class CubeRecoContext;
00099 class EquationDetect;
00100 class Tesseract;
00101 class TesseractCubeCombiner;
00102
00103 typedef void (Tesseract::*WordRecognizer)(BLOCK* block,
00104 ROW *row,
00105 WERD_RES *word);
00106
00107
00108 struct TesseractStats {
00109 TesseractStats()
00110 : adaption_word_number(0),
00111 doc_blob_quality(0),
00112 doc_outline_errs(0),
00113 doc_char_quality(0),
00114 good_char_count(0),
00115 doc_good_char_quality(0),
00116 word_count(0),
00117 dict_words(0),
00118 tilde_crunch_written(false),
00119 last_char_was_newline(true),
00120 last_char_was_tilde(false),
00121 write_results_empty_block(true) {}
00122
00123 inT32 adaption_word_number;
00124 inT16 doc_blob_quality;
00125 inT16 doc_outline_errs;
00126 inT16 doc_char_quality;
00127 inT16 good_char_count;
00128 inT16 doc_good_char_quality;
00129 inT32 word_count;
00130 inT32 dict_words;
00131 STRING dump_words_str;
00132
00133 bool tilde_crunch_written;
00134 bool last_char_was_newline;
00135 bool last_char_was_tilde;
00136 bool write_results_empty_block;
00137 };
00138
00139 class Tesseract : public Wordrec {
00140 public:
00141 Tesseract();
00142 ~Tesseract();
00143
00144
00145
00146 void Clear();
00147
00148 void ResetAdaptiveClassifier();
00149
00150 void ResetDocumentDictionary();
00151
00152
00153 void SetEquationDetect(EquationDetect* detector);
00154
00155
00156 const FCOORD& reskew() const {
00157 return reskew_;
00158 }
00159
00160 Pix** mutable_pix_binary() {
00161 Clear();
00162 return &pix_binary_;
00163 }
00164 Pix* pix_binary() const {
00165 return pix_binary_;
00166 }
00167 Pix* pix_grey() const {
00168 return pix_grey_;
00169 }
00170 void set_pix_grey(Pix* grey_pix) {
00171 pixDestroy(&pix_grey_);
00172 pix_grey_ = grey_pix;
00173 }
00174
00175
00176
00177
00178
00179
00180
00181 Pix* BestPix() const {
00182 return pix_grey_ != NULL ? pix_grey_ : pix_binary_;
00183 }
00184 int source_resolution() const {
00185 return source_resolution_;
00186 }
00187 void set_source_resolution(int ppi) {
00188 source_resolution_ = ppi;
00189 }
00190 int ImageWidth() const {
00191 return pixGetWidth(pix_binary_);
00192 }
00193 int ImageHeight() const {
00194 return pixGetHeight(pix_binary_);
00195 }
00196 Pix* scaled_color() const {
00197 return scaled_color_;
00198 }
00199 int scaled_factor() const {
00200 return scaled_factor_;
00201 }
00202 void SetScaledColor(int factor, Pix* color) {
00203 scaled_factor_ = factor;
00204 scaled_color_ = color;
00205 }
00206 const Textord& textord() const {
00207 return textord_;
00208 }
00209 Textord* mutable_textord() {
00210 return &textord_;
00211 }
00212
00213 bool right_to_left() const {
00214 return right_to_left_;
00215 }
00216 int num_sub_langs() const {
00217 return sub_langs_.size();
00218 }
00219 Tesseract* get_sub_lang(int index) const {
00220 return sub_langs_[index];
00221 }
00222
00223 void SetBlackAndWhitelist();
00224
00225
00226
00227
00228
00229 void PrepareForPageseg();
00230
00231
00232
00233
00234
00235
00236 void PrepareForTessOCR(BLOCK_LIST* block_list,
00237 Tesseract* osd_tess, OSResults* osr);
00238
00239 int SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
00240 Tesseract* osd_tess, OSResults* osr);
00241 void SetupWordScripts(BLOCK_LIST* blocks);
00242 int AutoPageSeg(bool single_column, bool osd, bool only_osd,
00243 BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks,
00244 Tesseract* osd_tess, OSResults* osr);
00245 ColumnFinder* SetupPageSegAndDetectOrientation(
00246 bool single_column, bool osd, bool only_osd,
00247 BLOCK_LIST* blocks, Tesseract* osd_tess, OSResults* osr,
00248 TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix, Pix** music_mask_pix);
00249
00251 bool ProcessTargetWord(const TBOX& word_box, const TBOX& target_word_box,
00252 const char* word_config, int pass);
00253 bool recog_all_words(PAGE_RES* page_res,
00254 ETEXT_DESC* monitor,
00255 const TBOX* target_word_box,
00256 const char* word_config,
00257 int dopasses);
00258 void rejection_passes(PAGE_RES* page_res,
00259 ETEXT_DESC* monitor,
00260 const TBOX* target_word_box,
00261 const char* word_config);
00262 void bigram_correction_pass(PAGE_RES *page_res);
00263 void blamer_pass(PAGE_RES* page_res);
00264
00265
00266 bool RetryWithLanguage(WERD_RES *word, BLOCK* block, ROW *row,
00267 WordRecognizer recognizer);
00268 void classify_word_and_language(WordRecognizer recognizer,
00269 BLOCK* block, ROW *row, WERD_RES *word);
00270 void classify_word_pass1(BLOCK* block, ROW *row, WERD_RES *word);
00271 void recog_pseudo_word(PAGE_RES* page_res,
00272 TBOX &selection_box);
00273
00274 void fix_rep_char(PAGE_RES_IT* page_res_it);
00275 void ExplodeRepeatedWord(BLOB_CHOICE* best_choice, PAGE_RES_IT* page_res_it);
00276
00277 ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET& char_set,
00278 const char *s,
00279 const char *lengths);
00280 void match_word_pass2(
00281 WERD_RES *word,
00282 ROW *row,
00283 BLOCK* block);
00284 void classify_word_pass2(BLOCK* block, ROW *row, WERD_RES *word);
00285 void ReportXhtFixResult(bool accept_new_word, float new_x_ht,
00286 WERD_RES* word, WERD_RES* new_word);
00287 bool RunOldFixXht(WERD_RES *word, BLOCK* block, ROW *row);
00288 bool TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row);
00289 BOOL8 recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res);
00290
00291 void set_word_fonts(
00292 WERD_RES *word,
00293 BLOB_CHOICE_LIST_CLIST *blob_choices);
00294 void font_recognition_pass(PAGE_RES* page_res);
00295 BOOL8 check_debug_pt(WERD_RES *word, int location);
00296
00298 bool init_cube_objects(bool load_combiner,
00299 TessdataManager *tessdata_manager);
00300
00301
00302 void run_cube_combiner(PAGE_RES *page_res);
00303
00304
00305 void cube_word_pass1(BLOCK* block, ROW *row, WERD_RES *word);
00306
00307
00308 CubeObject* cube_recognize_word(BLOCK* block, WERD_RES* word);
00309
00310
00311 void cube_combine_word(CubeObject* cube_obj, WERD_RES* cube_word,
00312 WERD_RES* tess_word);
00313
00314
00315 bool cube_recognize(CubeObject *cube_obj, BLOCK* block, WERD_RES *word);
00316 void fill_werd_res(const BoxWord& cube_box_word,
00317 WERD_CHOICE* cube_werd_choice,
00318 const char* cube_best_str,
00319 WERD_RES* tess_werd_res);
00320 bool extract_cube_state(CubeObject* cube_obj, int* num_chars,
00321 Boxa** char_boxes, CharSamp*** char_samples);
00322 bool create_cube_box_word(Boxa *char_boxes, int num_chars,
00323 TBOX word_box, BoxWord* box_word);
00325
00326 void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box);
00327 void write_results(PAGE_RES_IT &page_res_it,
00328 char newline_type,
00329 BOOL8 force_eol
00330 );
00331 void set_unlv_suspects(WERD_RES *word);
00332 UNICHAR_ID get_rep_char(WERD_RES *word);
00333 BOOL8 acceptable_number_string(const char *s,
00334 const char *lengths);
00335 inT16 count_alphanums(const WERD_CHOICE &word);
00336 inT16 count_alphas(const WERD_CHOICE &word);
00338 void read_config_file(const char *filename, SetParamConstraint constraint);
00339
00340
00341
00342
00343 int init_tesseract(const char *arg0,
00344 const char *textbase,
00345 const char *language,
00346 OcrEngineMode oem,
00347 char **configs,
00348 int configs_size,
00349 const GenericVector<STRING> *vars_vec,
00350 const GenericVector<STRING> *vars_values,
00351 bool set_only_init_params);
00352 int init_tesseract(const char *datapath,
00353 const char *language,
00354 OcrEngineMode oem) {
00355 return init_tesseract(datapath, NULL, language, oem,
00356 NULL, 0, NULL, NULL, false);
00357 }
00358
00359
00360
00361
00362
00363
00364
00365
00366
00367
00368
00369
00370
00371
00372
00373
00374 int init_tesseract_internal(const char *arg0,
00375 const char *textbase,
00376 const char *language,
00377 OcrEngineMode oem,
00378 char **configs,
00379 int configs_size,
00380 const GenericVector<STRING> *vars_vec,
00381 const GenericVector<STRING> *vars_values,
00382 bool set_only_init_params);
00383
00384
00385
00386 void SetupUniversalFontIds();
00387
00388 int init_tesseract_lm(const char *arg0,
00389 const char *textbase,
00390 const char *language);
00391
00392 void recognize_page(STRING& image_name);
00393 void end_tesseract();
00394
00395 bool init_tesseract_lang_data(const char *arg0,
00396 const char *textbase,
00397 const char *language,
00398 OcrEngineMode oem,
00399 char **configs,
00400 int configs_size,
00401 const GenericVector<STRING> *vars_vec,
00402 const GenericVector<STRING> *vars_values,
00403 bool set_only_init_params);
00404
00405 void ParseLanguageString(const char* lang_str,
00406 GenericVector<STRING>* to_load,
00407 GenericVector<STRING>* not_to_load);
00408
00410 SVMenuNode *build_menu_new();
00411 void pgeditor_main(int width, int height, PAGE_RES* page_res);
00412 void process_image_event(
00413 const SVEvent &event);
00414 BOOL8 process_cmd_win_event(
00415 inT32 cmd_event,
00416 char *new_value
00417 );
00418 void debug_word(PAGE_RES* page_res, const TBOX &selection_box);
00419 void do_re_display(
00420 BOOL8 (tesseract::Tesseract::*word_painter)(BLOCK* block,
00421 ROW* row,
00422 WERD_RES* word_res));
00423 BOOL8 word_display(BLOCK* block, ROW* row, WERD_RES* word_res);
00424 BOOL8 word_bln_display(BLOCK* block, ROW* row, WERD_RES* word_res);
00425 BOOL8 word_blank_and_set_display(BLOCK* block, ROW* row, WERD_RES* word_res);
00426 BOOL8 word_set_display(BLOCK* block, ROW* row, WERD_RES* word_res);
00427 BOOL8 word_dumper(BLOCK* block, ROW* row, WERD_RES* word_res);
00429 void make_reject_map(
00430 WERD_RES *word,
00431 BLOB_CHOICE_LIST_CLIST *blob_choices,
00432 ROW *row,
00433 inT16 pass
00434 );
00435 BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map);
00436 inT16 first_alphanum_index(const char *word,
00437 const char *word_lengths);
00438 inT16 first_alphanum_offset(const char *word,
00439 const char *word_lengths);
00440 inT16 alpha_count(const char *word,
00441 const char *word_lengths);
00442 BOOL8 word_contains_non_1_digit(const char *word,
00443 const char *word_lengths);
00444 void dont_allow_1Il(WERD_RES *word);
00445 inT16 count_alphanums(
00446 WERD_RES *word);
00447 void flip_0O(WERD_RES *word);
00448 BOOL8 non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id);
00449 BOOL8 non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id);
00450 BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row);
00451 void nn_match_word(
00452 WERD_RES *word,
00453 ROW *row);
00454 void nn_recover_rejects(WERD_RES *word, ROW *row);
00455 BOOL8 test_ambig_word(
00456 WERD_RES *word);
00457 void set_done(
00458 WERD_RES *word,
00459 inT16 pass);
00460 inT16 safe_dict_word(const WERD_RES *werd_res);
00461 void flip_hyphens(WERD_RES *word);
00462 void reject_I_1_L(WERD_RES *word);
00463 void reject_edge_blobs(WERD_RES *word);
00464 void reject_mostly_rejects(WERD_RES *word);
00466 BOOL8 word_adaptable(
00467 WERD_RES *word,
00468 uinT16 mode);
00469
00471 void recog_word_recursive(WERD_RES* word,
00472 BLOB_CHOICE_LIST_CLIST *blob_choices);
00473 void recog_word(WERD_RES *word,
00474 BLOB_CHOICE_LIST_CLIST *blob_choices);
00475 void split_and_recog_word(WERD_RES* word,
00476 BLOB_CHOICE_LIST_CLIST *blob_choices);
00478 BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position);
00479 inT16 eval_word_spacing(WERD_RES_LIST &word_res_list);
00480 void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK* block);
00481 inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list);
00482 void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block);
00483 void fix_fuzzy_space_list(
00484 WERD_RES_LIST &best_perm,
00485 ROW *row,
00486 BLOCK* block);
00487 void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK* block);
00488 void fix_fuzzy_spaces(
00489 ETEXT_DESC *monitor,
00490 inT32 word_count,
00491 PAGE_RES *page_res);
00492 void dump_words(WERD_RES_LIST &perm, inT16 score,
00493 inT16 mode, BOOL8 improved);
00494 BOOL8 uniformly_spaced(WERD_RES *word);
00495 BOOL8 fixspace_thinks_word_done(WERD_RES *word);
00496 inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score);
00497 float blob_noise_score(TBLOB *blob);
00498 void break_noisiest_blob_word(WERD_RES_LIST &words);
00500 GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word);
00501 BOOL8 potential_word_crunch(WERD_RES *word,
00502 GARBAGE_LEVEL garbage_level,
00503 BOOL8 ok_dict_word);
00504 void tilde_crunch(PAGE_RES_IT &page_res_it);
00505 void unrej_good_quality_words(
00506 PAGE_RES_IT &page_res_it);
00507 void doc_and_block_rejection(
00508 PAGE_RES_IT &page_res_it,
00509 BOOL8 good_quality_doc);
00510 void quality_based_rejection(PAGE_RES_IT &page_res_it,
00511 BOOL8 good_quality_doc);
00512 void convert_bad_unlv_chs(WERD_RES *word_res);
00513 void tilde_delete(PAGE_RES_IT &page_res_it);
00514 inT16 word_blob_quality(WERD_RES *word, ROW *row);
00515 void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count,
00516 inT16 *accepted_match_count);
00517 void unrej_good_chs(WERD_RES *word, ROW *row);
00518 inT16 count_outline_errs(char c, inT16 outline_count);
00519 inT16 word_outline_errs(WERD_RES *word);
00520 BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level);
00521 CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode);
00522 inT16 failure_count(WERD_RES *word);
00523 BOOL8 noise_outlines(TWERD *word);
00525 void
00526 process_selected_words (
00527 PAGE_RES* page_res,
00528
00529 TBOX & selection_box,
00530 BOOL8 (tesseract::Tesseract::*word_processor) (BLOCK* block,
00531 ROW* row,
00532 WERD_RES* word_res));
00534 void tess_add_doc_word(
00535 WERD_CHOICE *word_choice
00536 );
00537 void tess_segment_pass1(WERD_RES *word,
00538 BLOB_CHOICE_LIST_CLIST *blob_choices);
00539 void tess_segment_pass2(WERD_RES *word,
00540 BLOB_CHOICE_LIST_CLIST *blob_choices);
00541 BOOL8 tess_acceptable_word(
00542 WERD_CHOICE *word_choice,
00543 WERD_CHOICE *raw_choice
00544 );
00546
00547
00548
00549
00550
00551
00552
00553
00554
00555
00556
00557
00558
00559
00560
00561
00562
00563
00564
00565
00566 PAGE_RES* ApplyBoxes(const STRING& fname, bool find_segmentation,
00567 BLOCK_LIST *block_list);
00568
00569
00570
00571 PAGE_RES* SetupApplyBoxes(const GenericVector<TBOX>& boxes,
00572 BLOCK_LIST *block_list);
00573
00574
00575
00576 void MaximallyChopWord(const GenericVector<TBOX>& boxes,
00577 BLOCK* block, ROW* row, WERD_RES* word_res);
00578
00579
00580
00581
00582
00583
00584
00585
00586 bool ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
00587 const TBOX& box, const TBOX& next_box,
00588 const char* correct_text);
00589
00590
00591
00592
00593
00594
00595 bool ResegmentWordBox(BLOCK_LIST *block_list,
00596 const TBOX& box, const TBOX& next_box,
00597 const char* correct_text);
00598
00599
00600 void ReSegmentByClassification(PAGE_RES* page_res);
00601
00602
00603 bool ConvertStringToUnichars(const char* utf8,
00604 GenericVector<UNICHAR_ID>* class_ids);
00605
00606
00607
00608
00609
00610
00611 bool FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
00612 WERD_RES* word_res);
00613
00614
00615
00616
00617
00618
00619 void SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
00620 int choices_pos, int choices_length,
00621 const GenericVector<UNICHAR_ID>& target_text,
00622 int text_index,
00623 float rating, GenericVector<int>* segmentation,
00624 float* best_rating, GenericVector<int>* best_segmentation);
00625
00626
00627
00628
00629 void TidyUp(PAGE_RES* page_res);
00630
00631 void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch,
00632 const char *err_msg);
00633
00634 void CorrectClassifyWords(PAGE_RES* page_res);
00635
00636
00637 void ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res);
00638
00640
00641 int CountMisfitTops(WERD_RES *word_res);
00642
00643
00644
00645
00646 float ComputeCompatibleXheight(WERD_RES *word_res);
00648
00649 BOOL_VAR_H(tessedit_resegment_from_boxes, false,
00650 "Take segmentation and labeling from box file");
00651 BOOL_VAR_H(tessedit_resegment_from_line_boxes, false,
00652 "Conversion of word/line box file to char box file");
00653 BOOL_VAR_H(tessedit_train_from_boxes, false,
00654 "Generate training data from boxed chars");
00655 BOOL_VAR_H(tessedit_make_boxes_from_boxes, false,
00656 "Generate more boxes from boxed chars");
00657 BOOL_VAR_H(tessedit_dump_pageseg_images, false,
00658 "Dump intermediate images made during page segmentation");
00659 INT_VAR_H(tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
00660 "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
00661 " 5=line, 6=word, 7=char"
00662 " (Values from PageSegMode enum in publictypes.h)");
00663 INT_VAR_H(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY,
00664 "Which OCR engine(s) to run (Tesseract, Cube, both). Defaults"
00665 " to loading and running only Tesseract (no Cube, no combiner)."
00666 " (Values from OcrEngineMode enum in tesseractclass.h)");
00667 STRING_VAR_H(tessedit_char_blacklist, "",
00668 "Blacklist of chars not to recognize");
00669 STRING_VAR_H(tessedit_char_whitelist, "",
00670 "Whitelist of chars to recognize");
00671 BOOL_VAR_H(tessedit_ambigs_training, false,
00672 "Perform training for ambiguities");
00673 INT_VAR_H(pageseg_devanagari_split_strategy,
00674 tesseract::ShiroRekhaSplitter::NO_SPLIT,
00675 "Whether to use the top-line splitting process for Devanagari "
00676 "documents while performing page-segmentation.");
00677 INT_VAR_H(ocr_devanagari_split_strategy,
00678 tesseract::ShiroRekhaSplitter::NO_SPLIT,
00679 "Whether to use the top-line splitting process for Devanagari "
00680 "documents while performing ocr.");
00681 STRING_VAR_H(tessedit_write_params_to_file, "",
00682 "Write all parameters to the given file.");
00683 BOOL_VAR_H(tessedit_adapt_to_char_fragments, true,
00684 "Adapt to words that contain "
00685 " a character composed form fragments");
00686 BOOL_VAR_H(tessedit_adaption_debug, false,
00687 "Generate and print debug information for adaption");
00688 INT_VAR_H(bidi_debug, 0, "Debug level for BiDi");
00689 INT_VAR_H(applybox_debug, 1, "Debug level");
00690 INT_VAR_H(applybox_page, 0, "Page number to apply boxes from");
00691 STRING_VAR_H(applybox_exposure_pattern, ".exp",
00692 "Exposure value follows this pattern in the image"
00693 " filename. The name of the image files are expected"
00694 " to be in the form [lang].[fontname].exp[num].tif");
00695 BOOL_VAR_H(applybox_learn_chars_and_char_frags_mode, false,
00696 "Learn both character fragments (as is done in the"
00697 " special low exposure mode) as well as unfragmented"
00698 " characters.");
00699 BOOL_VAR_H(applybox_learn_ngrams_mode, false,
00700 "Each bounding box is assumed to contain ngrams. Only"
00701 " learn the ngrams whose outlines overlap horizontally.");
00702 BOOL_VAR_H(tessedit_display_outwords, false, "Draw output words");
00703 BOOL_VAR_H(tessedit_training_tess, false, "Call Tess to learn blobs");
00704 BOOL_VAR_H(tessedit_dump_choices, false, "Dump char choices");
00705 BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true,
00706 "Try to improve fuzzy spaces");
00707 BOOL_VAR_H(tessedit_unrej_any_wd, false,
00708 "Dont bother with word plausibility");
00709 BOOL_VAR_H(tessedit_fix_hyphens, true, "Crunch double hyphens?");
00710 BOOL_VAR_H(tessedit_redo_xheight, true, "Check/Correct x-height");
00711 BOOL_VAR_H(tessedit_enable_doc_dict, true,
00712 "Add words to the document dictionary");
00713 BOOL_VAR_H(tessedit_debug_fonts, false, "Output font info per char");
00714 BOOL_VAR_H(tessedit_debug_block_rejection, false, "Block and Row stats");
00715 BOOL_VAR_H(tessedit_enable_bigram_correction, false,
00716 "Enable correction based on the word bigram dictionary.");
00717 INT_VAR_H(tessedit_bigram_debug, 0, "Amount of debug output for bigram "
00718 "correction.");
00719 INT_VAR_H(debug_x_ht_level, 0, "Reestimate debug");
00720 BOOL_VAR_H(debug_acceptable_wds, false, "Dump word pass/fail chk");
00721 STRING_VAR_H(chs_leading_punct, "('`\"", "Leading punctuation");
00722 STRING_VAR_H(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation");
00723 STRING_VAR_H(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation");
00724 double_VAR_H(quality_rej_pc, 0.08, "good_quality_doc lte rejection limit");
00725 double_VAR_H(quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit");
00726 double_VAR_H(quality_outline_pc, 1.0,
00727 "good_quality_doc lte outline error limit");
00728 double_VAR_H(quality_char_pc, 0.95, "good_quality_doc gte good char limit");
00729 INT_VAR_H(quality_min_initial_alphas_reqd, 2, "alphas in a good word");
00730 BOOL_VAR_H(tessedit_tess_adapt_to_rejmap, false,
00731 "Use reject map to control Tesseract adaption");
00732 INT_VAR_H(tessedit_tess_adaption_mode, 0x27,
00733 "Adaptation decision algorithm for tess");
00734 BOOL_VAR_H(tessedit_minimal_rej_pass1, false,
00735 "Do minimal rejection on pass 1 output");
00736 BOOL_VAR_H(tessedit_test_adaption, false, "Test adaption criteria");
00737 BOOL_VAR_H(tessedit_matcher_log, false, "Log matcher activity");
00738 INT_VAR_H(tessedit_test_adaption_mode, 3,
00739 "Adaptation decision algorithm for tess");
00740 BOOL_VAR_H(save_blob_choices, false,
00741 "Save the results of the recognition step"
00742 " (blob_choices) within the corresponding WERD_CHOICE");
00743 BOOL_VAR_H(test_pt, false, "Test for point");
00744 double_VAR_H(test_pt_x, 99999.99, "xcoord");
00745 double_VAR_H(test_pt_y, 99999.99, "ycoord");
00746 INT_VAR_H(paragraph_debug_level, 0, "Print paragraph debug info.");
00747 INT_VAR_H(cube_debug_level, 1, "Print cube debug info.");
00748 STRING_VAR_H(outlines_odd, "%| ", "Non standard number of outlines");
00749 STRING_VAR_H(outlines_2, "ij!?%\":;", "Non standard number of outlines");
00750 BOOL_VAR_H(docqual_excuse_outline_errs, false,
00751 "Allow outline errs in unrejection?");
00752 BOOL_VAR_H(tessedit_good_quality_unrej, true,
00753 "Reduce rejection on good docs");
00754 BOOL_VAR_H(tessedit_use_reject_spaces, true, "Reject spaces?");
00755 double_VAR_H(tessedit_reject_doc_percent, 65.00,
00756 "%rej allowed before rej whole doc");
00757 double_VAR_H(tessedit_reject_block_percent, 45.00,
00758 "%rej allowed before rej whole block");
00759 double_VAR_H(tessedit_reject_row_percent, 40.00,
00760 "%rej allowed before rej whole row");
00761 double_VAR_H(tessedit_whole_wd_rej_row_percent, 70.00,
00762 "Number of row rejects in whole word rejects"
00763 "which prevents whole row rejection");
00764 BOOL_VAR_H(tessedit_preserve_blk_rej_perfect_wds, true,
00765 "Only rej partially rejected words in block rejection");
00766 BOOL_VAR_H(tessedit_preserve_row_rej_perfect_wds, true,
00767 "Only rej partially rejected words in row rejection");
00768 BOOL_VAR_H(tessedit_dont_blkrej_good_wds, false,
00769 "Use word segmentation quality metric");
00770 BOOL_VAR_H(tessedit_dont_rowrej_good_wds, false,
00771 "Use word segmentation quality metric");
00772 INT_VAR_H(tessedit_preserve_min_wd_len, 2,
00773 "Only preserve wds longer than this");
00774 BOOL_VAR_H(tessedit_row_rej_good_docs, true,
00775 "Apply row rejection to good docs");
00776 double_VAR_H(tessedit_good_doc_still_rowrej_wd, 1.1,
00777 "rej good doc wd if more than this fraction rejected");
00778 BOOL_VAR_H(tessedit_reject_bad_qual_wds, true,
00779 "Reject all bad quality wds");
00780 BOOL_VAR_H(tessedit_debug_doc_rejection, false, "Page stats");
00781 BOOL_VAR_H(tessedit_debug_quality_metrics, false,
00782 "Output data to debug file");
00783 BOOL_VAR_H(bland_unrej, false, "unrej potential with no chekcs");
00784 double_VAR_H(quality_rowrej_pc, 1.1,
00785 "good_quality_doc gte good char limit");
00786 BOOL_VAR_H(unlv_tilde_crunching, true,
00787 "Mark v.bad words for tilde crunch");
00788 BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?");
00789 BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?");
00790 double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this");
00791 BOOL_VAR_H(crunch_terrible_garbage, true, "As it says");
00792 double_VAR_H(crunch_poor_garbage_cert, -9.0,
00793 "crunch garbage cert lt this");
00794 double_VAR_H(crunch_poor_garbage_rate, 60, "crunch garbage rating lt this");
00795 double_VAR_H(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this");
00796 double_VAR_H(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this");
00797 BOOL_VAR_H(crunch_pot_garbage, true, "POTENTIAL crunch garbage");
00798 double_VAR_H(crunch_del_rating, 60, "POTENTIAL crunch rating lt this");
00799 double_VAR_H(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
00800 double_VAR_H(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
00801 double_VAR_H(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
00802 double_VAR_H(crunch_del_min_width, 3.0, "Del if word width lt xht x this");
00803 double_VAR_H(crunch_del_high_word, 1.5,
00804 "Del if word gt xht x this above bl");
00805 double_VAR_H(crunch_del_low_word, 0.5, "Del if word gt xht x this below bl");
00806 double_VAR_H(crunch_small_outlines_size, 0.6, "Small if lt xht x this");
00807 INT_VAR_H(crunch_rating_max, 10, "For adj length in rating per ch");
00808 INT_VAR_H(crunch_pot_indicators, 1, "How many potential indicators needed");
00809 BOOL_VAR_H(crunch_leave_ok_strings, true, "Dont touch sensible strings");
00810 BOOL_VAR_H(crunch_accept_ok, true, "Use acceptability in okstring");
00811 BOOL_VAR_H(crunch_leave_accept_strings, false,
00812 "Dont pot crunch sensible strings");
00813 BOOL_VAR_H(crunch_include_numerals, false, "Fiddle alpha figures");
00814 INT_VAR_H(crunch_leave_lc_strings, 4,
00815 "Dont crunch words with long lower case strings");
00816 INT_VAR_H(crunch_leave_uc_strings, 4,
00817 "Dont crunch words with long lower case strings");
00818 INT_VAR_H(crunch_long_repetitions, 3, "Crunch words with long repetitions");
00819 INT_VAR_H(crunch_debug, 0, "As it says");
00820 INT_VAR_H(fixsp_non_noise_limit, 1,
00821 "How many non-noise blbs either side?");
00822 double_VAR_H(fixsp_small_outlines_size, 0.28, "Small if lt xht x this");
00823 BOOL_VAR_H(tessedit_prefer_joined_punct, false, "Reward punctation joins");
00824 INT_VAR_H(fixsp_done_mode, 1, "What constitues done for spacing");
00825 INT_VAR_H(debug_fix_space_level, 0, "Contextual fixspace debug");
00826 STRING_VAR_H(numeric_punctuation, ".,",
00827 "Punct. chs expected WITHIN numbers");
00828 INT_VAR_H(x_ht_acceptance_tolerance, 8,
00829 "Max allowed deviation of blob top outside of font data");
00830 INT_VAR_H(x_ht_min_change, 8, "Min change in xht before actually trying it");
00831 BOOL_VAR_H(tessedit_write_block_separators, false,
00832 "Write block separators in output");
00833 BOOL_VAR_H(tessedit_write_rep_codes, false,
00834 "Write repetition char code");
00835 BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
00836 BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
00837 STRING_VAR_H(unrecognised_char, "|",
00838 "Output char for unidentified blobs");
00839 INT_VAR_H(suspect_level, 99, "Suspect marker level");
00840 INT_VAR_H(suspect_space_level, 100,
00841 "Min suspect level for rejecting spaces");
00842 INT_VAR_H(suspect_short_words, 2,
00843 "Dont Suspect dict wds longer than this");
00844 BOOL_VAR_H(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected");
00845 double_VAR_H(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit");
00846 double_VAR_H(suspect_accept_rating, -999.9, "Accept good rating limit");
00847 BOOL_VAR_H(tessedit_minimal_rejection, false, "Only reject tess failures");
00848 BOOL_VAR_H(tessedit_zero_rejection, false, "Dont reject ANYTHING");
00849 BOOL_VAR_H(tessedit_word_for_word, false,
00850 "Make output have exactly one word per WERD");
00851 BOOL_VAR_H(tessedit_zero_kelvin_rejection, false,
00852 "Dont reject ANYTHING AT ALL");
00853 BOOL_VAR_H(tessedit_consistent_reps, true, "Force all rep chars the same");
00854 INT_VAR_H(tessedit_reject_mode, 0, "Rejection algorithm");
00855 INT_VAR_H(tessedit_ok_mode, 5, "Acceptance decision algorithm");
00856 BOOL_VAR_H(tessedit_rejection_debug, false, "Adaption debug");
00857 BOOL_VAR_H(tessedit_flip_0O, true, "Contextual 0O O0 flips");
00858 double_VAR_H(tessedit_lower_flip_hyphen, 1.5,
00859 "Aspect ratio dot/hyphen test");
00860 double_VAR_H(tessedit_upper_flip_hyphen, 1.8,
00861 "Aspect ratio dot/hyphen test");
00862 BOOL_VAR_H(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector");
00863 BOOL_VAR_H(rej_1Il_use_dict_word, false, "Use dictword test");
00864 BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Dont double check");
00865 BOOL_VAR_H(rej_use_tess_accepted, true, "Individual rejection control");
00866 BOOL_VAR_H(rej_use_tess_blanks, true, "Individual rejection control");
00867 BOOL_VAR_H(rej_use_good_perm, true, "Individual rejection control");
00868 BOOL_VAR_H(rej_use_sensible_wd, false, "Extend permuter check");
00869 BOOL_VAR_H(rej_alphas_in_number_perm, false, "Extend permuter check");
00870 double_VAR_H(rej_whole_of_mostly_reject_word_fract, 0.85, "if >this fract");
00871 INT_VAR_H(tessedit_image_border, 2, "Rej blbs near image edge limit");
00872 STRING_VAR_H(ok_repeated_ch_non_alphanum_wds, "-?*\075",
00873 "Allow NN to unrej");
00874 STRING_VAR_H(conflict_set_I_l_1, "Il1[]", "Il1 conflict set");
00875 INT_VAR_H(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this");
00876 BOOL_VAR_H(tessedit_create_boxfile, false, "Output text with boxes");
00877 INT_VAR_H(tessedit_page_number, -1,
00878 "-1 -> All pages, else specifc page to process");
00879 BOOL_VAR_H(tessedit_write_images, false, "Capture the image from the IPE");
00880 BOOL_VAR_H(interactive_display_mode, false, "Run interactively?");
00881 STRING_VAR_H(file_type, ".tif", "Filename extension");
00882 BOOL_VAR_H(tessedit_override_permuter, true, "According to dict_word");
00883 INT_VAR_H(tessdata_manager_debug_level, 0,
00884 "Debug level for TessdataManager functions.");
00885 STRING_VAR_H(tessedit_load_sublangs, "",
00886 "List of languages to load with this one");
00887
00888
00889 double_VAR_H(min_orientation_margin, 7.0,
00890 "Min acceptable orientation margin");
00891 BOOL_VAR_H(textord_tabfind_show_vlines, false, "Debug line finding");
00892 BOOL_VAR_H(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model");
00893 BOOL_VAR_H(tessedit_init_config_only, false,
00894 "Only initialize with the config file. Useful if the instance is "
00895 "not going to be used for OCR but say only for layout analysis.");
00896 BOOL_VAR_H(textord_equation_detect, false, "Turn on equation detector");
00897
00899 FILE *init_recog_training(const STRING &fname);
00900 void recog_training_segmented(const STRING &fname,
00901 PAGE_RES *page_res,
00902 volatile ETEXT_DESC *monitor,
00903 FILE *output_file);
00904 void ambigs_classify_and_output(WERD_RES *werd_res,
00905 ROW_RES *row_res,
00906 BLOCK_RES *block_res,
00907 const char *label,
00908 FILE *output_file);
00909
00910 inline CubeRecoContext *GetCubeRecoContext() { return cube_cntxt_; }
00911
00912 private:
00913
00914
00915
00916 const char* backup_config_file_;
00917
00918 STRING word_config_;
00919
00920
00921 Pix* pix_binary_;
00922
00923 Pix* cube_binary_;
00924
00925 Pix* pix_grey_;
00926
00927
00928 int source_resolution_;
00929
00930
00931 ShiroRekhaSplitter splitter_;
00932
00933 Textord textord_;
00934
00935 bool right_to_left_;
00936 Pix* scaled_color_;
00937 int scaled_factor_;
00938 FCOORD deskew_;
00939 FCOORD reskew_;
00940 TesseractStats stats_;
00941
00942 GenericVector<Tesseract*> sub_langs_;
00943
00944
00945 Tesseract* most_recently_used_;
00946
00947 int font_table_size_;
00948
00949 CubeRecoContext* cube_cntxt_;
00950 TesseractCubeCombiner *tess_cube_combiner_;
00951
00952 EquationDetect* equ_detect_;
00953 };
00954
00955 }
00956
00957
00958 #endif // TESSERACT_CCMAIN_TESSERACTCLASS_H__