00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00018
00019 #ifndef TESSERACT_WORDREC_WORDREC_H__
00020 #define TESSERACT_WORDREC_WORDREC_H__
00021
#include <cstddef>
#include <functional>

#include "associate.h"
#include "classify.h"
#include "dict.h"
#include "language_model.h"
#include "ratngs.h"
#include "matrix.h"
#include "matchtab.h"
#include "oldheap.h"
#include "gradechop.h"
#include "seam.h"
#include "states.h"
#include "findseam.h"
#include "callcpp.h"
00035
// Forward declarations. Within this header these types are only ever named
// through pointers, so their complete definitions are not needed here.
struct CHUNKS_RECORD;
struct SEARCH_RECORD;
class WERD_RES;
00039
00040
00041
00042 struct SEG_SEARCH_PENDING : public ELIST_LINK {
00043 SEG_SEARCH_PENDING(int child_row_arg,
00044 BLOB_CHOICE_LIST *parent_arg,
00045 tesseract::LanguageModelFlagsType changed_arg) :
00046 child_row(child_row_arg), parent(parent_arg), changed(changed_arg) {}
00047
00048
00049 static int compare(const void *p1, const void *p2) {
00050 const SEG_SEARCH_PENDING *e1 = *reinterpret_cast<
00051 const SEG_SEARCH_PENDING * const *>(p1);
00052 const SEG_SEARCH_PENDING *e2 = *reinterpret_cast<
00053 const SEG_SEARCH_PENDING * const *>(p2);
00054 if (e1->child_row == e2->child_row &&
00055 e1->parent == e2->parent) return 0;
00056 return (e1->child_row < e2->child_row) ? -1 : 1;
00057 }
00058
00059 int child_row;
00060 BLOB_CHOICE_LIST *parent;
00061
00062
00063
00064
00065
00066 tesseract::LanguageModelFlagsType changed;
00067 };
00068
00069 ELISTIZEH(SEG_SEARCH_PENDING);
00070
00071
00072 namespace tesseract {
00073
00074
00075 class FRAGMENT:public ELIST_LINK
00076 {
00077 public:
00078 FRAGMENT() {
00079 }
00080 FRAGMENT(EDGEPT *head_pt,
00081 EDGEPT *tail_pt);
00082
00083 ICOORD head;
00084 ICOORD tail;
00085 EDGEPT *headpt;
00086 EDGEPT *tailpt;
00087 };
00088 ELISTIZEH(FRAGMENT)
00089
00090
00091 class Wordrec : public Classify {
00092 public:
00093
00094 BOOL_VAR_H(merge_fragments_in_matrix, TRUE,
00095 "Merge the fragments in the ratings matrix and delete them "
00096 "after merging");
00097 BOOL_VAR_H(wordrec_no_block, FALSE, "Don't output block information");
00098 BOOL_VAR_H(wordrec_enable_assoc, TRUE, "Associator Enable");
00099 BOOL_VAR_H(force_word_assoc, FALSE,
00100 "force associator to run regardless of what enable_assoc is."
00101 "This is used for CJK where component grouping is necessary.");
00102 INT_VAR_H(wordrec_num_seg_states, 30, "Segmentation states");
00103 double_VAR_H(wordrec_worst_state, 1, "Worst segmentation state");
00104 BOOL_VAR_H(fragments_guide_chopper, FALSE,
00105 "Use information from fragments to guide chopping process");
00106 INT_VAR_H(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped");
00107 double_VAR_H(tessedit_certainty_threshold, -2.25, "Good blob limit");
00108 INT_VAR_H(chop_debug, 0, "Chop debug");
00109 BOOL_VAR_H(chop_enable, 1, "Chop enable");
00110 BOOL_VAR_H(chop_vertical_creep, 0, "Vertical creep");
00111 INT_VAR_H(chop_split_length, 10000, "Split Length");
00112 INT_VAR_H(chop_same_distance, 2, "Same distance");
00113 INT_VAR_H(chop_min_outline_points, 6, "Min Number of Points on Outline");
00114 INT_VAR_H(chop_inside_angle, -50, "Min Inside Angle Bend");
00115 INT_VAR_H(chop_min_outline_area, 2000, "Min Outline Area");
00116 double_VAR_H(chop_split_dist_knob, 0.5, "Split length adjustment");
00117 double_VAR_H(chop_overlap_knob, 0.9, "Split overlap adjustment");
00118 double_VAR_H(chop_center_knob, 0.15, "Split center adjustment");
00119 double_VAR_H(chop_sharpness_knob, 0.06, "Split sharpness adjustment");
00120 double_VAR_H(chop_width_change_knob, 5.0, "Width change adjustment");
00121 double_VAR_H(chop_ok_split, 100.0, "OK split limit");
00122 double_VAR_H(chop_good_split, 50.0, "Good split limit");
00123 INT_VAR_H(chop_x_y_weight, 3, "X / Y length weight");
00124 INT_VAR_H(segment_adjust_debug, 0, "Segmentation adjustment debug");
00125 BOOL_VAR_H(assume_fixed_pitch_char_segment, FALSE,
00126 "include fixed-pitch heuristics in char segmentation");
00127 BOOL_VAR_H(use_new_state_cost, FALSE,
00128 "use new state cost heuristics for segmentation state evaluation");
00129 double_VAR_H(heuristic_segcost_rating_base, 1.25,
00130 "base factor for adding segmentation cost into word rating."
00131 "It's a multiplying factor, the larger the value above 1, "
00132 "the bigger the effect of segmentation cost.");
00133 double_VAR_H(heuristic_weight_rating, 1,
00134 "weight associated with char rating in combined cost of state");
00135 double_VAR_H(heuristic_weight_width, 0,
00136 "weight associated with width evidence in combined cost of state");
00137 double_VAR_H(heuristic_weight_seamcut, 0,
00138 "weight associated with seam cut in combined cost of state");
00139 double_VAR_H(heuristic_max_char_wh_ratio, 2.0,
00140 "max char width-to-height ratio allowed in segmentation");
00141 INT_VAR_H(wordrec_debug_level, 0, "Debug level for wordrec");
00142 BOOL_VAR_H(wordrec_debug_blamer, false, "Print blamer debug messages");
00143 BOOL_VAR_H(wordrec_run_blamer, false, "Try to set the blame for errors");
00144 BOOL_VAR_H(enable_new_segsearch, false,
00145 "Enable new segmentation search path.");
00146 INT_VAR_H(segsearch_debug_level, 0, "SegSearch debug level");
00147 INT_VAR_H(segsearch_max_pain_points, 2000,
00148 "Maximum number of pain points stored in the queue");
00149 INT_VAR_H(segsearch_max_futile_classifications, 10,
00150 "Maximum number of pain point classifications per word.");
00151 double_VAR_H(segsearch_max_char_wh_ratio, 2.0,
00152 "Maximum character width-to-height ratio");
00153 double_VAR_H(segsearch_max_fixed_pitch_char_wh_ratio, 2.0,
00154 "Maximum character width-to-height ratio for"
00155 "fixed pitch fonts");
00156 BOOL_VAR_H(save_alt_choices, false,
00157 "Save alternative paths found during chopping "
00158 "and segmentation search");
00159
00160
00161 Wordrec();
00162 virtual ~Wordrec();
00163
00164 void CopyCharChoices(const BLOB_CHOICE_LIST_VECTOR &from,
00165 BLOB_CHOICE_LIST_VECTOR *to);
00166
00167
00168 bool ChoiceIsCorrect(const UNICHARSET& uni_set,
00169 const WERD_CHOICE *choice,
00170 const GenericVector<STRING> &truth_text);
00171
00172
00173
00174
00175
00176
00177
00178 void SaveAltChoices(const LIST &best_choices, WERD_RES *word);
00179
00180
00181
00182 void FillLattice(const MATRIX &ratings, const LIST &best_choices,
00183 const UNICHARSET &unicharset, BlamerBundle *blamer_bundle);
00184
00185
00186
00187 void CallFillLattice(const MATRIX &ratings, const LIST &best_choices,
00188 const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) {
00189 (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle);
00190 }
00191
00192
00193 void program_editup(const char *textbase,
00194 bool init_classifier,
00195 bool init_permute);
00196 BLOB_CHOICE_LIST_VECTOR *cc_recog(WERD_RES *word);
00197 void program_editdown(inT32 elasped_time);
00198 void set_pass1();
00199 void set_pass2();
00200 int end_recog();
00201 BLOB_CHOICE_LIST *call_matcher(const DENORM* denorm, TBLOB* blob);
00202 int dict_word(const WERD_CHOICE &word);
00203
00204 BLOB_CHOICE_LIST *classify_blob(TBLOB *blob,
00205 const DENORM& denorm,
00206 const char *string,
00207 C_COL color,
00208 BlamerBundle *blamer_bundle);
00209 BLOB_CHOICE_LIST *fake_classify_blob(UNICHAR_ID class_id,
00210 float rating, float certainty);
00211 void update_blob_classifications(TWERD *word,
00212 const BLOB_CHOICE_LIST_VECTOR &choices);
00213
00214
00215 BLOB_CHOICE_LIST_VECTOR *evaluate_chunks(CHUNKS_RECORD *chunks_record,
00216 SEARCH_STATE search_state,
00217 BlamerBundle *blamer_bundle);
00218 void update_ratings(const BLOB_CHOICE_LIST_VECTOR &new_choices,
00219 const CHUNKS_RECORD *chunks_record,
00220 const SEARCH_STATE search_state);
00221 inT16 evaluate_state(CHUNKS_RECORD *chunks_record,
00222 SEARCH_RECORD *the_search,
00223 DANGERR *fixpt,
00224 BlamerBundle *blamer_bundle);
00225 SEARCH_RECORD *new_search(CHUNKS_RECORD *chunks_record,
00226 int num_joints,
00227 BLOB_CHOICE_LIST_VECTOR *best_char_choices,
00228 WERD_CHOICE *best_choice,
00229 WERD_CHOICE *raw_choice,
00230 STATE *state);
00231 void best_first_search(CHUNKS_RECORD *chunks_record,
00232 BLOB_CHOICE_LIST_VECTOR *best_char_choices,
00233 WERD_RES *word,
00234 STATE *state,
00235 DANGERR *fixpt,
00236 STATE *best_state);
00237 void delete_search(SEARCH_RECORD *the_search);
00238 void expand_node(FLOAT32 worst_priority,
00239 CHUNKS_RECORD *chunks_record,
00240 SEARCH_RECORD *the_search);
00241 void replace_char_widths(CHUNKS_RECORD *chunks_record,
00242 SEARCH_STATE state);
00243
00244
00245 BLOB_CHOICE_LIST_VECTOR *rebuild_current_state(
00246 WERD_RES *word,
00247 STATE *state,
00248 BLOB_CHOICE_LIST_VECTOR *char_choices,
00249 MATRIX *ratings);
00250
00251
00252
00253
00254
00255
00256
00257 BLOB_CHOICE* rebuild_fragments(
00258 const char* unichar,
00259 const char* expanded_fragment_lengths,
00260 int choice_index,
00261 BLOB_CHOICE_LIST_VECTOR *old_choices);
00262
00263
00264
00265 BLOB_CHOICE_LIST *join_blobs_and_classify(
00266 WERD_RES* word, int x, int y, int choice_index, MATRIX *ratings,
00267 BLOB_CHOICE_LIST_VECTOR *old_choices);
00268 STATE *pop_queue(HEAP *queue);
00269 void push_queue(HEAP *queue, STATE *state, FLOAT32 worst_priority,
00270 FLOAT32 priority, bool debug);
00271
00272
00273
00274
00275
00276
00277
00278
00279
00280
00281
00282
00283
00284
00285
00286
00287
00288
00289
00290
00291
00292
00293
00294
00295
00296
00297
00298
00299
00300
00301
00302
00303
00304
00305
00306
00307
00308
00309
00310
00311
00312
00313
00314
00315
00316
00317 void SegSearch(CHUNKS_RECORD *chunks_record,
00318 WERD_CHOICE *best_choice,
00319 BLOB_CHOICE_LIST_VECTOR *best_char_choices,
00320 WERD_CHOICE *raw_choice,
00321 STATE *output_best_state,
00322 BlamerBundle *blamer_bundle);
00323
00324
00325 PRIORITY point_priority(EDGEPT *point);
00326 void add_point_to_list(POINT_GROUP point_list, EDGEPT *point);
00327 int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3);
00328 int is_little_chunk(EDGEPT *point1, EDGEPT *point2);
00329 int is_small_area(EDGEPT *point1, EDGEPT *point2);
00330 EDGEPT *pick_close_point(EDGEPT *critical_point,
00331 EDGEPT *vertical_point,
00332 int *best_dist);
00333 void prioritize_points(TESSLINE *outline, POINT_GROUP points);
00334 void new_min_point(EDGEPT *local_min, POINT_GROUP points);
00335 void new_max_point(EDGEPT *local_max, POINT_GROUP points);
00336 void vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point,
00337 EDGEPT** best_point,
00338 EDGEPT_CLIST *new_points);
00339
00340
00341 SEAM *attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number,
00342 bool italic_blob, SEAMS seam_list);
00343 SEAM *chop_numbered_blob(TWERD *word, inT32 blob_number,
00344 bool italic_blob, SEAMS seam_list);
00345 SEAM *chop_overlapping_blob(const GenericVector<TBOX>& boxes,
00346 WERD_RES *word_res, inT32 *blob_number,
00347 bool italic_blob, SEAMS seam_list);
00348 bool improve_one_blob(WERD_RES *word_res,
00349 BLOB_CHOICE_LIST_VECTOR *char_choices,
00350 inT32 *blob_number,
00351 SEAMS *seam_list,
00352 DANGERR *fixpt,
00353 bool split_next_to_fragment,
00354 BlamerBundle *blamer_bundle);
00355 void modify_blob_choice(BLOB_CHOICE_LIST *answer,
00356 int chop_index);
00357 bool chop_one_blob(TWERD *word,
00358 BLOB_CHOICE_LIST_VECTOR *char_choices,
00359 inT32 *blob_number,
00360 SEAMS *seam_list,
00361 int *right_chop_index);
00362 bool chop_one_blob2(const GenericVector<TBOX>& boxes,
00363 WERD_RES *word_res, SEAMS *seam_list);
00364 BLOB_CHOICE_LIST_VECTOR *chop_word_main(WERD_RES *word);
00365 void improve_by_chopping(WERD_RES *word,
00366 BLOB_CHOICE_LIST_VECTOR *char_choices,
00367 STATE *best_state,
00368 BLOB_CHOICE_LIST_VECTOR *best_char_choices,
00369 DANGERR *fixpt,
00370 bool *updated_best_choice);
00371 MATRIX *word_associator(bool only_create_ratings_matrtix,
00372 WERD_RES *word,
00373 STATE *state,
00374 BLOB_CHOICE_LIST_VECTOR *best_char_choices,
00375 DANGERR *fixpt,
00376 STATE *best_state);
00377 inT16 select_blob_to_split(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00378 float rating_ceiling,
00379 bool split_next_to_fragment);
00380 void set_chopper_blame(WERD_RES *word);
00381
00382
00383 void junk_worst_seam(SEAM_QUEUE seams, SEAM *new_seam, float new_priority);
00384 void choose_best_seam(SEAM_QUEUE seam_queue,
00385 SEAM_PILE *seam_pile,
00386 SPLIT *split,
00387 PRIORITY priority,
00388 SEAM **seam_result,
00389 TBLOB *blob);
00390 void combine_seam(SEAM_QUEUE seam_queue, SEAM_PILE seam_pile, SEAM *seam);
00391 inT16 constrained_split(SPLIT *split, TBLOB *blob);
00392 void delete_seam_pile(SEAM_PILE seam_pile);
00393 SEAM *pick_good_seam(TBLOB *blob);
00394 PRIORITY seam_priority(SEAM *seam, inT16 xmin, inT16 xmax);
00395 void try_point_pairs (EDGEPT * points[MAX_NUM_POINTS],
00396 inT16 num_points,
00397 SEAM_QUEUE seam_queue,
00398 SEAM_PILE * seam_pile, SEAM ** seam, TBLOB * blob);
00399 void try_vertical_splits(EDGEPT * points[MAX_NUM_POINTS],
00400 inT16 num_points,
00401 EDGEPT_CLIST *new_points,
00402 SEAM_QUEUE seam_queue,
00403 SEAM_PILE * seam_pile, SEAM ** seam, TBLOB * blob);
00404
00405
00406 PRIORITY full_split_priority(SPLIT *split, inT16 xmin, inT16 xmax);
00407 PRIORITY grade_center_of_blob(register BOUNDS_RECT rect);
00408 PRIORITY grade_overlap(register BOUNDS_RECT rect);
00409 PRIORITY grade_split_length(register SPLIT *split);
00410 PRIORITY grade_sharpness(register SPLIT *split);
00411 PRIORITY grade_width_change(register BOUNDS_RECT rect);
00412 void set_outline_bounds(register EDGEPT *point1,
00413 register EDGEPT *point2,
00414 BOUNDS_RECT rect);
00415
00416
00417 int crosses_outline(EDGEPT *p0, EDGEPT *p1, EDGEPT *outline);
00418 int is_crossed(TPOINT a0, TPOINT a1, TPOINT b0, TPOINT b1);
00419 int is_same_edgept(EDGEPT *p1, EDGEPT *p2);
00420 bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1,
00421 EDGEPT **near_pt);
00422 void reverse_outline(EDGEPT *outline);
00423
00424
00425 virtual BLOB_CHOICE_LIST *classify_piece(TBLOB *pieces,
00426 const DENORM& denorm,
00427 SEAMS seams,
00428 inT16 start,
00429 inT16 end,
00430 BlamerBundle *blamer_bundle);
00431
00432
00433 void merge_fragments(MATRIX *ratings,
00434 inT16 num_blobs);
00435
00436
00437
00438
00439
00440
00441
00442
00443 void get_fragment_lists(inT16 current_frag,
00444 inT16 current_row,
00445 inT16 start,
00446 inT16 num_frag_parts,
00447 inT16 num_blobs,
00448 MATRIX *ratings,
00449 BLOB_CHOICE_LIST *choice_lists);
00450
00451
00452 void merge_and_put_fragment_lists(inT16 row,
00453 inT16 column,
00454 inT16 num_frag_parts,
00455 BLOB_CHOICE_LIST *choice_lists,
00456 MATRIX *ratings);
00457
00458
00459
00460
00461
00462
00463 void fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices,
00464 int fragment_pos,
00465 int num_frag_parts,
00466 BLOB_CHOICE_LIST *filtered_choices);
00467 BLOB_CHOICE_LIST *get_piece_rating(MATRIX *ratings,
00468 TBLOB *blobs,
00469 const DENORM& denorm,
00470 SEAMS seams,
00471 inT16 start,
00472 inT16 end,
00473 BlamerBundle *blamer_bundle);
00474
00475 TBOX *record_blob_bounds(TBLOB *blobs);
00476 MATRIX *record_piece_ratings(TBLOB *blobs);
00477
00478
00479 WIDTH_RECORD* state_char_widths(WIDTH_RECORD *chunk_widths,
00480 STATE *state,
00481 int num_joints);
00482 FLOAT32 get_width_variance(WIDTH_RECORD *wrec, float norm_height);
00483 FLOAT32 get_gap_variance(WIDTH_RECORD *wrec, float norm_height);
00484 FLOAT32 prioritize_state(CHUNKS_RECORD *chunks_record,
00485 SEARCH_RECORD *the_search);
00486 FLOAT32 width_priority(CHUNKS_RECORD *chunks_record,
00487 STATE *state,
00488 int num_joints);
00489 FLOAT32 seamcut_priority(SEAMS seams,
00490 STATE *state,
00491 int num_joints);
00492 FLOAT32 rating_priority(CHUNKS_RECORD *chunks_record,
00493 STATE *state,
00494 int num_joints);
00495
00496
00497
00498 LanguageModel *language_model_;
00499 PRIORITY pass2_ok_split;
00500 int pass2_seg_states;
00501 int num_joints;
00502 int num_pushed;
00503 int num_popped;
00504 BlobMatchTable blob_match_table;
00505 EVALUATION_ARRAY last_segmentation;
00506
00507
00508
00509 WERD_CHOICE *prev_word_best_choice_;
00510
00511 GenericVector<int> blame_reasons_;
00512
00513 void (Wordrec::*fill_lattice_)(const MATRIX &ratings,
00514 const LIST &best_choices,
00515 const UNICHARSET &unicharset,
00516 BlamerBundle *blamer_bundle);
00517
00518 protected:
00519 inline bool SegSearchDone(int num_futile_classifications) {
00520 return (language_model_->AcceptableChoiceFound() ||
00521 num_futile_classifications >=
00522 segsearch_max_futile_classifications);
00523 }
00524
00525
00526
00527
00528
00529
00530
00531
00532
00533
00534
00535
00536
00537
00538
00539
00540
00541
00542
00543
00544
00545
00546
00547
00548
00549
00550 void UpdateSegSearchNodes(int starting_col,
00551 SEG_SEARCH_PENDING_LIST *pending[],
00552 BestPathByColumn *best_path_by_column[],
00553 CHUNKS_RECORD *chunks_record,
00554 HEAP *pain_points,
00555 BestChoiceBundle *best_choice_bundle,
00556 BlamerBundle *blamer_bundle);
00557
00558
00559
00560 void ProcessSegSearchPainPoint(float pain_point_priority,
00561 const MATRIX_COORD &pain_point,
00562 const WERD_CHOICE *best_choice,
00563 SEG_SEARCH_PENDING_LIST *pending[],
00564 CHUNKS_RECORD *chunks_record,
00565 HEAP *pain_points,
00566 BlamerBundle *blamer_bundle);
00567
00568
00569
00570
00571 void InitBlamerForSegSearch(const WERD_CHOICE *best_choice,
00572 CHUNKS_RECORD *chunks_record,
00573 HEAP *pain_points,
00574 BlamerBundle *blamer_bundle,
00575 STRING *blamer_debug);
00576
00577
00578 void FinishBlamerForSegSearch(const WERD_CHOICE *best_choice,
00579 BlamerBundle *blamer_bundle,
00580 STRING *blamer_debug);
00581
00582 };
00583
00584
00585 }
00586
00587 #endif // TESSERACT_WORDREC_WORDREC_H__