00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #ifndef PERMUTE_H
00026 #define PERMUTE_H
00027
00028
00029
00030
00031
00032 #include "ratngs.h"
00033 #include "params.h"
00034 #include "unicharset.h"
00035
// Upper bound on the number of positions a permutation may hold; it sizes
// the fixed perm_state_ buffer inside tesseract::PermuterState.
// Deliberately left as a preprocessor macro so that any code testing it with
// #if/#ifdef keeps working.
#define MAX_PERM_LENGTH 128
00037
00038
00039
00040
00041 extern INT_VAR_H(fragments_debug, 0, "Debug character fragments");
00042 extern INT_VAR_H(segment_debug, 0, "Debug the whole segmentation process");
00043 extern BOOL_VAR_H(permute_debug, 0, "char permutation debug");
00044
00045 extern BOOL_VAR_H(permute_script_word, 0,
00046 "Turn on word script consistency permuter");
00047
00048 extern BOOL_VAR_H(permute_fixed_length_dawg, 0,
00049 "Turn on fixed-length phrasebook search permuter");
00050
00051 extern BOOL_VAR_H(segment_segcost_rating, 0,
00052 "incorporate segmentation cost in word rating?");
00053
00054 extern double_VAR_H(segment_reward_script, 0.95,
00055 "Score multipler for script consistency within a word. "
00056 "Being a 'reward' factor, it should be <= 1. "
00057 "Smaller value implies bigger reward.");
00058
00059 extern BOOL_VAR_H(permute_chartype_word, 0,
00060 "Turn on character type (property) consistency permuter");
00061 extern double_VAR_H(segment_reward_chartype, 0.97,
00062 "Score multipler for char type consistency within a word. ");
00063
00064 extern double_VAR_H(segment_reward_ngram_best_choice, 0.99,
00065 "Score multipler for ngram permuter's best choice"
00066 " (only used in the Han script path).");
00067
00068 extern INT_VAR_H(max_permuter_attempts, 100000,
00069 "Maximum number of different character choices to consider"
00070 " during permutation. This limit is especially useful when"
00071 " user patterns are specified, since overly generic patterns"
00072 " can result in dawg search exploring an overly large number"
00073 "of options.");
00074
// Plain global int, declared here and defined in another translation unit.
// NOTE(review): judging by the name, non-zero restricts permutation to the
// top choice only -- confirm at the definition.
extern int permute_only_top;
00076
00077
00078
00079
// Declaration only; the implementation is outside this header.
//   word, word_lengths      - read-only C strings.
//   rating                  - input rating, passed by value.
//   new_rating,
//   adjust_factor           - non-const pointers; presumably outputs written
//                             by the function (TODO confirm in the definition).
void adjust_non_word(const char *word,
                     const char *word_lengths,
                     float rating,
                     float *new_rating,
                     float *adjust_factor);
00082
// Declaration only; the implementation is outside this header.
// Takes six read-only C strings -- three candidate characters plus the
// previous, next and next-next characters -- and returns a read-only C string.
// NOTE(review): the name suggests it disambiguates 'i' / 'l' / '1' using the
// neighbouring characters; confirm against the definition.
const char* choose_il1(const char *first_char, const char *second_char,
                       const char *third_char, const char *prev_char,
                       const char *next_char, const char *next_next_char);
00089
00090 namespace tesseract {
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113 class PermuterState {
00114 public:
00115 PermuterState();
00116
00117 void Init(const BLOB_CHOICE_LIST_VECTOR& char_choices,
00118 const UNICHARSET &unicharset,
00119 float default_bias,
00120 bool debug);
00121
00122 void AddPreference(int start_pos, char* pos_str, float weight);
00123
00124 void AddPreference(int char_pos, BLOB_CHOICE* blob_choice, float weight);
00125
00126 WERD_CHOICE* GetPermutedWord(float *certainties, float *adjust_factor);
00127
00128 void set_allow_collision(bool flag) { allow_collision_ = flag; }
00129 void set_adjust_factor(float factor) { adjust_factor_ = factor; }
00130 void set_debug(bool debug) { debug_ = debug; }
00131 bool position_marked(int pos) { return perm_state_[pos] != kPosFree; }
00132
00133 private:
00134 static const char kPosFree = '.';
00135
00136 const UNICHARSET *unicharset_;
00137
00138 const BLOB_CHOICE_LIST_VECTOR *char_choices_;
00139
00140 char perm_state_[MAX_PERM_LENGTH];
00141
00142 int word_length_;
00143 bool allow_collision_;
00144 float adjust_factor_;
00145 bool debug_;
00146 };
00147
00148 }
00149
00150 #endif