00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027 #ifndef DICT_DAWG_H_
00028 #define DICT_DAWG_H_
00029
00030
00031
00032
00033
00034 #include "elst.h"
00035 #include "ratngs.h"
00036 #include "params.h"
00037 #include "tesscallback.h"
00038
00039 #ifndef __GNUC__
00040 #ifdef __MSW32__
00041 #define NO_EDGE (inT64) 0xffffffffffffffffi64
00042 #endif
00043 #else
00044 #define NO_EDGE (inT64) 0xffffffffffffffffll
00045 #endif
00046
00047
00048
00049
00050 class UNICHARSET;
00051
00052 typedef uinT64 EDGE_RECORD;
00053 typedef EDGE_RECORD *EDGE_ARRAY;
00054 typedef inT64 EDGE_REF;
00055 typedef inT64 NODE_REF;
00056 typedef EDGE_REF *NODE_MAP;
00057
00058 namespace tesseract {
00059
00060 struct NodeChild {
00061 UNICHAR_ID unichar_id;
00062 EDGE_REF edge_ref;
00063 NodeChild(UNICHAR_ID id, EDGE_REF ref): unichar_id(id), edge_ref(ref) {}
00064 NodeChild(): unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}
00065 };
00066
00067 typedef GenericVector<NodeChild> NodeChildVector;
00068 typedef GenericVector<int> SuccessorList;
00069 typedef GenericVector<SuccessorList *> SuccessorListsVector;
00070
00071 enum DawgType {
00072 DAWG_TYPE_PUNCTUATION,
00073 DAWG_TYPE_WORD,
00074 DAWG_TYPE_NUMBER,
00075 DAWG_TYPE_PATTERN,
00076
00077 DAWG_TYPE_COUNT
00078 };
00079
00080
00081
00082
00083
00084 #define FORWARD_EDGE (inT32) 0
00085 #define BACKWARD_EDGE (inT32) 1
00086 #define MAX_NODE_EDGES_DISPLAY (inT64) 100
00087 #define MARKER_FLAG (inT64) 1
00088 #define DIRECTION_FLAG (inT64) 2
00089 #define WERD_END_FLAG (inT64) 4
00090 #define LETTER_START_BIT 0
00091 #define NUM_FLAG_BITS 3
00092 #define REFFORMAT "%lld"
00093
00094
00095
00096 static const bool kBeginningDawgsType[] = { 1, 1, 1, 1 };
00097
00098 static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = {
00099 { 0, 1, 1, 0 },
00100 { 1, 0, 0, 0 },
00101 { 1, 0, 0, 0 },
00102 { 0, 0, 0, 0 },
00103 };
00104
00105 static const char kWildcard[] = "*";
00106
00107
00108
00109
00110
00111
00121
00122 class Dawg {
00123 public:
00125 static const inT16 kDawgMagicNumber = 42;
00129 static const UNICHAR_ID kPatternUnicharID = 0;
00130
00131 inline DawgType type() const { return type_; }
00132 inline const STRING &lang() const { return lang_; }
00133 inline PermuterType permuter() const { return perm_; }
00134
00135 virtual ~Dawg() {};
00136
00138 bool word_in_dawg(const WERD_CHOICE &word) const;
00139
00142 int check_for_words(const char *filename,
00143 const UNICHARSET &unicharset,
00144 bool enable_wildcard) const;
00145
00146
00147
00148 void iterate_words(const UNICHARSET &unicharset,
00149 TessCallback1<const char *> *cb) const;
00150
00151
00152
00154 virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
00155 bool word_end) const = 0;
00156
00159 virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const = 0;
00160
00163 virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0;
00164
00167 virtual bool end_of_word(EDGE_REF edge_ref) const = 0;
00168
00170 virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0;
00171
00174 virtual void print_node(NODE_REF node, int max_num_edges) const = 0;
00175
00178 virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id,
00179 const UNICHARSET &unicharset,
00180 GenericVector<UNICHAR_ID> *vec) const {};
00181
00185 virtual EDGE_REF pattern_loop_edge(
00186 EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const {
00187 return false;
00188 }
00189
00190 protected:
00191 Dawg() {}
00192
00194 inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const {
00195 return ((edge_rec & next_node_mask_) >> next_node_start_bit_);
00196 }
00198 inline bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const {
00199 return (edge_rec & (MARKER_FLAG << flag_start_bit_)) != 0;
00200 }
00202 inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const {
00203 return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ?
00204 BACKWARD_EDGE : FORWARD_EDGE;
00205 }
00207 inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const {
00208 return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0;
00209 }
00211 inline UNICHAR_ID unichar_id_from_edge_rec(
00212 const EDGE_RECORD &edge_rec) const {
00213 return ((edge_rec & letter_mask_) >> LETTER_START_BIT);
00214 }
00216 inline void set_next_node_in_edge_rec(
00217 EDGE_RECORD *edge_rec, EDGE_REF value) {
00218 *edge_rec &= (~next_node_mask_);
00219 *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_);
00220 }
00222 inline void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec) {
00223 *edge_rec |= (MARKER_FLAG << flag_start_bit_);
00224 }
00232 inline int given_greater_than_edge_rec(NODE_REF next_node,
00233 bool word_end,
00234 UNICHAR_ID unichar_id,
00235 const EDGE_RECORD &edge_rec) const {
00236 UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec);
00237 NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec);
00238 bool curr_word_end = end_of_word_from_edge_rec(edge_rec);
00239 if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node,
00240 curr_word_end, curr_unichar_id)) return 0;
00241 if (unichar_id > curr_unichar_id) return 1;
00242 if (unichar_id == curr_unichar_id) {
00243 if (next_node > curr_next_node) return 1;
00244 if (next_node == curr_next_node) {
00245 if (word_end > curr_word_end) return 1;
00246 }
00247 }
00248 return -1;
00249 }
00253 inline bool edge_rec_match(NODE_REF next_node,
00254 bool word_end,
00255 UNICHAR_ID unichar_id,
00256 NODE_REF other_next_node,
00257 bool other_word_end,
00258 UNICHAR_ID other_unichar_id) const {
00259 return ((unichar_id == other_unichar_id) &&
00260 (next_node == NO_EDGE || next_node == other_next_node) &&
00261 (!word_end || (word_end == other_word_end)));
00262 }
00263
00266 void init(DawgType type, const STRING &lang,
00267 PermuterType perm, int unicharset_size, int debug_level);
00268
00274 bool match_words(WERD_CHOICE *word, inT32 index,
00275 NODE_REF node, UNICHAR_ID wildcard) const;
00276
00277
00278 void iterate_words_rec(const WERD_CHOICE &word_so_far,
00279 NODE_REF to_explore,
00280 TessCallback1<const char *> *cb) const;
00281
00282
00283 DawgType type_;
00284 STRING lang_;
00286 PermuterType perm_;
00287
00288
00289
00290
00291 int unicharset_size_;
00292 int flag_start_bit_;
00293 int next_node_start_bit_;
00294 uinT64 next_node_mask_;
00295 uinT64 flags_mask_;
00296 uinT64 letter_mask_;
00297
00298 int debug_level_;
00299 };
00300
00301
00304
00305 struct DawgInfo {
00306 DawgInfo() : dawg_index(-1), ref(NO_EDGE) {}
00307 DawgInfo(int i, EDGE_REF r) : dawg_index(i), ref(r) {}
00308 bool operator==(const DawgInfo &other) {
00309 return (this->dawg_index == other.dawg_index && this->ref == other.ref);
00310 }
00311 int dawg_index;
00312 EDGE_REF ref;
00313 };
00314 class DawgInfoVector : public GenericVector<DawgInfo> {
00315 public:
00317 ~DawgInfoVector() {
00318 if (size_reserved_ > 0) {
00319 delete[] data_;
00320 size_used_ = 0;
00321 size_reserved_ = 0;
00322 }
00323 }
00326 void clear() { size_used_ = 0; }
00330 inline bool add_unique(const DawgInfo &new_info, bool debug,
00331 const char *debug_msg) {
00332 for (int i = 0; i < size_used_; ++i) {
00333 if (data_[i] == new_info) return false;
00334 }
00335 push_back(new_info);
00336 if (debug) {
00337 tprintf("%s[%d, " REFFORMAT "]\n", debug_msg,
00338 new_info.dawg_index, new_info.ref);
00339 }
00340 return true;
00341 }
00342 };
00343
00344
00351
00352 class SquishedDawg : public Dawg {
00353 public:
00354 SquishedDawg(FILE *file, DawgType type, const STRING &lang,
00355 PermuterType perm, int debug_level) {
00356 read_squished_dawg(file, type, lang, perm, debug_level);
00357 num_forward_edges_in_node0 = num_forward_edges(0);
00358 }
00359 SquishedDawg(const char* filename, DawgType type,
00360 const STRING &lang, PermuterType perm, int debug_level) {
00361 FILE *file = fopen(filename, "rb");
00362 if (file == NULL) {
00363 tprintf("Failed to open dawg file %s\n", filename);
00364 exit(1);
00365 }
00366 read_squished_dawg(file, type, lang, perm, debug_level);
00367 num_forward_edges_in_node0 = num_forward_edges(0);
00368 fclose(file);
00369 }
00370 SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type,
00371 const STRING &lang, PermuterType perm,
00372 int unicharset_size, int debug_level) :
00373 edges_(edges), num_edges_(num_edges) {
00374 init(type, lang, perm, unicharset_size, debug_level);
00375 num_forward_edges_in_node0 = num_forward_edges(0);
00376 if (debug_level > 3) print_all("SquishedDawg:");
00377 }
00378 ~SquishedDawg();
00379
00380 int NumEdges() { return num_edges_; }
00381
00383 EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
00384 bool word_end) const;
00385
00388 void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const {
00389 EDGE_REF edge = node;
00390 if (!edge_occupied(edge) || edge == NO_EDGE) return;
00391 assert(forward_edge(edge));
00392 do {
00393 vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
00394 } while (!last_edge(edge++));
00395 }
00396
00399 NODE_REF next_node(EDGE_REF edge) const {
00400 return next_node_from_edge_rec((edges_[edge]));
00401 }
00402
00405 bool end_of_word(EDGE_REF edge_ref) const {
00406 return end_of_word_from_edge_rec((edges_[edge_ref]));
00407 }
00408
00410 UNICHAR_ID edge_letter(EDGE_REF edge_ref) const {
00411 return unichar_id_from_edge_rec((edges_[edge_ref]));
00412 }
00413
00416 void print_node(NODE_REF node, int max_num_edges) const;
00417
00419 void write_squished_dawg(FILE *file);
00420
00423 void write_squished_dawg(const char *filename) {
00424 FILE *file = fopen(filename, "wb");
00425 if (file == NULL) {
00426 tprintf("Error opening %s\n", filename);
00427 exit(1);
00428 }
00429 this->write_squished_dawg(file);
00430 fclose(file);
00431 }
00432
00433 private:
00435 inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) {
00436 set_next_node_in_edge_rec(&(edges_[edge_ref]), value);
00437 }
00439 inline void set_empty_edge(EDGE_REF edge_ref) {
00440 (edges_[edge_ref] = next_node_mask_);
00441 }
00443 inline void clear_all_edges() {
00444 for (int edge = 0; edge < num_edges_; edge++) set_empty_edge(edge);
00445 }
00447 inline void clear_marker_flag(EDGE_REF edge_ref) {
00448 (edges_[edge_ref] &= ~(MARKER_FLAG << flag_start_bit_));
00449 }
00451 inline bool forward_edge(EDGE_REF edge_ref) const {
00452 return (edge_occupied(edge_ref) &&
00453 (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
00454 }
00456 inline bool backward_edge(EDGE_REF edge_ref) const {
00457 return (edge_occupied(edge_ref) &&
00458 (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
00459 }
00461 inline bool edge_occupied(EDGE_REF edge_ref) const {
00462 return (edges_[edge_ref] != next_node_mask_);
00463 }
00465 inline bool last_edge(EDGE_REF edge_ref) const {
00466 return (edges_[edge_ref] & (MARKER_FLAG << flag_start_bit_)) != 0;
00467 }
00468
00470 inT32 num_forward_edges(NODE_REF node) const;
00471
00473 void read_squished_dawg(FILE *file, DawgType type, const STRING &lang,
00474 PermuterType perm, int debug_level);
00475
00477 void print_edge(EDGE_REF edge) const;
00478
00480 void print_all(const char* msg) {
00481 tprintf("\n__________________________\n%s\n", msg);
00482 for (int i = 0; i < num_edges_; ++i) print_edge(i);
00483 tprintf("__________________________\n");
00484 }
00486 NODE_MAP build_node_map(inT32 *num_nodes) const;
00487
00488
00489
00490 EDGE_ARRAY edges_;
00491 int num_edges_;
00492 int num_forward_edges_in_node0;
00493 };
00494
00495 }
00496
00497 #endif // DICT_DAWG_H_