00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00019
00020 #ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_
00021 #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
00022
00023 #include <stdio.h>
00024 #include "host.h"
00025 #include "tprintf.h"
00026
// Suffix used for the combined trained-data file.
// NOTE(review): its use is not visible in this header; presumably the file is
// named "<lang>.traineddata" -- confirm against the implementation.
static const char kTrainedDataSuffix[] = "traineddata";

// File-name suffixes of the individual tessdata components. They are gathered,
// in TessdataType order, into the kTessdataFileSuffixes table further down in
// this header.
static const char kLangConfigFileSuffix[] = "config";
static const char kUnicharsetFileSuffix[] = "unicharset";
static const char kAmbigsFileSuffix[] = "unicharambigs";
static const char kBuiltInTemplatesFileSuffix[] = "inttemp";
static const char kBuiltInCutoffsFileSuffix[] = "pffmtable";
static const char kNormProtoFileSuffix[] = "normproto";
static const char kPuncDawgFileSuffix[] = "punc-dawg";
static const char kSystemDawgFileSuffix[] = "word-dawg";
static const char kNumberDawgFileSuffix[] = "number-dawg";
static const char kFreqDawgFileSuffix[] = "freq-dawg";
static const char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs";
static const char kCubeUnicharsetFileSuffix[] = "cube-unicharset";
static const char kCubeSystemDawgFileSuffix[] = "cube-word-dawg";
static const char kShapeTableFileSuffix[] = "shapetable";
static const char kBigramDawgFileSuffix[] = "bigram-dawg";
static const char kUnambigDawgFileSuffix[] = "unambig-dawg";
static const char kParamsTrainingModelFileSuffix[] = "params-training-model";
00048
00049 namespace tesseract {
00050
// Identifies one component of a tessdata file. The numeric value of each
// enumerator is used directly as an array index (kTessdataFileSuffixes and
// TessdataManager::offset_table_ are indexed by it), so the order here must
// stay in step with the parallel tables in this header. The quoted string
// next to each enumerator is the suffix found at that index in
// kTessdataFileSuffixes.
enum TessdataType {
  TESSDATA_LANG_CONFIG,            // 0:  "config"
  TESSDATA_UNICHARSET,             // 1:  "unicharset"
  TESSDATA_AMBIGS,                 // 2:  "unicharambigs"
  TESSDATA_INTTEMP,                // 3:  "inttemp"
  TESSDATA_PFFMTABLE,              // 4:  "pffmtable"
  TESSDATA_NORMPROTO,              // 5:  "normproto"
  TESSDATA_PUNC_DAWG,              // 6:  "punc-dawg"
  TESSDATA_SYSTEM_DAWG,            // 7:  "word-dawg"
  TESSDATA_NUMBER_DAWG,            // 8:  "number-dawg"
  TESSDATA_FREQ_DAWG,              // 9:  "freq-dawg"
  TESSDATA_FIXED_LENGTH_DAWGS,     // 10: "fixed-length-dawgs"
  TESSDATA_CUBE_UNICHARSET,        // 11: "cube-unicharset"
  TESSDATA_CUBE_SYSTEM_DAWG,       // 12: "cube-word-dawg"
  TESSDATA_SHAPE_TABLE,            // 13: "shapetable"
  TESSDATA_BIGRAM_DAWG,            // 14: "bigram-dawg"
  TESSDATA_UNAMBIG_DAWG,           // 15: "unambig-dawg"
  TESSDATA_PARAMS_TRAINING_MODEL,  // 16: "params-training-model"

  // Number of component types above; sizes TessdataManager::offset_table_.
  TESSDATA_NUM_ENTRIES
};
00072
00077 static const char * const kTessdataFileSuffixes[] = {
00078 kLangConfigFileSuffix,
00079 kUnicharsetFileSuffix,
00080 kAmbigsFileSuffix,
00081 kBuiltInTemplatesFileSuffix,
00082 kBuiltInCutoffsFileSuffix,
00083 kNormProtoFileSuffix,
00084 kPuncDawgFileSuffix,
00085 kSystemDawgFileSuffix,
00086 kNumberDawgFileSuffix,
00087 kFreqDawgFileSuffix,
00088 kFixedLengthDawgsFileSuffix,
00089 kCubeUnicharsetFileSuffix,
00090 kCubeSystemDawgFileSuffix,
00091 kShapeTableFileSuffix,
00092 kBigramDawgFileSuffix,
00093 kUnambigDawgFileSuffix,
00094 kParamsTrainingModelFileSuffix,
00095 };
00096
// Per-component flag, indexed by TessdataType: true when the component is
// flagged as a text file, false otherwise. The entries must stay in the same
// order as the TessdataType enumerators.
// NOTE(review): how the flag is consumed is not visible in this header.
static const bool kTessdataFileIsText[] = {
  true,   // 0:  TESSDATA_LANG_CONFIG
  true,   // 1:  TESSDATA_UNICHARSET
  true,   // 2:  TESSDATA_AMBIGS
  false,  // 3:  TESSDATA_INTTEMP
  true,   // 4:  TESSDATA_PFFMTABLE
  true,   // 5:  TESSDATA_NORMPROTO
  false,  // 6:  TESSDATA_PUNC_DAWG
  false,  // 7:  TESSDATA_SYSTEM_DAWG
  false,  // 8:  TESSDATA_NUMBER_DAWG
  false,  // 9:  TESSDATA_FREQ_DAWG
  false,  // 10: TESSDATA_FIXED_LENGTH_DAWGS
  true,   // 11: TESSDATA_CUBE_UNICHARSET
  false,  // 12: TESSDATA_CUBE_SYSTEM_DAWG
  false,  // 13: TESSDATA_SHAPE_TABLE
  false,  // 14: TESSDATA_BIGRAM_DAWG
  false,  // 15: TESSDATA_UNAMBIG_DAWG
  false,  // 16: TESSDATA_PARAMS_TRAINING_MODEL
};
00120
// Upper bound on the number of tessdata entries.
// NOTE(review): not referenced anywhere in this header; presumably used by the
// implementation as a sanity limit on the entry count read from a data file --
// confirm.
static const int kMaxNumTessdataEntries = 1000;
00129
00130
00131 class TessdataManager {
00132 public:
00133 TessdataManager() {
00134 data_file_ = NULL;
00135 actual_tessdata_num_entries_ = 0;
00136 for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
00137 offset_table_[i] = -1;
00138 }
00139 }
00140 ~TessdataManager() {}
00141 int DebugLevel() { return debug_level_; }
00142
00147 bool Init(const char *data_file_name, int debug_level);
00148
00150 inline FILE *GetDataFilePtr() const { return data_file_; }
00151
00157 inline bool SeekToStart(TessdataType tessdata_type) {
00158 if (debug_level_) {
00159 tprintf("TessdataManager: seek to offset %lld - start of tessdata"
00160 "type %d (%s))\n", offset_table_[tessdata_type],
00161 tessdata_type, kTessdataFileSuffixes[tessdata_type]);
00162 }
00163 if (offset_table_[tessdata_type] < 0) {
00164 return false;
00165 } else {
00166 ASSERT_HOST(fseek(data_file_,
00167 static_cast<size_t>(offset_table_[tessdata_type]),
00168 SEEK_SET) == 0);
00169 return true;
00170 }
00171 }
00173 inline inT64 GetEndOffset(TessdataType tessdata_type) const {
00174 int index = tessdata_type + 1;
00175 while (index < actual_tessdata_num_entries_ && offset_table_[index] == -1) {
00176 ++index;
00177 }
00178 if (debug_level_) {
00179 tprintf("TessdataManager: end offset for type %d is %lld\n",
00180 tessdata_type,
00181 (index == actual_tessdata_num_entries_) ? -1
00182 : offset_table_[index]);
00183 }
00184 return (index == actual_tessdata_num_entries_) ? -1 : offset_table_[index] - 1;
00185 }
00187 inline void End() {
00188 if (data_file_ != NULL) {
00189 fclose(data_file_);
00190 data_file_ = NULL;
00191 }
00192 }
00193 bool swap() const {
00194 return swap_;
00195 }
00196
00198 static void WriteMetadata(inT64 *offset_table, FILE *output_file);
00199
00205 static bool CombineDataFiles(const char *language_data_path_prefix,
00206 const char *output_filename);
00207
00213 bool OverwriteComponents(const char *new_traineddata_filename,
00214 char **component_filenames,
00215 int num_new_components);
00216
00227 bool ExtractToFile(const char *filename);
00228
00234 static void CopyFile(FILE *input_file, FILE *output_file,
00235 bool newline_end, inT64 num_bytes_to_copy);
00236
00245 static bool TessdataTypeFromFileSuffix(const char *suffix,
00246 TessdataType *type,
00247 bool *text_file);
00248
00253 static bool TessdataTypeFromFileName(const char *filename,
00254 TessdataType *type,
00255 bool *text_file);
00256
00257 private:
00258
00263 static FILE *GetFilePtr(const char *language_data_path_prefix,
00264 const char *file_suffix, bool text_file);
00265
00270 inT64 offset_table_[TESSDATA_NUM_ENTRIES];
00279 inT32 actual_tessdata_num_entries_;
00280 FILE *data_file_;
00281 int debug_level_;
00282
00283 bool swap_;
00284 };
00285
00286
00287 }
00288
00289 #endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_