00001 00002 // File: pageiterator.h 00003 // Description: Iterator for tesseract page structure that avoids using 00004 // tesseract internal data structures. 00005 // Author: Ray Smith 00006 // Created: Fri Feb 26 11:01:06 PST 2010 00007 // 00008 // (C) Copyright 2010, Google Inc. 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00018 // 00020 00021 #ifndef TESSERACT_CCMAIN_PAGEITERATOR_H__ 00022 #define TESSERACT_CCMAIN_PAGEITERATOR_H__ 00023 00024 #include "publictypes.h" 00025 00026 class C_BLOB_IT; 00027 class PBLOB_IT; 00028 class PAGE_RES; 00029 class PAGE_RES_IT; 00030 class WERD; 00031 struct Pix; 00032 00033 namespace tesseract { 00034 00035 class Tesseract; 00036 00037 // Class to iterate over tesseract page structure, providing access to all 00038 // levels of the page hierarchy, without including any tesseract headers or 00039 // having to handle any tesseract structures. 00040 // WARNING! This class points to data held within the TessBaseAPI class, and 00041 // therefore can only be used while the TessBaseAPI class still exists and 00042 // has not been subjected to a call of Init, SetImage, Recognize, Clear, End 00043 // DetectOS, or anything else that changes the internal PAGE_RES. 00044 // See apitypes.h for the definition of PageIteratorLevel. 00045 // See also ResultIterator, derived from PageIterator, which adds in the 00046 // ability to access OCR output with text-specific methods. 00047 00048 class PageIterator { 00049 public: 00050 // page_res and tesseract come directly from the BaseAPI. 00051 // The rectangle parameters are copied indirectly from the Thresholder, 00052 // via the BaseAPI. They represent the coordinates of some rectangle in an 00053 // original image (in top-left-origin coordinates) and therefore the top-left 00054 // needs to be added to any output boxes in order to specify coordinates 00055 // in the original image. See TessBaseAPI::SetRectangle. 00056 // The scale and scaled_yres are in case the Thresholder scaled the image 00057 // rectangle prior to thresholding. Any coordinates in tesseract's image 00058 // must be divided by scale before adding (rect_left, rect_top). 00059 // The scaled_yres indicates the effective resolution of the binary image 00060 // that tesseract has been given by the Thresholder. 00061 // After the constructor, Begin has already been called. 00062 PageIterator(PAGE_RES* page_res, Tesseract* tesseract, 00063 int scale, int scaled_yres, 00064 int rect_left, int rect_top, 00065 int rect_width, int rect_height); 00066 virtual ~PageIterator(); 00067 00068 // Page/ResultIterators may be copied! This makes it possible to iterate over 00069 // all the objects at a lower level, while maintaining an iterator to 00070 // objects at a higher level. These constructors DO NOT CALL Begin, so 00071 // iterations will continue from the location of src. 00072 PageIterator(const PageIterator& src); 00073 const PageIterator& operator=(const PageIterator& src); 00074 00075 // Are we positioned at the same location as other? 00076 bool PositionedAtSameWord(const PAGE_RES_IT* other) const; 00077 00078 // ============= Moving around within the page ============. 00079 00080 // Moves the iterator to point to the start of the page to begin an iteration. 00081 virtual void Begin(); 00082 00083 // Moves the iterator to the beginning of the paragraph. 00084 // This class implements this functionality by moving it to the zero indexed 00085 // blob of the first (leftmost) word on the first row of the paragraph. 00086 virtual void RestartParagraph(); 00087 00088 // Return whether this iterator points anywhere in the first textline of a 00089 // paragraph. 00090 bool IsWithinFirstTextlineOfParagraph() const; 00091 00092 // Moves the iterator to the beginning of the text line. 00093 // This class implements this functionality by moving it to the zero indexed 00094 // blob of the first (leftmost) word of the row. 00095 virtual void RestartRow(); 00096 00097 // Moves to the start of the next object at the given level in the 00098 // page hierarchy, and returns false if the end of the page was reached. 00099 // NOTE that RIL_SYMBOL will skip non-text blocks, but all other 00100 // PageIteratorLevel level values will visit each non-text block once. 00101 // Think of non text blocks as containing a single para, with a single line, 00102 // with a single imaginary word. 00103 // Calls to Next with different levels may be freely intermixed. 00104 // This function iterates words in right-to-left scripts correctly, if 00105 // the appropriate language has been loaded into Tesseract. 00106 virtual bool Next(PageIteratorLevel level); 00107 00108 // Returns true if the iterator is at the start of an object at the given 00109 // level. 00110 // 00111 // For instance, suppose an iterator it is pointed to the first symbol of the 00112 // first word of the third line of the second paragraph of the first block in 00113 // a page, then: 00114 // it.IsAtBeginningOf(RIL_BLOCK) = false 00115 // it.IsAtBeginningOf(RIL_PARA) = false 00116 // it.IsAtBeginningOf(RIL_TEXTLINE) = true 00117 // it.IsAtBeginningOf(RIL_WORD) = true 00118 // it.IsAtBeginningOf(RIL_SYMBOL) = true 00119 virtual bool IsAtBeginningOf(PageIteratorLevel level) const; 00120 00121 // Returns whether the iterator is positioned at the last element in a 00122 // given level. (e.g. the last word in a line, the last line in a block) 00123 // 00124 // Here's some two-paragraph example 00125 // text. It starts off innocuously 00126 // enough but quickly turns bizarre. 00127 // The author inserts a cornucopia 00128 // of words to guard against confused 00129 // references. 00130 // 00131 // Now take an iterator it pointed to the start of "bizarre." 00132 // it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false 00133 // it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true 00134 // it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false 00135 virtual bool IsAtFinalElement(PageIteratorLevel level, 00136 PageIteratorLevel element) const; 00137 00138 // Returns whether this iterator is positioned 00139 // before other: -1 00140 // equal to other: 0 00141 // after other: 1 00142 int Cmp(const PageIterator &other) const; 00143 00144 // ============= Accessing data ==============. 00145 // Coordinate system: 00146 // Integer coordinates are at the cracks between the pixels. 00147 // The top-left corner of the top-left pixel in the image is at (0,0). 00148 // The bottom-right corner of the bottom-right pixel in the image is at 00149 // (width, height). 00150 // Every bounding box goes from the top-left of the top-left contained 00151 // pixel to the bottom-right of the bottom-right contained pixel, so 00152 // the bounding box of the single top-left pixel in the image is: 00153 // (0,0)->(1,1). 00154 // If an image rectangle has been set in the API, then returned coordinates 00155 // relate to the original (full) image, rather than the rectangle. 00156 00157 // Returns the bounding rectangle of the current object at the given level. 00158 // See comment on coordinate system above. 00159 // Returns false if there is no such object at the current position. 00160 // The returned bounding box is guaranteed to match the size and position 00161 // of the image returned by GetBinaryImage, but may clip foreground pixels 00162 // from a grey image. The padding argument to GetImage can be used to expand 00163 // the image to include more foreground pixels. See GetImage below. 00164 bool BoundingBox(PageIteratorLevel level, 00165 int* left, int* top, int* right, int* bottom) const; 00166 // Returns the bounding rectangle of the object in a coordinate system of the 00167 // working image rectangle having its origin at (rect_left_, rect_top_) with 00168 // respect to the original image and is scaled by a factor scale_. 00169 bool BoundingBoxInternal(PageIteratorLevel level, 00170 int* left, int* top, int* right, int* bottom) const; 00171 00172 // Returns whether there is no object of a given level. 00173 bool Empty(PageIteratorLevel level) const; 00174 00175 // Returns the type of the current block. See apitypes.h for PolyBlockType. 00176 PolyBlockType BlockType() const; 00177 00178 // Returns a binary image of the current object at the given level. 00179 // The position and size match the return from BoundingBoxInternal, and so 00180 // this could be upscaled with respect to the original input image. 00181 // Use pixDestroy to delete the image after use. 00182 Pix* GetBinaryImage(PageIteratorLevel level) const; 00183 00184 // Returns an image of the current object at the given level in greyscale 00185 // if available in the input. To guarantee a binary image use BinaryImage. 00186 // NOTE that in order to give the best possible image, the bounds are 00187 // expanded slightly over the binary connected component, by the supplied 00188 // padding, so the top-left position of the returned image is returned 00189 // in (left,top). These will most likely not match the coordinates 00190 // returned by BoundingBox. 00191 // Use pixDestroy to delete the image after use. 00192 Pix* GetImage(PageIteratorLevel level, int padding, 00193 int* left, int* top) const; 00194 00195 // Returns the baseline of the current object at the given level. 00196 // The baseline is the line that passes through (x1, y1) and (x2, y2). 00197 // WARNING: with vertical text, baselines may be vertical! 00198 // Returns false if there is no baseline at the current position. 00199 bool Baseline(PageIteratorLevel level, 00200 int* x1, int* y1, int* x2, int* y2) const; 00201 00202 // Returns orientation for the block the iterator points to. 00203 // orientation, writing_direction, textline_order: see publictypes.h 00204 // deskew_angle: after rotating the block so the text orientation is 00205 // upright, how many radians does one have to rotate the 00206 // block anti-clockwise for it to be level? 00207 // -Pi/4 <= deskew_angle <= Pi/4 00208 void Orientation(tesseract::Orientation *orientation, 00209 tesseract::WritingDirection *writing_direction, 00210 tesseract::TextlineOrder *textline_order, 00211 float *deskew_angle) const; 00212 00213 // Returns information about the current paragraph, if available. 00214 // 00215 // justification - 00216 // LEFT if ragged right, or fully justified and script is left-to-right. 00217 // RIGHT if ragged left, or fully justified and script is right-to-left. 00218 // unknown if it looks like source code or we have very few lines. 00219 // is_list_item - 00220 // true if we believe this is a member of an ordered or unordered list. 00221 // is_crown - 00222 // true if the first line of the paragraph is aligned with the other 00223 // lines of the paragraph even though subsequent paragraphs have first 00224 // line indents. This typically indicates that this is the continuation 00225 // of a previous paragraph or that it is the very first paragraph in 00226 // the chapter. 00227 // first_line_indent - 00228 // For LEFT aligned paragraphs, the first text line of paragraphs of 00229 // this kind are indented this many pixels from the left edge of the 00230 // rest of the paragraph. 00231 // for RIGHT aligned paragraphs, the first text line of paragraphs of 00232 // this kind are indented this many pixels from the right edge of the 00233 // rest of the paragraph. 00234 // NOTE 1: This value may be negative. 00235 // NOTE 2: if *is_crown == true, the first line of this paragraph is 00236 // actually flush, and first_line_indent is set to the "common" 00237 // first_line_indent for subsequent paragraphs in this block 00238 // of text. 00239 void ParagraphInfo(tesseract::ParagraphJustification *justification, 00240 bool *is_list_item, 00241 bool *is_crown, 00242 int *first_line_indent) const; 00243 00244 protected: 00245 // Sets up the internal data for iterating the blobs of a new word, then 00246 // moves the iterator to the given offset. 00247 void BeginWord(int offset); 00248 00249 // Pointer to the page_res owned by the API. 00250 PAGE_RES* page_res_; 00251 // Pointer to the Tesseract object owned by the API. 00252 Tesseract* tesseract_; 00253 // The iterator to the page_res_. Owned by this ResultIterator. 00254 // A pointer just to avoid dragging in Tesseract includes. 00255 PAGE_RES_IT* it_; 00256 // The current input WERD being iterated. If there is an output from OCR, 00257 // then word_ is NULL. Owned by the API. 00258 WERD* word_; 00259 // The length of the current word_. 00260 int word_length_; 00261 // The current blob index within the word. 00262 int blob_index_; 00263 // Iterator to the blobs within the word. If NULL, then we are iterating 00264 // OCR results in the box_word. 00265 // Owned by this ResultIterator. 00266 C_BLOB_IT* cblob_it_; 00267 // Parameters saved from the Thresholder. Needed to rebuild coordinates. 00268 int scale_; 00269 int scaled_yres_; 00270 int rect_left_; 00271 int rect_top_; 00272 int rect_width_; 00273 int rect_height_; 00274 }; 00275 00276 } // namespace tesseract. 00277 00278 #endif // TESSERACT_CCMAIN_PAGEITERATOR_H__