ICU 4.8.1.1  4.8.1.1
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
stringtriebuilder.h
1 /*
2 *******************************************************************************
3 * Copyright (C) 2010-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: stringtriebuilder.h
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * created on: 2010dec24
12 * created by: Markus W. Scherer
13 */
14 
15 #ifndef __STRINGTRIEBUILDER_H__
16 #define __STRINGTRIEBUILDER_H__
17 
18 #include "unicode/utypes.h"
19 #include "unicode/uobject.h"
20 
21 // Forward declaration.
22 struct UHashtable;
23 typedef struct UHashtable UHashtable;
24 
29 enum UStringTrieBuildOption {
    // Selector type of the buildOption parameter of StringTrieBuilder::build().
    // NOTE(review): presumably favors a faster build over a smaller result - confirm against the API docs.
34  USTRINGTRIE_BUILD_FAST,
    // NOTE(review): presumably favors a smaller result over build speed - confirm against the API docs.
45  USTRINGTRIE_BUILD_SMALL
46 };
47 
49 
57 public:
59  static UBool hashNode(const void *node);
    // NOTE(review) on hashNode() above: it is declared to return UBool, which the
    // glossary at the end of this listing defines as int8_t. If it is meant to
    // forward a Node's int32_t hashCode(), the narrow return type would truncate
    // the hash - confirm against the definition and the hashtable callback type.
    // equalNodes() below takes two untyped node pointers and reports equality as a UBool.
61  static UBool equalNodes(const void *left, const void *right);
62 
63 protected:
67  virtual ~StringTrieBuilder();
68 
70  void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
72  void deleteCompactBuilder();
73 
75  void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
76 
78  int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
80  int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
81 
82  class Node;
83 
85  Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
87  Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
88  int32_t length, UErrorCode &errorCode);
89 
91  virtual int32_t getElementStringLength(int32_t i) const = 0;
93  virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
95  virtual int32_t getElementValue(int32_t i) const = 0;
96 
97  // Finds the first unit index after this one where
98  // the first and last element have different units again.
100  virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
101 
102  // Number of different units at unitIndex.
104  virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
106  virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
108  virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
109 
111  virtual UBool matchNodesCanHaveValues() const = 0;
112 
114  virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
116  virtual int32_t getMinLinearMatch() const = 0;
118  virtual int32_t getMaxLinearMatchLength() const = 0;
119 
120  // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
122  static const int32_t kMaxBranchLinearSubNodeLength=5;
123 
124  // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
125  // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
127  static const int32_t kMaxSplitBranchLevels=14;
128 
139  Node *registerNode(Node *newNode, UErrorCode &errorCode);
150  Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
151 
152  /*
153  * C++ note:
154  * registerNode() and registerFinalValue() take ownership of their input nodes,
155  * and only return owned nodes.
156  * If they see a failure UErrorCode, they will delete the input node.
157  * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
158  * If there is a failure, they return NULL.
159  *
160  * NULL Node pointers can be safely passed into other Nodes because
161  * they call the static Node::hashCode() which checks for a NULL pointer first.
162  *
163  * Therefore, as long as builder functions register a new node,
164  * they need to check for failures only before explicitly dereferencing
165  * a Node pointer, or before setting a new UErrorCode.
166  */
167 
168  // Hash set of nodes, maps from nodes to integer 1.
170  UHashtable *nodes;
171 
173  class Node : public UObject {
     // Common base class of the builder's node types.
     // Holds a hash code (set at construction; protected, so subclasses can update it)
     // and an offset that starts at 0.
174  public:
     // Stores initialHash as the hash code and sets offset to 0.
175  Node(int32_t initialHash) : hash(initialHash), offset(0) {}
     // Returns the stored hash code.
176  inline int32_t hashCode() const { return hash; }
177  // Handles node==NULL.
     // Returns 0 for a NULL pointer, otherwise node->hashCode().
178  static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
179  // Base class operator==() compares the actual class types.
180  virtual UBool operator==(const Node &other) const;
     // Logical negation of the virtual operator==().
181  inline UBool operator!=(const Node &other) const { return !operator==(other); }
     // NOTE(review): this listing omits the documentation block that preceded
     // markRightEdgesFirst() (listing lines 182-208); consult the original header
     // for the edge-numbering contract.
209  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
210  // write() must set the offset to a positive value.
211  virtual void write(StringTrieBuilder &builder) = 0;
212  // See markRightEdgesFirst.
     // Calls write(builder) only when offset is negative and lies outside the
     // inclusive range [lastRight, firstRight]. Nodes with offset==0 or offset>0
     // are never written by this helper.
213  inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
214  StringTrieBuilder &builder) {
215  // Note: Edge numbers are negative, lastRight<=firstRight.
216  // If offset>0 then this node and its sub-nodes have been written already
217  // and we need not write them again.
218  // If this node is part of the unwritten right branch edge,
219  // then we wait until that is written.
220  if(offset<0 && (offset<lastRight || firstRight<offset)) {
221  write(builder);
222  }
223  }
     // Returns the current offset value.
224  inline int32_t getOffset() const { return offset; }
225  protected:
     // Hash code returned by hashCode().
226  int32_t hash;
     // 0 after construction; write() must set it to a positive value.
227  int32_t offset;
228  private:
229  // No ICU "poor man's RTTI" for this class nor its subclasses.
230  virtual UClassID getDynamicClassID() const;
231  };
232 
233  // This class should not be overridden because
234  // registerFinalValue() compares a stack-allocated FinalValueNode
235  // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
236  // with the input node, and the
237  // !Node::operator==(other) used inside FinalValueNode::operator==(other)
238  // will be false if the typeid's are different.
240  class FinalValueNode : public Node {
     // Node that stores a single int32_t value.
241  public:
     // Seeds the hash with a class-specific constant mixed with v.
     // The constants are unsigned so that the sum wraps modulo 2^32 instead of
     // overflowing a signed int (undefined behavior) for large v; the resulting
     // bit pattern equals the wrapped two's-complement value.
242  FinalValueNode(int32_t v) : Node(0x111111u*37u+v), value(v) {}
243  virtual UBool operator==(const Node &other) const;
244  virtual void write(StringTrieBuilder &builder);
245  protected:
     // The value passed to the constructor.
246  int32_t value;
247  };
248 
250  class ValueNode : public Node {
     // Base class for nodes that may optionally carry a value.
     // Starts without a value (hasValue==FALSE, value==0) until setValue() is called.
     // Does not override Node::write(), so it stays abstract.
251  public:
252  ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
253  virtual UBool operator==(const Node &other) const;
     // Records v as this node's value and folds it into the hash code.
254  void setValue(int32_t v) {
255  hasValue=TRUE;
256  value=v;
     // Multiply in unsigned arithmetic: hash*37 routinely exceeds INT32_MAX, and
     // signed overflow is undefined behavior. Unsigned math wraps modulo 2^32.
257  hash=hash*37u+v;
258  }
259  protected:
     // TRUE once setValue() has been called.
260  UBool hasValue;
     // Meaningful only when hasValue is TRUE.
261  int32_t value;
262  };
263 
265  class IntermediateValueNode : public ValueNode {
     // Class header restored: the listing had dropped this line, leaving the
     // members below without an enclosing class. The name and base class follow
     // from the constructor and its ValueNode(...) base initializer.
     // A ValueNode that always has a value and links to a following node.
266  public:
     // Hash seed: class-specific constant mixed with the next node's hash
     // (hashCode() tolerates a NULL nextNode). Unsigned constants make the
     // arithmetic wrap modulo 2^32 rather than overflow a signed int.
     // setValue(v) then marks the value present and folds v into the hash.
267  IntermediateValueNode(int32_t v, Node *nextNode)
268  : ValueNode(0x222222u*37u+hashCode(nextNode)), next(nextNode) { setValue(v); }
269  virtual UBool operator==(const Node &other) const;
270  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
271  virtual void write(StringTrieBuilder &builder);
272  protected:
     // Node that follows this one; may be NULL.
273  Node *next;
274  };
275 
277  class LinearMatchNode : public ValueNode {
     // ValueNode that records a length and the node that follows it.
     // It does not override Node::write(), so it stays abstract; concrete
     // instances come from the builder's createLinearMatchNode().
278  public:
     // Hash seed: class-specific constant mixed with len and the next node's hash.
     // The original signed expression (0x333333*37+len)*37 always exceeds
     // INT32_MAX, which is undefined behavior; unsigned constants make the
     // computation wrap modulo 2^32 and yield the same bit pattern.
279  LinearMatchNode(int32_t len, Node *nextNode)
280  : ValueNode((0x333333u*37u+len)*37u+hashCode(nextNode)),
281  length(len), next(nextNode) {}
282  virtual UBool operator==(const Node &other) const;
283  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
284  protected:
     // The len value passed to the constructor.
285  int32_t length;
     // Node that follows this one; may be NULL.
286  Node *next;
287  };
288 
290  class BranchNode : public Node {
     // Common base class of ListBranchNode and SplitBranchNode.
     // Does not override Node::write(), so it stays abstract.
291  public:
     // firstEdgeNumber is now initialized to 0; previously it was left
     // indeterminate until some subclass method assigned it, so any earlier
     // read would have been undefined behavior.
292  BranchNode(int32_t initialHash) : Node(initialHash), firstEdgeNumber(0) {}
293  protected:
     // NOTE(review): presumably assigned by the subclasses' markRightEdgesFirst() - confirm in the .cpp.
294  int32_t firstEdgeNumber;
295  };
296 
298  class ListBranchNode : public BranchNode {
     // Branch node holding up to kMaxBranchLinearSubNodeLength (unit, target) entries
     // in parallel arrays. Entries are appended with add(); the arrays are not
     // initialized by the constructor, only the first `length` slots are valid.
299  public:
300  ListBranchNode() : BranchNode(0x444444), length(0) {}
301  virtual UBool operator==(const Node &other) const;
302  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
303  virtual void write(StringTrieBuilder &builder);
304  // Adds a unit with a final value.
     // No bounds check: the caller must not add more than
     // kMaxBranchLinearSubNodeLength entries.
305  void add(int32_t c, int32_t value) {
306  units[length]=(UChar)c;
307  equal[length]=NULL;
308  values[length]=value;
309  ++length;
     // Unsigned multipliers: the signed form overflowed INT32_MAX on the very
     // first call (0x444444*37*37), which is undefined behavior. Unsigned
     // arithmetic wraps modulo 2^32 and gives the same bit pattern.
310  hash=(hash*37u+c)*37u+value;
311  }
312  // Adds a unit which leads to another match node.
     // Same capacity precondition as the overload above.
313  void add(int32_t c, Node *node) {
314  units[length]=(UChar)c;
315  equal[length]=node;
316  values[length]=0;
317  ++length;
     // Unsigned for the same reason as above; hashCode() tolerates node==NULL.
318  hash=(hash*37u+c)*37u+hashCode(node);
319  }
320  protected:
321  Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value".
     // Number of entries added so far.
322  int32_t length;
     // Final value per entry; 0 for entries that lead to another node.
323  int32_t values[kMaxBranchLinearSubNodeLength];
     // Unit per entry, narrowed to UChar by add().
324  UChar units[kMaxBranchLinearSubNodeLength];
325  };
326 
328  class SplitBranchNode : public BranchNode {
     // Branch node with one unit and two child nodes.
     // NOTE(review): by the member names, units below `unit` presumably continue
     // at lessThan and all others at greaterOrEqual - confirm in the .cpp.
329  public:
     // Hash seed: class-specific constant mixed with the unit and both child hashes
     // (hashCode() tolerates NULL children). The signed original overflowed
     // INT32_MAX in the nested *37 multiplications, which is undefined behavior;
     // unsigned constants make every step wrap modulo 2^32.
330  SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
331  : BranchNode(((0x555555u*37u+middleUnit)*37u+
332  hashCode(lessThanNode))*37u+hashCode(greaterOrEqualNode)),
333  unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
334  virtual UBool operator==(const Node &other) const;
335  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
336  virtual void write(StringTrieBuilder &builder);
337  protected:
     // The middleUnit passed to the constructor.
338  UChar unit;
339  Node *lessThan;
340  Node *greaterOrEqual;
341  };
342 
343  // Branch head node, for writing the actual node lead unit.
345  class BranchHeadNode : public ValueNode {
     // ValueNode that records a length and points at a branch sub-node.
346  public:
     // Hash seed: class-specific constant mixed with len and the sub-node's hash.
     // The original signed expression (0x666666*37+len)*37 always exceeds
     // INT32_MAX, which is undefined behavior; unsigned constants make the
     // computation wrap modulo 2^32 and yield the same bit pattern.
347  BranchHeadNode(int32_t len, Node *subNode)
348  : ValueNode((0x666666u*37u+len)*37u+hashCode(subNode)),
349  length(len), next(subNode) {}
350  virtual UBool operator==(const Node &other) const;
351  virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
352  virtual void write(StringTrieBuilder &builder);
353  protected:
     // The len value passed to the constructor.
354  int32_t length;
355  Node *next; // A branch sub-node.
356  };
357 
359  virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
360  Node *nextNode) const = 0;
361 
363  virtual int32_t write(int32_t unit) = 0;
365  virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
367  virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
369  virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
371  virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
372 
373 private:
374  // No ICU "poor man's RTTI" for this class nor its subclasses.
375  virtual UClassID getDynamicClassID() const;
376 };
377 
379 
380 #endif // __STRINGTRIEBUILDER_H__
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:131
#define NULL
Define NULL if necessary, to 0 for C++ and to ((void *)0) for C.
Definition: utypes.h:299
#define TRUE
The TRUE value of a UBool.
Definition: umachine.h:232
UBool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
Definition: stringpiece.h:218
C++ API: Common ICU base class UObject.
uint16_t UChar
Define UChar to be wchar_t if that is 16 bits wide; always assumed to be unsigned.
Definition: umachine.h:325
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:132
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:215
void * UClassID
UClassID is used to identify classes without using RTTI, since RTTI is not yet supported by all C++ compilers.
Definition: utypes.h:385
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:639
Basic definitions for ICU, for both C and C++ APIs.
#define FALSE
The FALSE value of a UBool.
Definition: umachine.h:236
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:520
Base class for string trie builder classes.
virtual UClassID getDynamicClassID() const =0
ICU4C "poor man's RTTI", returns a UClassID for the actual ICU class.
int8_t UBool
The ICU boolean type.
Definition: umachine.h:228