Lucene++ - a full-featured C++ search engine
API Documentation


 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
UTF8Stream.h
Go to the documentation of this file.
1 // Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3 // Distributable under the terms of either the Apache License (Version 2.0)
4 // or the GNU Lesser General Public License.
6 
7 #ifndef UTF8STREAM_H
8 #define UTF8STREAM_H
9 
10 #include "LuceneObject.h"
11 
12 namespace Lucene {
13 
// Abstract base class (readNext() is pure virtual) for the encoder and decoder
// classes declared in this header. It derives from LuceneObject and declares
// surrogate / code-point constants plus masking and validation helpers.
// Only declarations appear here; every definition lives elsewhere.
14 class UTF8Base : public LuceneObject {
15 public:
16  virtual ~UTF8Base();
// NOTE(review): listing line 17 is absent from this rendering - confirm its
// content against the original header.
18 
19 public:
// Constants describing surrogate ranges, offsets and the maximum code point.
// Their values are not given in this header (no in-class initializers).
20  static const uint16_t LEAD_SURROGATE_MIN;
21  static const uint16_t LEAD_SURROGATE_MAX;
22  static const uint16_t TRAIL_SURROGATE_MIN;
23  static const uint16_t TRAIL_SURROGATE_MAX;
24  static const uint16_t LEAD_OFFSET;
25  static const uint32_t SURROGATE_OFFSET;
26  static const uint32_t CODE_POINT_MAX;
27 
// Special wide-character values; presumably the substitute for invalid input
// and the end-of-input marker - verify against the implementation file.
28  static const wchar_t UNICODE_REPLACEMENT_CHAR;
29  static const wchar_t UNICODE_TERMINATOR;
30 
31 protected:
// Supplies the next input unit as a 32-bit value; each concrete subclass in
// this header provides its own override.
32  virtual uint32_t readNext() = 0;
33 
// Narrowing helpers: presumably keep the low 8 / 16 bits of the argument
// (inferred from names and return types - TODO confirm).
34  uint8_t mask8(uint32_t b);
35  uint16_t mask16(uint32_t c);
// Classification predicates. Their exact rules are defined out of line;
// the names suggest UTF-8 trail-byte, surrogate, valid-code-point and
// overlong-encoding checks - TODO confirm against the definitions.
36  bool isTrail(uint32_t b);
37  bool isSurrogate(uint32_t cp);
38  bool isLeadSurrogate(uint32_t cp);
39  bool isTrailSurrogate(uint32_t cp);
40  bool isValidCodePoint(uint32_t cp);
41  bool isOverlongSequence(uint32_t cp, int32_t length);
42 };
43 
// Encoder over a [unicodeBegin, unicodeEnd) range of wchar_t that writes into
// a caller-supplied uint8_t buffer. Only declarations are visible here.
44 class UTF8Encoder : public UTF8Base {
45 public:
// Takes the two pointers delimiting the wide-character input.
46  UTF8Encoder(const wchar_t* unicodeBegin, const wchar_t* unicodeEnd);
47  virtual ~UTF8Encoder();
48 
// NOTE(review): listing line 49 is absent from this rendering - confirm its
// content against the original header.
50 
51 protected:
// Input range pointers (same names as the constructor parameters).
52  const wchar_t* unicodeBegin;
53  const wchar_t* unicodeEnd;
54 
55 public:
// utf8:   destination byte buffer.
// length: presumably the capacity of `utf8` - TODO confirm.
// Returns an int32_t; presumably the number of bytes produced - TODO confirm.
56  int32_t encode(uint8_t* utf8, int32_t length);
57 
// Variants with the same signature as encode(); presumably selected by the
// width of wchar_t (UTF-16 vs UTF-32 input) - verify in the implementation.
58  int32_t utf16to8(uint8_t* utf8, int32_t length);
59  int32_t utf32to8(uint8_t* utf8, int32_t length);
60 
61 protected:
// Overrides the pure virtual UTF8Base::readNext().
62  virtual uint32_t readNext();
63 
// Takes an output position and a code point and returns a uint8_t*;
// presumably the position just past the bytes written - TODO confirm.
64  uint8_t* appendChar(uint8_t* utf8, uint32_t cp);
65 };
66 
// NOTE(review): fragment of class UTF8EncoderStream. Listing lines 67, 69, 72
// and 75 are absent from this rendering, including the class head. The index
// at the end of this page lists `UTF8EncoderStream(const ReaderPtr &reader)`
// and a `ReaderPtr reader` member, which presumably belong in the gaps -
// confirm against the original header.
68 public:
70  virtual ~UTF8EncoderStream();
71 
73 
74 protected:
76 
77 protected:
// Declares a readNext() with the same signature as UTF8Base::readNext();
// presumably an override that pulls input from the reader - TODO confirm.
78  virtual uint32_t readNext();
79 };
80 
// Decoder over a [utf8Begin, utf8End) range of bytes that writes into a
// caller-supplied wchar_t buffer. Only declarations are visible here.
81 class UTF8Decoder : public UTF8Base {
82 public:
// Takes the two pointers delimiting the byte input.
83  UTF8Decoder(const uint8_t* utf8Begin, const uint8_t* utf8End);
84  virtual ~UTF8Decoder();
85 
// NOTE(review): listing line 86 is absent from this rendering - confirm its
// content against the original header.
87 
88 protected:
// Input range pointers (same names as the constructor parameters).
89  const uint8_t* utf8Begin;
90  const uint8_t* utf8End;
91 
92 public:
// unicode: destination wide-character buffer.
// length:  presumably the capacity of `unicode` - TODO confirm.
// Returns an int32_t; presumably the number of characters produced -
// TODO confirm.
93  int32_t decode(wchar_t* unicode, int32_t length);
94 
// Variants with the same signature as decode(); presumably selected by the
// width of wchar_t (UTF-16 vs UTF-32 output) - verify in the implementation.
95  int32_t utf8to16(wchar_t* unicode, int32_t length);
96  int32_t utf8to32(wchar_t* unicode, int32_t length);
97 
98 protected:
// Overrides the pure virtual UTF8Base::readNext().
99  virtual uint32_t readNext();
100 
// Sequence helpers; semantics are defined out of line. Note that getSequence
// and isValidNext take `cp` by non-const reference, so they can modify it.
101  int32_t sequenceLength(uint32_t cp);
102  bool getSequence(uint32_t& cp, int32_t length);
103  bool isValidNext(uint32_t& cp);
104 };
105 
// NOTE(review): fragment of class UTF8DecoderStream. Listing lines 106, 108,
// 111 and 114 are absent from this rendering, including the class head. The
// index at the end of this page lists `UTF8DecoderStream(const ReaderPtr &reader)`
// and a `ReaderPtr reader` member, which presumably belong in the gaps -
// confirm against the original header.
107 public:
109  virtual ~UTF8DecoderStream();
110 
112 
113 protected:
115 
116 protected:
// Declares a readNext() with the same signature as UTF8Base::readNext();
// presumably an override that pulls input from the reader - TODO confirm.
117  virtual uint32_t readNext();
118 };
119 
// Decoder over a [utf16Begin, utf16End) range of uint16_t units that writes
// into a caller-supplied wchar_t buffer. Despite the UTF-16 input, it derives
// from UTF8Base like the other classes in this header.
120 class UTF16Decoder : public UTF8Base {
121 public:
// Takes the two pointers delimiting the 16-bit input.
122  UTF16Decoder(const uint16_t* utf16Begin, const uint16_t* utf16End);
123  virtual ~UTF16Decoder();
124 
// NOTE(review): listing line 125 is absent from this rendering - confirm its
// content against the original header.
126 
127 protected:
// Input range pointers (same names as the constructor parameters).
128  const uint16_t* utf16Begin;
129  const uint16_t* utf16End;
130 
131 public:
// unicode: destination wide-character buffer.
// length:  presumably the capacity of `unicode` - TODO confirm.
// Returns an int32_t; presumably the number of characters produced -
// TODO confirm.
132  int32_t decode(wchar_t* unicode, int32_t length);
133 
// Variants with the same signature as decode(); presumably selected by the
// width of wchar_t (UTF-16 vs UTF-32 output) - verify in the implementation.
134  int32_t utf16to16(wchar_t* unicode, int32_t length);
135  int32_t utf16to32(wchar_t* unicode, int32_t length);
136 
137 protected:
// Overrides the pure virtual UTF8Base::readNext().
138  virtual uint32_t readNext();
139 };
140 
141 }
142 
143 #endif
int32_t utf16to32(wchar_t *unicode, int32_t length)
static const uint16_t TRAIL_SURROGATE_MAX
Definition: UTF8Stream.h:23
bool isTrail(uint32_t b)
int32_t utf16to8(uint8_t *utf8, int32_t length)
UTF16Decoder(const uint16_t *utf16Begin, const uint16_t *utf16End)
virtual uint32_t readNext()
static const wchar_t UNICODE_REPLACEMENT_CHAR
Definition: UTF8Stream.h:28
bool isValidNext(uint32_t &cp)
static const wchar_t UNICODE_TERMINATOR
Definition: UTF8Stream.h:29
boost::shared_ptr< Reader > ReaderPtr
Definition: LuceneTypes.h:547
int32_t utf32to8(uint8_t *utf8, int32_t length)
const uint16_t * utf16Begin
Definition: UTF8Stream.h:125
bool isSurrogate(uint32_t cp)
const uint16_t * utf16End
Definition: UTF8Stream.h:129
bool isTrailSurrogate(uint32_t cp)
bool isValidCodePoint(uint32_t cp)
static const uint16_t LEAD_SURROGATE_MAX
Definition: UTF8Stream.h:21
Definition: UTF8Stream.h:106
UTF8EncoderStream(const ReaderPtr &reader)
UTF8Decoder(const uint8_t *utf8Begin, const uint8_t *utf8End)
const uint8_t * utf8Begin
Definition: UTF8Stream.h:86
virtual uint32_t readNext()=0
const wchar_t * unicodeEnd
Definition: UTF8Stream.h:53
static const uint32_t SURROGATE_OFFSET
Definition: UTF8Stream.h:25
uint16_t mask16(uint32_t c)
ReaderPtr reader
Definition: UTF8Stream.h:72
uint8_t * appendChar(uint8_t *utf8, uint32_t cp)
virtual ~UTF8Encoder()
int32_t decode(wchar_t *unicode, int32_t length)
Definition: UTF8Stream.h:81
static const uint16_t LEAD_SURROGATE_MIN
Definition: UTF8Stream.h:17
int32_t encode(uint8_t *utf8, int32_t length)
Definition: UTF8Stream.h:120
const wchar_t * unicodeBegin
Definition: UTF8Stream.h:49
static const uint16_t TRAIL_SURROGATE_MIN
Definition: UTF8Stream.h:22
Definition: UTF8Stream.h:44
uint8_t mask8(uint32_t b)
Definition: UTF8Stream.h:14
Base class for all Lucene classes.
Definition: LuceneObject.h:31
#define LUCENE_CLASS(Name)
Definition: LuceneObject.h:24
Definition: AbstractAllTermDocs.h:12
ReaderPtr reader
Definition: UTF8Stream.h:111
virtual uint32_t readNext()
int32_t utf16to16(wchar_t *unicode, int32_t length)
virtual ~UTF8Base()
int32_t sequenceLength(uint32_t cp)
UTF8Encoder(const wchar_t *unicodeBegin, const wchar_t *unicodeEnd)
UTF8DecoderStream(const ReaderPtr &reader)
int32_t utf8to16(wchar_t *unicode, int32_t length)
bool isLeadSurrogate(uint32_t cp)
virtual uint32_t readNext()
int32_t decode(wchar_t *unicode, int32_t length)
virtual uint32_t readNext()
virtual ~UTF8Decoder()
static const uint16_t LEAD_OFFSET
Definition: UTF8Stream.h:24
static const uint32_t CODE_POINT_MAX
Definition: UTF8Stream.h:26
bool getSequence(uint32_t &cp, int32_t length)
Definition: UTF8Stream.h:67
int32_t utf8to32(wchar_t *unicode, int32_t length)
const uint8_t * utf8End
Definition: UTF8Stream.h:90
bool isOverlongSequence(uint32_t cp, int32_t length)
virtual uint32_t readNext()

clucene.sourceforge.net