1 : /** @file unicode.h
2 : * @brief Unicode and UTF-8 related classes and functions.
3 : */
4 : /* Copyright (C) 2006,2007,2008 Olly Betts
5 : *
6 : * This program is free software; you can redistribute it and/or modify
7 : * it under the terms of the GNU General Public License as published by
8 : * the Free Software Foundation; either version 2 of the License, or
9 : * (at your option) any later version.
10 : *
11 : * This program is distributed in the hope that it will be useful,
12 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : * GNU General Public License for more details.
15 : *
16 : * You should have received a copy of the GNU General Public License
17 : * along with this program; if not, write to the Free Software
18 : * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 : */
20 :
21 : #ifndef XAPIAN_INCLUDED_UNICODE_H
22 : #define XAPIAN_INCLUDED_UNICODE_H
23 :
#include <xapian/visibility.h>

#include <cstddef>
#include <iterator>
#include <string>
27 :
28 : namespace Xapian {
29 :
30 : /** An iterator which returns unicode character values from a UTF-8 encoded
31 : * string.
32 : */
33 : class XAPIAN_VISIBILITY_DEFAULT Utf8Iterator {
34 : const unsigned char *p;
35 : const unsigned char *end;
36 : mutable unsigned seqlen;
37 :
38 : void calculate_sequence_length() const;
39 :
40 : unsigned get_char() const;
41 :
42 : Utf8Iterator(const unsigned char *p_, const unsigned char *end_, unsigned seqlen_)
43 : : p(p_), end(end_), seqlen(seqlen_) { }
44 :
45 : public:
46 : /** Return the raw const char * pointer for the current position. */
47 : const char * raw() const {
48 : return reinterpret_cast<const char *>(p ? p : end);
49 : }
50 :
51 : /** Return the number of bytes left in the iterator's buffer. */
52 : size_t left() const { return p ? end - p : 0; }
53 :
54 : /** Assign a new string to the iterator.
55 : *
56 : * The iterator will forget the string it was iterating through, and
57 : * return characters from the start of the new string when next called.
58 : * The string is not copied into the iterator, so it must remain valid
59 : * while the iteration is in progress.
60 : *
61 : * @param p A pointer to the start of the string to read.
62 : *
63 : * @param len The length of the string to read.
64 : */
65 5208 : void assign(const char *p_, size_t len) {
66 5208 : if (len) {
67 5208 : p = reinterpret_cast<const unsigned char*>(p_);
68 5208 : end = p + len;
69 5208 : seqlen = 0;
70 : } else {
71 0 : p = NULL;
72 : }
73 5208 : }
74 :
75 : /** Assign a new string to the iterator.
76 : *
77 : * The iterator will forget the string it was iterating through, and
78 : * return characters from the start of the new string when next called.
79 : * The string is not copied into the iterator, so it must remain valid
80 : * while the iteration is in progress.
81 : *
82 : * @param s The string to read. Must not be modified while the iteration
83 : * is in progress.
84 : */
85 : void assign(const std::string &s) { assign(s.data(), s.size()); }
86 :
87 : /** Create an iterator given a pointer to a null terminated string.
88 : *
89 : * The iterator will return characters from the start of the string when
90 : * next called. The string is not copied into the iterator, so it must
91 : * remain valid while the iteration is in progress.
92 : *
93 : * @param p A pointer to the start of the null terminated string to read.
94 : */
95 : explicit Utf8Iterator(const char *p_);
96 :
97 : /** Create an iterator given a pointer and a length.
98 : *
99 : * The iterator will return characters from the start of the string when
100 : * next called. The string is not copied into the iterator, so it must
101 : * remain valid while the iteration is in progress.
102 : *
103 : * @param p A pointer to the start of the string to read.
104 : *
105 : * @param len The length of the string to read.
106 : */
107 : Utf8Iterator(const char *p_, size_t len) { assign(p_, len); }
108 :
109 : /** Create an iterator given a string.
110 : *
111 : * The iterator will return characters from the start of the string when
112 : * next called. The string is not copied into the iterator, so it must
113 : * remain valid while the iteration is in progress.
114 : *
115 : * @param s The string to read. Must not be modified while the iteration
116 : * is in progress.
117 : */
118 5208 : Utf8Iterator(const std::string &s) { assign(s.data(), s.size()); }
119 :
120 : /** Create an iterator which is at the end of its iteration.
121 : *
122 : * This can be compared to another iterator to check if the other iterator
123 : * has reached its end.
124 : */
125 : Utf8Iterator() : p(NULL), end(0), seqlen(0) { }
126 :
127 : /** Get the current unicode character value pointed to by the iterator.
128 : *
129 : * Returns unsigned(-1) if the iterator has reached the end of its buffer.
130 : */
131 : unsigned operator*() const;
132 :
133 : /** Move forward to the next unicode character.
134 : *
135 : * @return An iterator pointing to the position before the move.
136 : */
137 : Utf8Iterator operator++(int) {
138 : // If we've not calculated seqlen yet, do so.
139 : if (seqlen == 0) calculate_sequence_length();
140 : const unsigned char *old_p = p;
141 : unsigned old_seqlen = seqlen;
142 : p += seqlen;
143 : if (p == end) p = NULL;
144 : seqlen = 0;
145 : return Utf8Iterator(old_p, end, old_seqlen);
146 : }
147 :
148 : /** Move forward to the next unicode character.
149 : *
150 : * @return A reference to this object.
151 : */
152 : Utf8Iterator & operator++() {
153 : if (seqlen == 0) calculate_sequence_length();
154 : p += seqlen;
155 : if (p == end) p = NULL;
156 : seqlen = 0;
157 : return *this;
158 : }
159 :
160 : /** Test two Utf8Iterators for equality.
161 : *
162 : * @return true iff the iterators point to the same position.
163 : */
164 : bool operator==(const Utf8Iterator &other) const { return p == other.p; }
165 :
166 : /** Test two Utf8Iterators for inequality.
167 : *
168 : * @return true iff the iterators do not point to the same position.
169 : */
170 : bool operator!=(const Utf8Iterator &other) const { return p != other.p; }
171 :
172 : /// We implement the semantics of an STL input_iterator.
173 : //@{
174 : typedef std::input_iterator_tag iterator_category;
175 : typedef unsigned value_type;
176 : typedef size_t difference_type;
177 : typedef const unsigned * pointer;
178 : typedef const unsigned & reference;
179 : //@}
180 : };
181 :
182 : namespace Unicode {
183 :
/** Each Unicode character is in one of these categories.
 *
 *  The numeric values are significant: Internal::get_category() recovers a
 *  category from the low five bits of a character's info word, and
 *  is_wordchar() / is_whitespace() use the values as bit positions in an
 *  unsigned mask.  New entries must therefore keep the existing numbering.
 *
 *  The two-letter codes in the comments are the Unicode General_Category
 *  abbreviations the names appear to correspond to (inferred from the names;
 *  confirm against the table generator).
 */
typedef enum {
    UNASSIGNED = 0,                  // Cn
    UPPERCASE_LETTER = 1,            // Lu
    LOWERCASE_LETTER = 2,            // Ll
    TITLECASE_LETTER = 3,            // Lt
    MODIFIER_LETTER = 4,             // Lm
    OTHER_LETTER = 5,                // Lo
    NON_SPACING_MARK = 6,            // Mn
    ENCLOSING_MARK = 7,              // Me
    COMBINING_SPACING_MARK = 8,      // Mc
    DECIMAL_DIGIT_NUMBER = 9,        // Nd
    LETTER_NUMBER = 10,              // Nl
    OTHER_NUMBER = 11,               // No
    SPACE_SEPARATOR = 12,            // Zs
    LINE_SEPARATOR = 13,             // Zl
    PARAGRAPH_SEPARATOR = 14,        // Zp
    CONTROL = 15,                    // Cc
    FORMAT = 16,                     // Cf
    PRIVATE_USE = 17,                // Co
    SURROGATE = 18,                  // Cs
    CONNECTOR_PUNCTUATION = 19,      // Pc
    DASH_PUNCTUATION = 20,           // Pd
    OPEN_PUNCTUATION = 21,           // Ps
    CLOSE_PUNCTUATION = 22,          // Pe
    INITIAL_QUOTE_PUNCTUATION = 23,  // Pi
    FINAL_QUOTE_PUNCTUATION = 24,    // Pf
    OTHER_PUNCTUATION = 25,          // Po
    MATH_SYMBOL = 26,                // Sm
    CURRENCY_SYMBOL = 27,            // Sc
    MODIFIER_SYMBOL = 28,            // Sk
    OTHER_SYMBOL = 29                // So
} category;
217 :
218 : namespace Internal {
/** @internal Extract the information about a character from the Unicode
 *  character tables.
 *
 *  ch must be a valid Unicode character value (i.e. < 0x110000)
 *
 *  The returned int is a packed value which the other helpers in this
 *  namespace take apart:
 *   - get_category() reads bits 0-4 (info & 0x1f);
 *   - get_case_type() reads bits 5-7 ((info & 0xe0) >> 5);
 *   - get_delta() reads bit 15 upwards as a signed quantity.
 */
XAPIAN_VISIBILITY_DEFAULT
int get_character_info(unsigned ch);
226 :
/** @internal Extract the case-conversion type of a Unicode character from
 *  its info word.
 *
 *  The type is the three-bit field held in bits 5-7 of @a info; the result is
 *  therefore always in the range 0-7.
 */
inline int get_case_type(int info) {
    // Mask before shifting so the shifted value is never negative.
    const int case_bits = info & 0xe0;
    return case_bits >> 5;
}
231 :
232 : /// @internal Extract the category of a unicode character from its info.
233 : inline category get_category(int info) { return static_cast<category>(info & 0x1f); }
234 :
/** @internal Extract the delta to use for case conversion of a character
 *  from its info word.
 *
 *  The delta is @a info shifted right by 15 bits with sign extension, i.e.
 *  floor(info / 32768).
 */
inline int get_delta(int info) {
    /* Whether right-shifting a negative signed int sign-extends is
     * implementation defined, so negative values are handled by shifting the
     * (non-negative) complement and complementing the result.  Hopefully the
     * compiler spots this and emits a single sign-extending shift where the
     * architecture has one.
     */
    if (info < 0) {
        const int complement = ~info;
        return ~(complement >> 15);
    }
    return info >> 15;
}
246 : }
247 :
/** Convert a single non-ASCII Unicode character to UTF-8.
 *
 *  This is intended mainly as a helper function for to_utf8().
 *
 *  The character @a ch (which must be >= 128) is written to the buffer @a buf
 *  and the length of the resultant UTF-8 character is returned.
 *
 *  NB buf must have space for (at least) 4 bytes.
 */
257 : XAPIAN_VISIBILITY_DEFAULT
258 : unsigned nonascii_to_utf8(unsigned ch, char * buf);
259 :
260 : /** Convert a single unicode character to UTF-8.
261 : *
262 : * The character @a ch is written to the buffer @a buf and the length of the
263 : * resultant UTF-8 character is returned.
264 : *
265 : * NB buf must have space for (at least) 4 bytes.
266 : */
267 : inline unsigned to_utf8(unsigned ch, char *buf) {
268 : if (ch < 128) {
269 : *buf = static_cast<unsigned char>(ch);
270 : return 1;
271 : }
272 : return Xapian::Unicode::nonascii_to_utf8(ch, buf);
273 : }
274 :
275 : /** Append the UTF-8 representation of a single unicode character to a
276 : * std::string.
277 : */
278 : inline void append_utf8(std::string &s, unsigned ch) {
279 : char buf[4];
280 : s.append(buf, to_utf8(ch, buf));
281 : }
282 :
283 : /// Return the category which a given unicode character falls into.
284 : inline category get_category(unsigned ch) {
285 : // Categorise non-Unicode values as UNASSIGNED.
286 : if (ch >= 0x110000) return Xapian::Unicode::UNASSIGNED;
287 : return Internal::get_category(Internal::get_character_info(ch));
288 : }
289 :
290 : /// Test is a given unicode character is a letter or number.
291 : inline bool is_wordchar(unsigned ch) {
292 : const unsigned int WORDCHAR_MASK =
293 : (1 << Xapian::Unicode::UPPERCASE_LETTER) |
294 : (1 << Xapian::Unicode::LOWERCASE_LETTER) |
295 : (1 << Xapian::Unicode::TITLECASE_LETTER) |
296 : (1 << Xapian::Unicode::MODIFIER_LETTER) |
297 : (1 << Xapian::Unicode::OTHER_LETTER) |
298 : (1 << Xapian::Unicode::DECIMAL_DIGIT_NUMBER) |
299 : (1 << Xapian::Unicode::LETTER_NUMBER) |
300 : (1 << Xapian::Unicode::OTHER_NUMBER) |
301 : (1 << Xapian::Unicode::CONNECTOR_PUNCTUATION);
302 : return ((WORDCHAR_MASK >> get_category(ch)) & 1);
303 : }
304 :
305 : /// Test is a given unicode character is a whitespace character.
306 : inline bool is_whitespace(unsigned ch) {
307 : const unsigned int WHITESPACE_MASK =
308 : (1 << Xapian::Unicode::CONTROL) | // For TAB, CR, LF, FF.
309 : (1 << Xapian::Unicode::SPACE_SEPARATOR) |
310 : (1 << Xapian::Unicode::LINE_SEPARATOR) |
311 : (1 << Xapian::Unicode::PARAGRAPH_SEPARATOR);
312 : return ((WHITESPACE_MASK >> get_category(ch)) & 1);
313 : }
314 :
315 : /// Test is a given unicode character is a currency symbol.
316 : inline bool is_currency(unsigned ch) {
317 : return (get_category(ch) == Xapian::Unicode::CURRENCY_SYMBOL);
318 : }
319 :
320 : /// Convert a unicode character to lowercase.
321 : inline unsigned tolower(unsigned ch) {
322 : int info;
323 : // Leave non-Unicode values unchanged.
324 : if (ch >= 0x110000 || !(Internal::get_case_type((info = Xapian::Unicode::Internal::get_character_info(ch))) & 2))
325 : return ch;
326 : return ch + Internal::get_delta(info);
327 : }
328 :
329 : /// Convert a unicode character to uppercase.
330 : inline unsigned toupper(unsigned ch) {
331 : int info;
332 : // Leave non-Unicode values unchanged.
333 : if (ch >= 0x110000 || !(Internal::get_case_type((info = Xapian::Unicode::Internal::get_character_info(ch))) & 4))
334 : return ch;
335 : return ch - Internal::get_delta(info);
336 : }
337 :
338 : /// Convert a UTF-8 std::string to lowercase.
339 : inline std::string
340 : tolower(const std::string &term)
341 : {
342 : std::string result;
343 : result.reserve(term.size());
344 : for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {
345 : append_utf8(result, tolower(*i));
346 : }
347 : return result;
348 : }
349 :
350 : /// Convert a UTF-8 std::string to uppercase.
351 : inline std::string
352 : toupper(const std::string &term)
353 : {
354 : std::string result;
355 : result.reserve(term.size());
356 : for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {
357 : append_utf8(result, toupper(*i));
358 : }
359 : return result;
360 : }
361 :
362 : }
363 :
364 : }
365 :
366 : #endif // XAPIAN_INCLUDED_UNICODE_H
|