Drizzled Public API Documentation

charset.cc
1 /* Copyright (C) 2000 MySQL AB
2 
3  This program is free software; you can redistribute it and/or modify
4  it under the terms of the GNU General Public License as published by
5  the Free Software Foundation; version 2 of the License.
6 
7  This program is distributed in the hope that it will be useful,
8  but WITHOUT ANY WARRANTY; without even the implied warranty of
9  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10  GNU General Public License for more details.
11 
12  You should have received a copy of the GNU General Public License
13  along with this program; if not, write to the Free Software
14  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
15 
16 #include <config.h>
17 
18 #include <drizzled/charset.h>
19 #include <drizzled/error.h>
20 #include <drizzled/internal/m_string.h>
21 #include <drizzled/configmake.h>
22 #include <vector>
23 
24 #include <drizzled/visibility.h>
25 
26 using namespace std;
27 
28 namespace drizzled {
29 
30 /*
31  Buffers handed out by cs_alloc() are collected in this vector so that free_charsets() can release them.
32 */
33 static vector<unsigned char*> memory_vector;
34 
35 extern charset_info_st my_charset_utf8mb4_icelandic_uca_ci;
36 extern charset_info_st my_charset_utf8mb4_latvian_uca_ci;
37 extern charset_info_st my_charset_utf8mb4_romanian_uca_ci;
38 extern charset_info_st my_charset_utf8mb4_slovenian_uca_ci;
39 extern charset_info_st my_charset_utf8mb4_polish_uca_ci;
40 extern charset_info_st my_charset_utf8mb4_estonian_uca_ci;
41 extern charset_info_st my_charset_utf8mb4_spanish_uca_ci;
42 extern charset_info_st my_charset_utf8mb4_swedish_uca_ci;
43 extern charset_info_st my_charset_utf8mb4_turkish_uca_ci;
44 extern charset_info_st my_charset_utf8mb4_czech_uca_ci;
45 extern charset_info_st my_charset_utf8mb4_danish_uca_ci;
46 extern charset_info_st my_charset_utf8mb4_lithuanian_uca_ci;
47 extern charset_info_st my_charset_utf8mb4_slovak_uca_ci;
48 extern charset_info_st my_charset_utf8mb4_spanish2_uca_ci;
49 extern charset_info_st my_charset_utf8mb4_roman_uca_ci;
50 extern charset_info_st my_charset_utf8mb4_persian_uca_ci;
51 extern charset_info_st my_charset_utf8mb4_esperanto_uca_ci;
52 extern charset_info_st my_charset_utf8mb4_hungarian_uca_ci;
53 extern charset_info_st my_charset_utf8mb4_sinhala_uca_ci;
54 
55 /*
56  The code below implements this functionality:
57 
58  - Initializing charset related structures
59  - Loading dynamic charsets
60  - Searching for a proper charset_info_st
61  using charset name, collation name or collation ID
62  - Setting server default character set
63 */
64 
65 bool my_charset_same(const charset_info_st *cs1, const charset_info_st *cs2)
66 {
67  return cs1 == cs2 || not strcmp(cs1->csname, cs2->csname);
68 }
69 
70 static uint get_collation_number_internal(const char *name)
71 {
72  for (charset_info_st **cs= all_charsets; cs < all_charsets + array_elements(all_charsets) - 1; cs++)
73  {
74  if (cs[0] && cs[0]->name && not my_charset_utf8_general_ci.strcasecmp(cs[0]->name, name))
75  {
76  return cs[0]->number;
77  }
78  }
79  return 0;
80 }
81 
82 static unsigned char* cs_alloc(size_t size)
83 {
84  memory_vector.push_back(new unsigned char[size]);
85  return memory_vector.back();
86 }
87 
88 static void init_state_maps(charset_info_st *cs)
89 {
90  cs->state_map= cs_alloc(256);
91  cs->ident_map= cs_alloc(256);
92 
93  unsigned char *state_map= cs->state_map;
94  unsigned char *ident_map= cs->ident_map;
95 
96  /* Fill state_map with states to get a faster parser */
97  for (int i= 0; i < 256; i++)
98  {
99  if (cs->isalpha(i))
100  state_map[i]= MY_LEX_IDENT;
101  else if (cs->isdigit(i))
102  state_map[i]= MY_LEX_NUMBER_IDENT;
103  else if (my_mbcharlen(cs, i) > 1)
104  state_map[i]= MY_LEX_IDENT;
105  else if (cs->isspace(i))
106  state_map[i]= MY_LEX_SKIP;
107  else
108  state_map[i]= MY_LEX_CHAR;
109  }
110  state_map['_']=state_map['$']= MY_LEX_IDENT;
111  state_map['\'']= MY_LEX_STRING;
112  state_map['.']= MY_LEX_REAL_OR_POINT;
113  state_map['>']=state_map['=']=state_map['!']= MY_LEX_CMP_OP;
114  state_map['<']= MY_LEX_LONG_CMP_OP;
115  state_map['&']=state_map['|']= MY_LEX_BOOL;
116  state_map['#']= MY_LEX_COMMENT;
117  state_map[';']= MY_LEX_SEMICOLON;
118  state_map[':']= MY_LEX_SET_VAR;
119  state_map[0]= MY_LEX_EOL;
120  state_map['\\']= MY_LEX_ESCAPE;
121  state_map['/']= MY_LEX_LONG_COMMENT;
122  state_map['*']= MY_LEX_END_LONG_COMMENT;
123  state_map['@']= MY_LEX_USER_END;
124  state_map['`']= MY_LEX_USER_VARIABLE_DELIMITER;
125  state_map['"']= MY_LEX_STRING_OR_DELIMITER;
126 
127  /*
128  Create a second map to make it faster to find identifiers
129  */
130  for (int i= 0; i < 256; i++)
131  {
132  ident_map[i]= state_map[i] == MY_LEX_IDENT || state_map[i] == MY_LEX_NUMBER_IDENT;
133  }
134 
135  /* Special handling of hex and binary strings */
136  state_map['x']= state_map['X']= MY_LEX_IDENT_OR_HEX;
137  state_map['b']= state_map['B']= MY_LEX_IDENT_OR_BIN;
138 }
139 
/* Set to true at the end of init_available_charsets(); reset to false by free_charsets(). */
140 static bool charset_initialized= false;
141 
/* Collation table: add_compiled_collation() stores each collation at index cs->number. */
142 DRIZZLED_API charset_info_st *all_charsets[256];
/* Default collation; get_charset() returns it directly when asked for its number. */
143 const DRIZZLED_API charset_info_st *default_charset_info = &my_charset_utf8_general_ci;
144 
145 static void add_compiled_collation(charset_info_st * cs)
146 {
147  all_charsets[cs->number]= cs;
148  cs->state|= MY_CS_AVAILABLE;
149 }
150 
151 static void init_compiled_charsets()
152 {
153  add_compiled_collation(&my_charset_bin);
154 
155  add_compiled_collation(&my_charset_utf8mb4_general_ci);
156  add_compiled_collation(&my_charset_utf8mb4_bin);
157  add_compiled_collation(&my_charset_utf8mb4_unicode_ci);
158  add_compiled_collation(&my_charset_utf8mb4_icelandic_uca_ci);
159  add_compiled_collation(&my_charset_utf8mb4_latvian_uca_ci);
160  add_compiled_collation(&my_charset_utf8mb4_romanian_uca_ci);
161  add_compiled_collation(&my_charset_utf8mb4_slovenian_uca_ci);
162  add_compiled_collation(&my_charset_utf8mb4_polish_uca_ci);
163  add_compiled_collation(&my_charset_utf8mb4_estonian_uca_ci);
164  add_compiled_collation(&my_charset_utf8mb4_spanish_uca_ci);
165  add_compiled_collation(&my_charset_utf8mb4_swedish_uca_ci);
166  add_compiled_collation(&my_charset_utf8mb4_turkish_uca_ci);
167  add_compiled_collation(&my_charset_utf8mb4_czech_uca_ci);
168  add_compiled_collation(&my_charset_utf8mb4_danish_uca_ci);
169  add_compiled_collation(&my_charset_utf8mb4_lithuanian_uca_ci);
170  add_compiled_collation(&my_charset_utf8mb4_slovak_uca_ci);
171  add_compiled_collation(&my_charset_utf8mb4_spanish2_uca_ci);
172  add_compiled_collation(&my_charset_utf8mb4_roman_uca_ci);
173  add_compiled_collation(&my_charset_utf8mb4_persian_uca_ci);
174  add_compiled_collation(&my_charset_utf8mb4_esperanto_uca_ci);
175  add_compiled_collation(&my_charset_utf8mb4_hungarian_uca_ci);
176  add_compiled_collation(&my_charset_utf8mb4_sinhala_uca_ci);
177 }
178 
179 static void init_available_charsets()
180 {
181  /*
182  We have to use charset_initialized to not lock on THR_LOCK_charset
183  inside get_internal_charset...
184  */
185  if (charset_initialized)
186  return;
187  memset(&all_charsets, 0, sizeof(all_charsets));
188  init_compiled_charsets();
189 
190  /* Copy compiled charsets */
191  for (charset_info_st**cs= all_charsets;
192  cs < all_charsets+array_elements(all_charsets)-1;
193  cs++)
194  {
195  if (*cs && cs[0]->ctype)
196  init_state_maps(*cs);
197  }
198 
199  charset_initialized= true;
200 }
201 
202 void free_charsets()
203 {
204  charset_initialized= false;
205 
206  while (not memory_vector.empty())
207  {
208  delete[] memory_vector.back();
209  memory_vector.pop_back();
210  }
211 }
212 
213 uint32_t get_collation_number(const char *name)
214 {
215  init_available_charsets();
216  return get_collation_number_internal(name);
217 }
218 
219 uint32_t get_charset_number(const char *charset_name, uint32_t cs_flags)
220 {
221  init_available_charsets();
222 
223  for (charset_info_st** cs= all_charsets; cs < all_charsets + array_elements(all_charsets) - 1; cs++)
224  {
225  if (cs[0] && cs[0]->csname && (cs[0]->state & cs_flags) && not my_charset_utf8_general_ci.strcasecmp(cs[0]->csname, charset_name))
226  return cs[0]->number;
227  }
228  return 0;
229 }
230 
231 const char *get_charset_name(uint32_t charset_number)
232 {
233  init_available_charsets();
234  const charset_info_st* cs= all_charsets[charset_number];
235  return cs && cs->number == charset_number && cs->name ? cs->name : "?";
236 }
237 
238 static const charset_info_st *get_internal_charset(uint32_t cs_number)
239 {
240  charset_info_st* cs= all_charsets[cs_number];
241  /*
242  To make things thread safe we are not allowing other threads to interfere
243  while we may changing the cs_info_table
244  */
245  if (not cs)
246  return NULL;
247  assert(not (not (cs->state & MY_CS_COMPILED) && not (cs->state & MY_CS_LOADED)));
248  if (not (cs->state & MY_CS_AVAILABLE))
249  return NULL;
250  if (not (cs->state & MY_CS_READY))
251  {
252  if (cs->coll->init && cs->coll->init(*cs, cs_alloc))
253  return NULL;
254  cs->state|= MY_CS_READY;
255  }
256  return cs;
257 }
258 
259 const charset_info_st *get_charset(uint32_t cs_number)
260 {
261  if (cs_number == default_charset_info->number)
262  return default_charset_info;
263 
264  init_available_charsets(); /* If it isn't initialized */
265 
266  if (!cs_number || cs_number >= array_elements(all_charsets)-1)
267  return NULL;
268 
269  return get_internal_charset(cs_number);
270 }
271 
272 const charset_info_st *get_charset_by_name(const char *cs_name)
273 {
274  init_available_charsets(); /* If it isn't initialized */
275  uint32_t cs_number= get_collation_number(cs_name);
276  return cs_number ? get_internal_charset(cs_number) : NULL;
277 }
278 
279 const charset_info_st *get_charset_by_csname(const char *cs_name, uint32_t cs_flags)
280 {
281  init_available_charsets(); /* If it isn't initialized */
282  uint32_t cs_number= get_charset_number(cs_name, cs_flags);
283  return cs_number ? get_internal_charset(cs_number) : NULL;
284 }
285 
286 
287 /*
288  Escape apostrophes by doubling them up
289 
290  SYNOPSIS
291  escape_quotes_for_drizzle()
292  charset_info Charset of the strings
293  to Buffer for escaped string
294  to_length Length of destination buffer, or 0
295  from The string to escape
296  length The length of the string to escape
297 
298  DESCRIPTION
299  This escapes the contents of a string by doubling up any apostrophes that
300  it contains. This is used when the NO_BACKSLASH_ESCAPES SQL_MODE is in
301  effect on the server.
302 
303  NOTE
304  To be consistent with escape_string_for_mysql(), to_length may be 0 to
305  mean "big enough"
306 
307  RETURN VALUES
308  UINT32_MAX The escaped string did not fit in the to buffer
309  >=0 The length of the escaped string
310 */
311 
312 size_t escape_quotes_for_drizzle(const charset_info_st *charset_info,
313  char *to, size_t to_length,
314  const char *from, size_t length)
315 {
316  const char *to_start= to;
317  const char *end, *to_end=to_start + (to_length ? to_length-1 : 2*length);
318  bool overflow= false;
319  bool use_mb_flag= use_mb(charset_info);
320  for (end= from + length; from < end; from++)
321  {
322  int tmp_length;
323  if (use_mb_flag && (tmp_length= my_ismbchar(charset_info, from, end)))
324  {
325  if (to + tmp_length > to_end)
326  {
327  overflow= true;
328  break;
329  }
330  while (tmp_length--)
331  *to++= *from++;
332  from--;
333  continue;
334  }
335  /*
336  We don't have the same issue here with a non-multi-byte character being
337  turned into a multi-byte character by the addition of an escaping
338  character, because we are only escaping the ' character with itself.
339  */
340  if (*from == '\'')
341  {
342  if (to + 2 > to_end)
343  {
344  overflow= true;
345  break;
346  }
347  *to++= '\'';
348  *to++= '\'';
349  }
350  else
351  {
352  if (to + 1 > to_end)
353  {
354  overflow= true;
355  break;
356  }
357  *to++= *from;
358  }
359  }
360  *to= 0;
361  return overflow ? UINT32_MAX : to - to_start;
362 }
363 
364 } /* namespace drizzled */