Boost.Locale
utf.hpp
//
// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
//
// Distributed under the Boost Software License, Version 1.0. (See
// accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
8 #ifndef BOOST_LOCALE_UTF_HPP_INCLUDED
9 #define BOOST_LOCALE_UTF_HPP_INCLUDED
10 
11 #include <boost/cstdint.hpp>
12 
13 namespace boost {
14 namespace locale {
20 namespace utf {
// Branch-prediction hints used on the hot decode/encode paths.
//
// When __GNUC__ is defined the condition is passed through
// __builtin_expect together with the expected outcome (1 = likely,
// 0 = unlikely). Otherwise both macros expand to the bare condition,
// so the truth value of the expression is the same either way.
#ifdef __GNUC__
#   define BOOST_LOCALE_LIKELY(x)   __builtin_expect((x),1)
#   define BOOST_LOCALE_UNLIKELY(x) __builtin_expect((x),0)
#else
#   define BOOST_LOCALE_LIKELY(x)   (x)
#   define BOOST_LOCALE_UNLIKELY(x) (x)
#endif
29 
///
/// \brief The integral type that can hold a Unicode code point
///
typedef uint32_t code_point;

///
/// \brief Special constant that defines illegal code point
///
/// Returned by the decode functions when the input is not a valid sequence.
/// The value is above 0x10FFFF, so it never collides with a real code point.
///
static const code_point illegal = 0xFFFFFFFFu;

///
/// \brief Special constant that defines incomplete code point
///
/// Returned by the decode functions when the input range ends before the
/// sequence is complete.
///
static const code_point incomplete = 0xFFFFFFFEu;

///
/// \brief the function checks if \a v is a valid code point
///
/// \param v value to test
/// \return true if \a v is not above 0x10FFFF and is not in the surrogate
///         range 0xD800-0xDFFF, false otherwise
///
inline bool is_valid_codepoint(code_point v)
{
    if(v > 0x10FFFF)
        return false;
    if(0xD800 <= v && v <= 0xDFFF) // surrogates
        return false;
    return true;
}
57 
58  #ifdef BOOST_LOCALE_DOXYGEN
59  template<typename CharType,int size=sizeof(CharType)>
63  struct utf_traits {
67  typedef CharType char_type;
82  template<typename Iterator>
83  static code_point decode(Iterator &p,Iterator e);
84 
92  static const int max_width;
99  static int width(code_point value);
100 
106  static int trail_length(char_type c);
110  static bool is_trail(char_type c);
114  static bool is_lead(char_type c);
115 
126  template<typename Iterator>
127  static Iterator encode(code_point value,Iterator out);
133  template<typename Iterator>
134  static code_point decode_valid(Iterator &p);
135  };
136 
137  #else
138 
// Primary template: declared only, never defined. The second parameter
// defaults to sizeof(CharType), so utf_traits<CharType> selects one of the
// partial specializations for 1-, 2- or 4-byte code units.
template<typename CharType,int size=sizeof(CharType)>
struct utf_traits;
141 
142  template<typename CharType>
143  struct utf_traits<CharType,1> {
144 
145  typedef CharType char_type;
146 
147  static int trail_length(char_type ci)
148  {
149  unsigned char c = ci;
150  if(c < 128)
151  return 0;
152  if(BOOST_LOCALE_UNLIKELY(c < 194))
153  return -1;
154  if(c < 224)
155  return 1;
156  if(c < 240)
157  return 2;
158  if(BOOST_LOCALE_LIKELY(c <=244))
159  return 3;
160  return -1;
161  }
162 
163  static const int max_width = 4;
164 
165  static int width(code_point value)
166  {
167  if(value <=0x7F) {
168  return 1;
169  }
170  else if(value <=0x7FF) {
171  return 2;
172  }
173  else if(BOOST_LOCALE_LIKELY(value <=0xFFFF)) {
174  return 3;
175  }
176  else {
177  return 4;
178  }
179  }
180 
181  static bool is_trail(char_type ci)
182  {
183  unsigned char c=ci;
184  return (c & 0xC0)==0x80;
185  }
186 
187  static bool is_lead(char_type ci)
188  {
189  return !is_trail(ci);
190  }
191 
192  template<typename Iterator>
193  static code_point decode(Iterator &p,Iterator e)
194  {
195  if(BOOST_LOCALE_UNLIKELY(p==e))
196  return incomplete;
197 
198  unsigned char lead = *p++;
199 
200  // First byte is fully validated here
201  int trail_size = trail_length(lead);
202 
203  if(BOOST_LOCALE_UNLIKELY(trail_size < 0))
204  return illegal;
205 
206  //
207  // Ok as only ASCII may be of size = 0
208  // also optimize for ASCII text
209  //
210  if(trail_size == 0)
211  return lead;
212 
213  code_point c = lead & ((1<<(6-trail_size))-1);
214 
215  // Read the rest
216  unsigned char tmp;
217  switch(trail_size) {
218  case 3:
219  if(BOOST_LOCALE_UNLIKELY(p==e))
220  return incomplete;
221  tmp = *p++;
222  if (!is_trail(tmp))
223  return illegal;
224  c = (c << 6) | ( tmp & 0x3F);
225  case 2:
226  if(BOOST_LOCALE_UNLIKELY(p==e))
227  return incomplete;
228  tmp = *p++;
229  if (!is_trail(tmp))
230  return illegal;
231  c = (c << 6) | ( tmp & 0x3F);
232  case 1:
233  if(BOOST_LOCALE_UNLIKELY(p==e))
234  return incomplete;
235  tmp = *p++;
236  if (!is_trail(tmp))
237  return illegal;
238  c = (c << 6) | ( tmp & 0x3F);
239  }
240 
241  // Check code point validity: no surrogates and
242  // valid range
243  if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
244  return illegal;
245 
246  // make sure it is the most compact representation
247  if(BOOST_LOCALE_UNLIKELY(width(c)!=trail_size + 1))
248  return illegal;
249 
250  return c;
251 
252  }
253 
254  template<typename Iterator>
255  static code_point decode_valid(Iterator &p)
256  {
257  unsigned char lead = *p++;
258  if(lead < 192)
259  return lead;
260 
261  int trail_size;
262 
263  if(lead < 224)
264  trail_size = 1;
265  else if(BOOST_LOCALE_LIKELY(lead < 240)) // non-BMP rare
266  trail_size = 2;
267  else
268  trail_size = 3;
269 
270  code_point c = lead & ((1<<(6-trail_size))-1);
271 
272  switch(trail_size) {
273  case 3:
274  c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
275  case 2:
276  c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
277  case 1:
278  c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
279  }
280 
281  return c;
282  }
283 
284 
285 
286  template<typename Iterator>
287  static Iterator encode(code_point value,Iterator out)
288  {
289  if(value <= 0x7F) {
290  *out++ = static_cast<char_type>(value);
291  }
292  else if(value <= 0x7FF) {
293  *out++ = static_cast<char_type>((value >> 6) | 0xC0);
294  *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
295  }
296  else if(BOOST_LOCALE_LIKELY(value <= 0xFFFF)) {
297  *out++ = static_cast<char_type>((value >> 12) | 0xE0);
298  *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
299  *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
300  }
301  else {
302  *out++ = static_cast<char_type>((value >> 18) | 0xF0);
303  *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
304  *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
305  *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
306  }
307  return out;
308  }
309  }; // utf8
310 
311  template<typename CharType>
312  struct utf_traits<CharType,2> {
313  typedef CharType char_type;
314 
315  // See RFC 2781
316  static bool is_first_surrogate(uint16_t x)
317  {
318  return 0xD800 <=x && x<= 0xDBFF;
319  }
320  static bool is_second_surrogate(uint16_t x)
321  {
322  return 0xDC00 <=x && x<= 0xDFFF;
323  }
324  static code_point combine_surrogate(uint16_t w1,uint16_t w2)
325  {
326  return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
327  }
328  static int trail_length(char_type c)
329  {
330  if(is_first_surrogate(c))
331  return 1;
332  if(is_second_surrogate(c))
333  return -1;
334  return 0;
335  }
339  static bool is_trail(char_type c)
340  {
341  return is_second_surrogate(c);
342  }
346  static bool is_lead(char_type c)
347  {
348  return !is_second_surrogate(c);
349  }
350 
351  template<typename It>
352  static code_point decode(It &current,It last)
353  {
354  if(BOOST_LOCALE_UNLIKELY(current == last))
355  return incomplete;
356  uint16_t w1=*current++;
357  if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
358  return w1;
359  }
360  if(w1 > 0xDBFF)
361  return illegal;
362  if(current==last)
363  return incomplete;
364  uint16_t w2=*current++;
365  if(w2 < 0xDC00 || 0xDFFF < w2)
366  return illegal;
367  return combine_surrogate(w1,w2);
368  }
369  template<typename It>
370  static code_point decode_valid(It &current)
371  {
372  uint16_t w1=*current++;
373  if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
374  return w1;
375  }
376  uint16_t w2=*current++;
377  return combine_surrogate(w1,w2);
378  }
379 
380  static const int max_width = 2;
381  static int width(code_point u)
382  {
383  return u>=0x10000 ? 2 : 1;
384  }
385  template<typename It>
386  static It encode(code_point u,It out)
387  {
388  if(BOOST_LOCALE_LIKELY(u<=0xFFFF)) {
389  *out++ = static_cast<char_type>(u);
390  }
391  else {
392  u -= 0x10000;
393  *out++ = static_cast<char_type>(0xD800 | (u>>10));
394  *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
395  }
396  return out;
397  }
398  }; // utf16;
399 
400 
401  template<typename CharType>
402  struct utf_traits<CharType,4> {
403  typedef CharType char_type;
404  static int trail_length(char_type c)
405  {
406  if(is_valid_codepoint(c))
407  return 0;
408  return -1;
409  }
410  static bool is_trail(char_type /*c*/)
411  {
412  return false;
413  }
414  static bool is_lead(char_type /*c*/)
415  {
416  return true;
417  }
418 
419  template<typename It>
420  static code_point decode_valid(It &current)
421  {
422  return *current++;
423  }
424 
425  template<typename It>
426  static code_point decode(It &current,It last)
427  {
428  if(BOOST_LOCALE_UNLIKELY(current == last))
430  code_point c=*current++;
431  if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
433  return c;
434  }
435  static const int max_width = 1;
436  static int width(code_point /*u*/)
437  {
438  return 1;
439  }
440  template<typename It>
441  static It encode(code_point u,It out)
442  {
443  *out++ = static_cast<char_type>(u);
444  return out;
445  }
446 
447  }; // utf32
448 
449  #endif
450 
451 
452 } // utf
453 } // locale
454 } // boost
455 
456 
457 #endif
458 
459 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
460 
static code_point decode(Iterator &p, Iterator e)
bool is_valid_codepoint(code_point v)
the function checks if v is a valid code point
Definition: utf.hpp:49
static Iterator encode(code_point value, Iterator out)
static const code_point incomplete
Special constant that defines incomplete code point.
Definition: utf.hpp:44
uint32_t code_point
The integral type that can hold a Unicode code point.
Definition: utf.hpp:34
static const code_point illegal
Special constant that defines illegal code point.
Definition: utf.hpp:39
UTF Traits class - functions to convert UTF sequences to and from Unicode code points.
Definition: utf.hpp:63
static int width(code_point value)
CharType char_type
Definition: utf.hpp:67
static int trail_length(char_type c)
static bool is_lead(char_type c)
static code_point decode_valid(Iterator &p)
static bool is_trail(char_type c)
static const int max_width
Definition: utf.hpp:92