src/util/utf8_iterator.cpp File Reference

#include "util/utf8_iterator.hpp"
#include <stdexcept>
#include "util/log.hpp"

Go to the source code of this file.

Functions

bool has_multibyte_mark (unsigned char c)
uint32_t decode_utf8 (const std::string &text, size_t &p)
std::string encode_utf8 (uint32_t code)
bool has_multibyte_mark (unsigned char c)
 returns true if this byte matches a bitmask of 10xx.xxxx, i.e. it is the 2nd, 3rd or 4th byte of a multibyte UTF-8 sequence.
uint32_t decode_utf8 (const std::string &text, size_t &p)
 gets unicode character at byte position p of UTF-8 encoded text, then advances p to the next character.


Function Documentation

uint32_t @497::decode_utf8 ( const std::string &  text,
size_t &  p 
) [static]

gets unicode character at byte position p of UTF-8 encoded text, then advances p to the next character.

Exceptions:
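std::runtime_error if decoding fails (std::range_error, a subclass of std::runtime_error, when the sequence is cut off by the end of the text). See the Unicode Standard, section 3.10, tables 3-5 and 3-6 for details.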

Definition at line 43 of file utf8_iterator.cpp.

References has_multibyte_mark().

00044 {
00045   uint32_t c1 = (unsigned char) text[p+0];
00046   // a continuation byte (10xx.xxxx) can never start a sequence, so reject it right away
00047   if (has_multibyte_mark(c1)) throw std::runtime_error("Malformed utf-8 sequence");
00048   // dispatch on the lead byte; all masks are octal (0200 = 0x80, 0300 = 0xC0, 0340 = 0xE0, ...)
00049   if ((c1 & 0200) == 0000) {
00050     // 0xxx.xxxx: 1 byte sequence
00051     p+=1;
00052     return c1;
00053   }
00054   else if ((c1 & 0340) == 0300) {
00055     // 110x.xxxx: 2 byte sequence
00056     if(p+1 >= text.size()) throw std::range_error("Malformed utf-8 sequence");
00057     uint32_t c2 = (unsigned char) text[p+1];
00058     if (!has_multibyte_mark(c2)) throw std::runtime_error("Malformed utf-8 sequence");
00059     p+=2;
00060     return (c1 & 0037) << 6 | (c2 & 0077);
00061   }
00062   else if ((c1 & 0360) == 0340) {
00063     // 1110.xxxx: 3 byte sequence
00064     if(p+2 >= text.size()) throw std::range_error("Malformed utf-8 sequence");
00065     uint32_t c2 = (unsigned char) text[p+1];
00066     uint32_t c3 = (unsigned char) text[p+2];
00067     if (!has_multibyte_mark(c2)) throw std::runtime_error("Malformed utf-8 sequence");
00068     if (!has_multibyte_mark(c3)) throw std::runtime_error("Malformed utf-8 sequence");
00069     p+=3;
00070     return (c1 & 0017) << 12 | (c2 & 0077) << 6 | (c3 & 0077);
00071   }
00072   else if ((c1 & 0370) == 0360) {
00073     // 1111.0xxx: 4 byte sequence
00074     if(p+3 >= text.size()) throw std::range_error("Malformed utf-8 sequence");
00075     uint32_t c2 = (unsigned char) text[p+1];
00076     uint32_t c3 = (unsigned char) text[p+2];
00077     uint32_t c4 = (unsigned char) text[p+3]; // last byte of the sequence; p+3 is the index checked above
00078     if (!has_multibyte_mark(c2)) throw std::runtime_error("Malformed utf-8 sequence");
00079     if (!has_multibyte_mark(c3)) throw std::runtime_error("Malformed utf-8 sequence");
00080     if (!has_multibyte_mark(c4)) throw std::runtime_error("Malformed utf-8 sequence");
00081     p+=4;
00082     return (c1 & 0007) << 18 | (c2 & 0077) << 12 | (c3 & 0077) << 6 | (c4 & 0077);
00083   }
00084   throw std::runtime_error("Malformed utf-8 sequence"); // lead byte 1111.1xxx matches no valid sequence length
00085 }

uint32_t @497::decode_utf8 ( const std::string &  text,
size_t &  p 
) [static]

Referenced by UTF8Iterator::operator++(), and UTF8Iterator::UTF8Iterator().

std::string @497::encode_utf8 ( uint32_t  code  )  [static]

bool @497::has_multibyte_mark ( unsigned char  c  )  [static]

returns true if this byte matches a bitmask of 10xx.xxxx, i.e. it is the 2nd, 3rd or 4th byte of a multibyte UTF-8 sequence.

Definition at line 32 of file utf8_iterator.cpp.

00032                                          {
00033   return ((c & 0300) == 0200); // octal masks: keep the top two bits (0300 = 0xC0) and require them to be 10 (0200 = 0x80)
00034 }

bool @497::has_multibyte_mark ( unsigned char  c  )  [static]

Referenced by decode_utf8().


Generated on Mon Jun 9 03:38:27 2014 for SuperTux by  doxygen 1.5.1