src/util/utf8_iterator.cpp

Go to the documentation of this file.
00001 //  SuperTux
00002 //  Copyright (C) 2009 Ingo Ruhnke <grumbel@gmx.de>
00003 //
00004 //  This program is free software: you can redistribute it and/or modify
00005 //  it under the terms of the GNU General Public License as published by
00006 //  the Free Software Foundation, either version 3 of the License, or
00007 //  (at your option) any later version.
00008 //
00009 //  This program is distributed in the hope that it will be useful,
00010 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
00011 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012 //  GNU General Public License for more details.
00013 //
00014 //  You should have received a copy of the GNU General Public License
00015 //  along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016 
00017 #include "util/utf8_iterator.hpp"
00018 
00019 #include <stdexcept>
00020 
00021 #include "util/log.hpp"
00022 
namespace {

/**
 * Tells whether a byte is a UTF-8 continuation byte.
 *
 * @param c  The byte to inspect.
 * @return true if the two most significant bits of @p c are `10`
 *         (bit pattern 10xx.xxxx), false otherwise.
 */
bool has_multibyte_mark(unsigned char c)
{
  return (c & 0300) == 0200;
}

/**
 * Decodes the UTF-8 sequence that starts at byte offset @p p of @p text.
 *
 * @param text  String holding the UTF-8 encoded bytes.
 * @param p     Byte offset of the first byte of the sequence. On success it
 *              is advanced past the decoded sequence (by 1 to 4 bytes); when
 *              an exception is thrown it is left unchanged.
 * @return The decoded code point. Offset `text.size()` addresses the string's
 *         terminating NUL and therefore decodes as code point 0.
 * @throws std::range_error   if @p p lies beyond the terminating NUL or the
 *                            sequence is cut off by the end of the string.
 * @throws std::runtime_error if the first byte is not a valid lead byte or a
 *                            continuation byte is missing.
 */
uint32_t decode_utf8(const std::string& text, size_t& p)
{
  // Indexing one past the end yields the terminating NUL, anything further
  // would be undefined behaviour.
  if (p > text.size()) throw std::range_error("Malformed utf-8 sequence");

  const uint32_t c1 = static_cast<unsigned char>(text[p]);

  // A sequence must not begin with a continuation byte (10xx.xxxx).
  if (has_multibyte_mark(static_cast<unsigned char>(c1))) {
    throw std::runtime_error("Malformed utf-8 sequence");
  }

  if ((c1 & 0200) == 0000) {
    // 0xxx.xxxx: 1 byte sequence
    p += 1;
    return c1;
  }
  else if ((c1 & 0340) == 0300) {
    // 110x.xxxx: 2 byte sequence
    if (p + 1 >= text.size()) throw std::range_error("Malformed utf-8 sequence");
    const uint32_t c2 = static_cast<unsigned char>(text[p + 1]);
    if (!has_multibyte_mark(static_cast<unsigned char>(c2))) throw std::runtime_error("Malformed utf-8 sequence");
    p += 2;
    return (c1 & 0037) << 6 | (c2 & 0077);
  }
  else if ((c1 & 0360) == 0340) {
    // 1110.xxxx: 3 byte sequence
    if (p + 2 >= text.size()) throw std::range_error("Malformed utf-8 sequence");
    const uint32_t c2 = static_cast<unsigned char>(text[p + 1]);
    const uint32_t c3 = static_cast<unsigned char>(text[p + 2]);
    if (!has_multibyte_mark(static_cast<unsigned char>(c2))) throw std::runtime_error("Malformed utf-8 sequence");
    if (!has_multibyte_mark(static_cast<unsigned char>(c3))) throw std::runtime_error("Malformed utf-8 sequence");
    p += 3;
    return (c1 & 0017) << 12 | (c2 & 0077) << 6 | (c3 & 0077);
  }
  else if ((c1 & 0370) == 0360) {
    // 1111.0xxx: 4 byte sequence
    if (p + 3 >= text.size()) throw std::range_error("Malformed utf-8 sequence");
    const uint32_t c2 = static_cast<unsigned char>(text[p + 1]);
    const uint32_t c3 = static_cast<unsigned char>(text[p + 2]);
    // The fourth byte lives at offset 3, which is what the bounds check
    // above covers.
    const uint32_t c4 = static_cast<unsigned char>(text[p + 3]);
    if (!has_multibyte_mark(static_cast<unsigned char>(c2))) throw std::runtime_error("Malformed utf-8 sequence");
    if (!has_multibyte_mark(static_cast<unsigned char>(c3))) throw std::runtime_error("Malformed utf-8 sequence");
    if (!has_multibyte_mark(static_cast<unsigned char>(c4))) throw std::runtime_error("Malformed utf-8 sequence");
    p += 4;
    return (c1 & 0007) << 18 | (c2 & 0077) << 12 | (c3 & 0077) << 6 | (c4 & 0077);
  }

  // 1111.1xxx: not a valid lead byte.
  throw std::runtime_error("Malformed utf-8 sequence");
}

} // namespace
00088 
00089 
00090 UTF8Iterator::UTF8Iterator(const std::string& text_) :
00091   text(text_),
00092   pos(0),
00093   chr()
00094 {
00095   try {
00096     chr = decode_utf8(text, pos);
00097   } catch (std::exception) {
00098     log_debug << "Malformed utf-8 sequence beginning with " << *((uint32_t*)(text.c_str() + pos)) << " found " << std::endl;
00099     chr = 0;
00100   }
00101 }
00102 
00103   bool 
00104 UTF8Iterator::done() const
00105   {
00106     return pos > text.size();
00107   }
00108 
00109   UTF8Iterator& 
00110 UTF8Iterator::operator++() {
00111     try {
00112       chr = decode_utf8(text, pos);
00113     } catch (std::exception) {
00114       log_debug << "Malformed utf-8 sequence beginning with " << *((uint32_t*)(text.c_str() + pos)) << " found " << std::endl;
00115       chr = 0;
00116       ++pos;
00117     }
00118 
00119     return *this;
00120   }
00121 
00122   uint32_t
00123   UTF8Iterator::operator*() const {
00124     return chr;
00125   }
00126 
00127 /* EOF */

Generated on Mon Jun 9 03:38:23 2014 for SuperTux by  doxygen 1.5.1