Panda3D
|
00001 // Filename: textEncoder.cxx 00002 // Created by: drose (26Mar03) 00003 // 00004 //////////////////////////////////////////////////////////////////// 00005 // 00006 // PANDA 3D SOFTWARE 00007 // Copyright (c) Carnegie Mellon University. All rights reserved. 00008 // 00009 // All use of this software is subject to the terms of the revised BSD 00010 // license. You should have received a copy of this license along 00011 // with this source code in a file named "LICENSE." 00012 // 00013 //////////////////////////////////////////////////////////////////// 00014 00015 #include "textEncoder.h" 00016 #include "stringDecoder.h" 00017 #include "unicodeLatinMap.h" 00018 #include "config_express.h" 00019 00020 TypeHandle TextEncoder::_type_handle; 00021 ConfigVariableEnum<TextEncoder::Encoding> TextEncoder::_default_encoding 00022 ("text-encoding", TextEncoder::E_iso8859, 00023 PRC_DESC("Specifies how international characters are represented in strings " 00024 "of 8-byte characters presented to Panda. See TextEncoder::set_encoding().")); 00025 00026 //////////////////////////////////////////////////////////////////// 00027 // Function: TextEncoder::make_upper 00028 // Access: Published 00029 // Description: Adjusts the text stored within the encoder to all 00030 // uppercase letters (preserving accent marks 00031 // correctly). 00032 //////////////////////////////////////////////////////////////////// 00033 void TextEncoder:: 00034 make_upper() { 00035 get_wtext(); 00036 wstring::iterator si; 00037 for (si = _wtext.begin(); si != _wtext.end(); ++si) { 00038 (*si) = unicode_toupper(*si); 00039 } 00040 _flags &= ~F_got_text; 00041 } 00042 00043 //////////////////////////////////////////////////////////////////// 00044 // Function: TextEncoder::make_lower 00045 // Access: Published 00046 // Description: Adjusts the text stored within the encoder to all 00047 // lowercase letters (preserving accent marks 00048 // correctly). 00049 //////////////////////////////////////////////////////////////////// 00050 void TextEncoder:: 00051 make_lower() { 00052 get_wtext(); 00053 wstring::iterator si; 00054 for (si = _wtext.begin(); si != _wtext.end(); ++si) { 00055 (*si) = unicode_tolower(*si); 00056 } 00057 _flags &= ~F_got_text; 00058 } 00059 00060 //////////////////////////////////////////////////////////////////// 00061 // Function: TextEncoder::get_wtext_as_ascii 00062 // Access: Published 00063 // Description: Returns the text associated with the node, converted 00064 // as nearly as possible to a fully-ASCII 00065 // representation. This means replacing accented 00066 // letters with their unaccented ASCII equivalents. 00067 // 00068 // It is possible that some characters in the string 00069 // cannot be converted to ASCII. (The string may 00070 // involve symbols like the copyright symbol, for 00071 // instance, or it might involve letters in some other 00072 // alphabet such as Greek or Cyrillic, or even Latin 00073 // letters like thorn or eth that are not part of the 00074 // ASCII character set.) In this case, as much of the 00075 // string as possible will be converted to ASCII, and 00076 // the nonconvertible characters will remain in their 00077 // original form. 00078 //////////////////////////////////////////////////////////////////// 00079 wstring TextEncoder:: 00080 get_wtext_as_ascii() const { 00081 get_wtext(); 00082 wstring result; 00083 wstring::const_iterator si; 00084 for (si = _wtext.begin(); si != _wtext.end(); ++si) { 00085 wchar_t character = (*si); 00086 00087 const UnicodeLatinMap::Entry *map_entry = 00088 UnicodeLatinMap::look_up(character); 00089 if (map_entry != NULL && map_entry->_ascii_equiv != 0) { 00090 result += (wchar_t)map_entry->_ascii_equiv; 00091 if (map_entry->_ascii_additional != 0) { 00092 result += (wchar_t)map_entry->_ascii_additional; 00093 } 00094 00095 } else { 00096 result += character; 00097 } 00098 } 00099 00100 return result; 00101 } 00102 00103 //////////////////////////////////////////////////////////////////// 00104 // Function: TextEncoder::is_wtext 00105 // Access: Published 00106 // Description: Returns true if any of the characters in the string 00107 // returned by get_wtext() are out of the range of an 00108 // ASCII character (and, therefore, get_wtext() should 00109 // be called in preference to get_text()). 00110 //////////////////////////////////////////////////////////////////// 00111 bool TextEncoder:: 00112 is_wtext() const { 00113 get_wtext(); 00114 wstring::const_iterator ti; 00115 for (ti = _wtext.begin(); ti != _wtext.end(); ++ti) { 00116 if (((*ti) & ~0x7f) != 0) { 00117 return true; 00118 } 00119 } 00120 00121 return false; 00122 } 00123 00124 //////////////////////////////////////////////////////////////////// 00125 // Function: TextEncoder::encode_wchar 00126 // Access: Published, Static 00127 // Description: Encodes a single wide char into a one-, two-, or 00128 // three-byte string, according to the given encoding 00129 // system. 00130 //////////////////////////////////////////////////////////////////// 00131 string TextEncoder:: 00132 encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) { 00133 switch (encoding) { 00134 case E_iso8859: 00135 if ((ch & ~0xff) == 0) { 00136 return string(1, (char)ch); 00137 } else { 00138 // The character won't fit in the 8-bit ISO 8859. See if we can 00139 // make it fit by reducing it to its ascii equivalent 00140 // (essentially stripping off an unusual accent mark). 00141 const UnicodeLatinMap::Entry *map_entry = 00142 UnicodeLatinMap::look_up(ch); 00143 if (map_entry != NULL && map_entry->_ascii_equiv != 0) { 00144 // Yes, it has an ascii equivalent. 00145 if (map_entry->_ascii_additional != 0) { 00146 // In fact, it has two of them. 00147 return 00148 string(1, map_entry->_ascii_equiv) + 00149 string(1, map_entry->_ascii_additional); 00150 } 00151 return string(1, map_entry->_ascii_equiv); 00152 } 00153 // Nope; return "." for lack of anything better. 00154 return "."; 00155 } 00156 00157 case E_utf8: 00158 if ((ch & ~0x7f) == 0) { 00159 return string(1, (char)ch); 00160 } else if ((ch & ~0x7ff) == 0) { 00161 return 00162 string(1, (char)((ch >> 6) | 0xc0)) + 00163 string(1, (char)((ch & 0x3f) | 0x80)); 00164 } else { 00165 return 00166 string(1, (char)((ch >> 12) | 0xe0)) + 00167 string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) + 00168 string(1, (char)((ch & 0x3f) | 0x80)); 00169 } 00170 00171 case E_unicode: 00172 return 00173 string(1, (char)(ch >> 8)) + 00174 string(1, (char)(ch & 0xff)); 00175 } 00176 00177 return ""; 00178 } 00179 00180 //////////////////////////////////////////////////////////////////// 00181 // Function: TextEncoder::encode_wtext 00182 // Access: Published, Static 00183 // Description: Encodes a wide-text string into a single-char string, 00184 // according to the given encoding. 00185 //////////////////////////////////////////////////////////////////// 00186 string TextEncoder:: 00187 encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) { 00188 string result; 00189 00190 for (wstring::const_iterator pi = wtext.begin(); pi != wtext.end(); ++pi) { 00191 result += encode_wchar(*pi, encoding); 00192 } 00193 00194 return result; 00195 } 00196 00197 //////////////////////////////////////////////////////////////////// 00198 // Function: TextEncoder::decode_text 00199 // Access: Published, Static 00200 // Description: Returns the given wstring decoded to a single-byte 00201 // string, via the given encoding system. 00202 //////////////////////////////////////////////////////////////////// 00203 wstring TextEncoder:: 00204 decode_text(const string &text, TextEncoder::Encoding encoding) { 00205 switch (encoding) { 00206 case E_utf8: 00207 { 00208 StringUtf8Decoder decoder(text); 00209 return decode_text_impl(decoder); 00210 } 00211 00212 case E_unicode: 00213 { 00214 StringUnicodeDecoder decoder(text); 00215 return decode_text_impl(decoder); 00216 } 00217 00218 case E_iso8859: 00219 default: 00220 { 00221 StringDecoder decoder(text); 00222 return decode_text_impl(decoder); 00223 } 00224 }; 00225 } 00226 00227 //////////////////////////////////////////////////////////////////// 00228 // Function: TextEncoder::decode_text_impl 00229 // Access: Private, Static 00230 // Description: Decodes the eight-bit stream from the indicated 00231 // decoder, returning the decoded wide-char string. 00232 //////////////////////////////////////////////////////////////////// 00233 wstring TextEncoder:: 00234 decode_text_impl(StringDecoder &decoder) { 00235 wstring result; 00236 // bool expand_amp = get_expand_amp(); 00237 00238 wchar_t character = decoder.get_next_character(); 00239 while (!decoder.is_eof()) { 00240 /* 00241 if (character == '&' && expand_amp) { 00242 // An ampersand in expand_amp mode is treated as an escape 00243 // character. 00244 character = expand_amp_sequence(decoder); 00245 } 00246 */ 00247 result += character; 00248 character = decoder.get_next_character(); 00249 } 00250 00251 return result; 00252 } 00253 00254 /* 00255 //////////////////////////////////////////////////////////////////// 00256 // Function: TextEncoder::expand_amp_sequence 00257 // Access: Private 00258 // Description: Given that we have just read an ampersand from the 00259 // StringDecoder, and that we have expand_amp in effect 00260 // and are therefore expected to expand the sequence 00261 // that this ampersand begins into a single unicode 00262 // character, do the expansion and return the character. 00263 //////////////////////////////////////////////////////////////////// 00264 int TextEncoder:: 00265 expand_amp_sequence(StringDecoder &decoder) const { 00266 int result = 0; 00267 00268 int character = decoder.get_next_character(); 00269 if (!decoder.is_eof() && character == '#') { 00270 // An explicit numeric sequence: &#nnn; 00271 result = 0; 00272 character = decoder.get_next_character(); 00273 while (!decoder.is_eof() && character < 128 && isdigit((unsigned int)character)) { 00274 result = (result * 10) + (character - '0'); 00275 character = decoder.get_next_character(); 00276 } 00277 if (character != ';') { 00278 // Invalid sequence. 00279 return 0; 00280 } 00281 00282 return result; 00283 } 00284 00285 string sequence; 00286 00287 // Some non-numeric sequence. 00288 while (!decoder.is_eof() && character < 128 && isalpha((unsigned int)character)) { 00289 sequence += character; 00290 character = decoder.get_next_character(); 00291 } 00292 if (character != ';') { 00293 // Invalid sequence. 00294 return 0; 00295 } 00296 00297 static const struct { 00298 const char *name; 00299 int code; 00300 } tokens[] = { 00301 { "amp", '&' }, { "lt", '<' }, { "gt", '>' }, { "quot", '"' }, 00302 { "nbsp", ' ' }, 00303 00304 { "iexcl", 161 }, { "cent", 162 }, { "pound", 163 }, { "curren", 164 }, 00305 { "yen", 165 }, { "brvbar", 166 }, { "brkbar", 166 }, { "sect", 167 }, 00306 { "uml", 168 }, { "die", 168 }, { "copy", 169 }, { "ordf", 170 }, 00307 { "laquo", 171 }, { "not", 172 }, { "shy", 173 }, { "reg", 174 }, 00308 { "macr", 175 }, { "hibar", 175 }, { "deg", 176 }, { "plusmn", 177 }, 00309 { "sup2", 178 }, { "sup3", 179 }, { "acute", 180 }, { "micro", 181 }, 00310 { "para", 182 }, { "middot", 183 }, { "cedil", 184 }, { "sup1", 185 }, 00311 { "ordm", 186 }, { "raquo", 187 }, { "frac14", 188 }, { "frac12", 189 }, 00312 { "frac34", 190 }, { "iquest", 191 }, { "Agrave", 192 }, { "Aacute", 193 }, 00313 { "Acirc", 194 }, { "Atilde", 195 }, { "Auml", 196 }, { "Aring", 197 }, 00314 { "AElig", 198 }, { "Ccedil", 199 }, { "Egrave", 200 }, { "Eacute", 201 }, 00315 { "Ecirc", 202 }, { "Euml", 203 }, { "Igrave", 204 }, { "Iacute", 205 }, 00316 { "Icirc", 206 }, { "Iuml", 207 }, { "ETH", 208 }, { "Dstrok", 208 }, 00317 { "Ntilde", 209 }, { "Ograve", 210 }, { "Oacute", 211 }, { "Ocirc", 212 }, 00318 { "Otilde", 213 }, { "Ouml", 214 }, { "times", 215 }, { "Oslash", 216 }, 00319 { "Ugrave", 217 }, { "Uacute", 218 }, { "Ucirc", 219 }, { "Uuml", 220 }, 00320 { "Yacute", 221 }, { "THORN", 222 }, { "szlig", 223 }, { "agrave", 224 }, 00321 { "aacute", 225 }, { "acirc", 226 }, { "atilde", 227 }, { "auml", 228 }, 00322 { "aring", 229 }, { "aelig", 230 }, { "ccedil", 231 }, { "egrave", 232 }, 00323 { "eacute", 233 }, { "ecirc", 234 }, { "euml", 235 }, { "igrave", 236 }, 00324 { "iacute", 237 }, { "icirc", 238 }, { "iuml", 239 }, { "eth", 240 }, 00325 { "ntilde", 241 }, { "ograve", 242 }, { "oacute", 243 }, { "ocirc", 244 }, 00326 { "otilde", 245 }, { "ouml", 246 }, { "divide", 247 }, { "oslash", 248 }, 00327 { "ugrave", 249 }, { "uacute", 250 }, { "ucirc", 251 }, { "uuml", 252 }, 00328 { "yacute", 253 }, { "thorn", 254 }, { "yuml", 255 }, 00329 00330 { NULL, 0 }, 00331 }; 00332 00333 for (int i = 0; tokens[i].name != NULL; i++) { 00334 if (sequence == tokens[i].name) { 00335 // Here's a match. 00336 return tokens[i].code; 00337 } 00338 } 00339 00340 // Some unrecognized sequence. 00341 return 0; 00342 } 00343 */ 00344 00345 00346 //////////////////////////////////////////////////////////////////// 00347 // Function: TextEncoder::Encoding ostream operator 00348 // Description: 00349 //////////////////////////////////////////////////////////////////// 00350 ostream & 00351 operator << (ostream &out, TextEncoder::Encoding encoding) { 00352 switch (encoding) { 00353 case TextEncoder::E_iso8859: 00354 return out << "iso8859"; 00355 00356 case TextEncoder::E_utf8: 00357 return out << "utf8"; 00358 00359 case TextEncoder::E_unicode: 00360 return out << "unicode"; 00361 }; 00362 00363 return out << "**invalid TextEncoder::Encoding(" << (int)encoding << ")**"; 00364 } 00365 00366 //////////////////////////////////////////////////////////////////// 00367 // Function: TextEncoder::Encoding istream operator 00368 // Description: 00369 //////////////////////////////////////////////////////////////////// 00370 istream & 00371 operator >> (istream &in, TextEncoder::Encoding &encoding) { 00372 string word; 00373 in >> word; 00374 00375 if (word == "iso8859") { 00376 encoding = TextEncoder::E_iso8859; 00377 } else if (word == "utf8" || word == "utf-8") { 00378 encoding = TextEncoder::E_utf8; 00379 } else if (word == "unicode") { 00380 encoding = TextEncoder::E_unicode; 00381 } else { 00382 express_cat.error() 00383 << "Invalid TextEncoder::Encoding: " << word << "\n"; 00384 encoding = TextEncoder::E_iso8859; 00385 } 00386 00387 return in; 00388 }