// Panda3D
00001 // Filename: textEncoder.cxx 00002 // Created by: drose (26Mar03) 00003 // 00004 //////////////////////////////////////////////////////////////////// 00005 // 00006 // PANDA 3D SOFTWARE 00007 // Copyright (c) Carnegie Mellon University. All rights reserved. 00008 // 00009 // All use of this software is subject to the terms of the revised BSD 00010 // license. You should have received a copy of this license along 00011 // with this source code in a file named "LICENSE." 00012 // 00013 //////////////////////////////////////////////////////////////////// 00014 00015 #include "textEncoder.h" 00016 #include "stringDecoder.h" 00017 #include "unicodeLatinMap.h" 00018 #include "config_dtoolutil.h" 00019 00020 TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_iso8859; 00021 00022 //////////////////////////////////////////////////////////////////// 00023 // Function: TextEncoder::make_upper 00024 // Access: Published 00025 // Description: Adjusts the text stored within the encoder to all 00026 // uppercase letters (preserving accent marks 00027 // correctly). 00028 //////////////////////////////////////////////////////////////////// 00029 void TextEncoder:: 00030 make_upper() { 00031 get_wtext(); 00032 wstring::iterator si; 00033 for (si = _wtext.begin(); si != _wtext.end(); ++si) { 00034 (*si) = unicode_toupper(*si); 00035 } 00036 _flags &= ~F_got_text; 00037 } 00038 00039 //////////////////////////////////////////////////////////////////// 00040 // Function: TextEncoder::make_lower 00041 // Access: Published 00042 // Description: Adjusts the text stored within the encoder to all 00043 // lowercase letters (preserving accent marks 00044 // correctly). 
00045 //////////////////////////////////////////////////////////////////// 00046 void TextEncoder:: 00047 make_lower() { 00048 get_wtext(); 00049 wstring::iterator si; 00050 for (si = _wtext.begin(); si != _wtext.end(); ++si) { 00051 (*si) = unicode_tolower(*si); 00052 } 00053 _flags &= ~F_got_text; 00054 } 00055 00056 //////////////////////////////////////////////////////////////////// 00057 // Function: TextEncoder::get_wtext_as_ascii 00058 // Access: Published 00059 // Description: Returns the text associated with the node, converted 00060 // as nearly as possible to a fully-ASCII 00061 // representation. This means replacing accented 00062 // letters with their unaccented ASCII equivalents. 00063 // 00064 // It is possible that some characters in the string 00065 // cannot be converted to ASCII. (The string may 00066 // involve symbols like the copyright symbol, for 00067 // instance, or it might involve letters in some other 00068 // alphabet such as Greek or Cyrillic, or even Latin 00069 // letters like thorn or eth that are not part of the 00070 // ASCII character set.) In this case, as much of the 00071 // string as possible will be converted to ASCII, and 00072 // the nonconvertible characters will remain in their 00073 // original form. 
00074 //////////////////////////////////////////////////////////////////// 00075 wstring TextEncoder:: 00076 get_wtext_as_ascii() const { 00077 get_wtext(); 00078 wstring result; 00079 wstring::const_iterator si; 00080 for (si = _wtext.begin(); si != _wtext.end(); ++si) { 00081 wchar_t character = (*si); 00082 00083 const UnicodeLatinMap::Entry *map_entry = 00084 UnicodeLatinMap::look_up(character); 00085 if (map_entry != NULL && map_entry->_ascii_equiv != 0) { 00086 result += (wchar_t)map_entry->_ascii_equiv; 00087 if (map_entry->_ascii_additional != 0) { 00088 result += (wchar_t)map_entry->_ascii_additional; 00089 } 00090 00091 } else { 00092 result += character; 00093 } 00094 } 00095 00096 return result; 00097 } 00098 00099 //////////////////////////////////////////////////////////////////// 00100 // Function: TextEncoder::is_wtext 00101 // Access: Published 00102 // Description: Returns true if any of the characters in the string 00103 // returned by get_wtext() are out of the range of an 00104 // ASCII character (and, therefore, get_wtext() should 00105 // be called in preference to get_text()). 00106 //////////////////////////////////////////////////////////////////// 00107 bool TextEncoder:: 00108 is_wtext() const { 00109 get_wtext(); 00110 wstring::const_iterator ti; 00111 for (ti = _wtext.begin(); ti != _wtext.end(); ++ti) { 00112 if (((*ti) & ~0x7f) != 0) { 00113 return true; 00114 } 00115 } 00116 00117 return false; 00118 } 00119 00120 //////////////////////////////////////////////////////////////////// 00121 // Function: TextEncoder::encode_wchar 00122 // Access: Published, Static 00123 // Description: Encodes a single wide char into a one-, two-, or 00124 // three-byte string, according to the given encoding 00125 // system. 
00126 //////////////////////////////////////////////////////////////////// 00127 string TextEncoder:: 00128 encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) { 00129 switch (encoding) { 00130 case E_iso8859: 00131 if ((ch & ~0xff) == 0) { 00132 return string(1, (char)ch); 00133 } else { 00134 // The character won't fit in the 8-bit ISO 8859. See if we can 00135 // make it fit by reducing it to its ascii equivalent 00136 // (essentially stripping off an unusual accent mark). 00137 const UnicodeLatinMap::Entry *map_entry = 00138 UnicodeLatinMap::look_up(ch); 00139 if (map_entry != NULL && map_entry->_ascii_equiv != 0) { 00140 // Yes, it has an ascii equivalent. 00141 if (map_entry->_ascii_additional != 0) { 00142 // In fact, it has two of them. 00143 return 00144 string(1, map_entry->_ascii_equiv) + 00145 string(1, map_entry->_ascii_additional); 00146 } 00147 return string(1, map_entry->_ascii_equiv); 00148 } 00149 // Nope; return "." for lack of anything better. 00150 return "."; 00151 } 00152 00153 case E_utf8: 00154 if ((ch & ~0x7f) == 0) { 00155 return string(1, (char)ch); 00156 } else if ((ch & ~0x7ff) == 0) { 00157 return 00158 string(1, (char)((ch >> 6) | 0xc0)) + 00159 string(1, (char)((ch & 0x3f) | 0x80)); 00160 } else { 00161 return 00162 string(1, (char)((ch >> 12) | 0xe0)) + 00163 string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) + 00164 string(1, (char)((ch & 0x3f) | 0x80)); 00165 } 00166 00167 case E_unicode: 00168 return 00169 string(1, (char)(ch >> 8)) + 00170 string(1, (char)(ch & 0xff)); 00171 } 00172 00173 return ""; 00174 } 00175 00176 //////////////////////////////////////////////////////////////////// 00177 // Function: TextEncoder::encode_wtext 00178 // Access: Published, Static 00179 // Description: Encodes a wide-text string into a single-char string, 00180 // according to the given encoding. 
00181 //////////////////////////////////////////////////////////////////// 00182 string TextEncoder:: 00183 encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) { 00184 string result; 00185 00186 for (wstring::const_iterator pi = wtext.begin(); pi != wtext.end(); ++pi) { 00187 result += encode_wchar(*pi, encoding); 00188 } 00189 00190 return result; 00191 } 00192 00193 //////////////////////////////////////////////////////////////////// 00194 // Function: TextEncoder::decode_text 00195 // Access: Published, Static 00196 // Description: Returns the given wstring decoded to a single-byte 00197 // string, via the given encoding system. 00198 //////////////////////////////////////////////////////////////////// 00199 wstring TextEncoder:: 00200 decode_text(const string &text, TextEncoder::Encoding encoding) { 00201 switch (encoding) { 00202 case E_utf8: 00203 { 00204 StringUtf8Decoder decoder(text); 00205 return decode_text_impl(decoder); 00206 } 00207 00208 case E_unicode: 00209 { 00210 StringUnicodeDecoder decoder(text); 00211 return decode_text_impl(decoder); 00212 } 00213 00214 case E_iso8859: 00215 default: 00216 { 00217 StringDecoder decoder(text); 00218 return decode_text_impl(decoder); 00219 } 00220 }; 00221 } 00222 00223 //////////////////////////////////////////////////////////////////// 00224 // Function: TextEncoder::decode_text_impl 00225 // Access: Private, Static 00226 // Description: Decodes the eight-bit stream from the indicated 00227 // decoder, returning the decoded wide-char string. 00228 //////////////////////////////////////////////////////////////////// 00229 wstring TextEncoder:: 00230 decode_text_impl(StringDecoder &decoder) { 00231 wstring result; 00232 // bool expand_amp = get_expand_amp(); 00233 00234 wchar_t character = decoder.get_next_character(); 00235 while (!decoder.is_eof()) { 00236 /* 00237 if (character == '&' && expand_amp) { 00238 // An ampersand in expand_amp mode is treated as an escape 00239 // character. 
00240 character = expand_amp_sequence(decoder); 00241 } 00242 */ 00243 result += character; 00244 character = decoder.get_next_character(); 00245 } 00246 00247 return result; 00248 } 00249 00250 /* 00251 //////////////////////////////////////////////////////////////////// 00252 // Function: TextEncoder::expand_amp_sequence 00253 // Access: Private 00254 // Description: Given that we have just read an ampersand from the 00255 // StringDecoder, and that we have expand_amp in effect 00256 // and are therefore expected to expand the sequence 00257 // that this ampersand begins into a single unicode 00258 // character, do the expansion and return the character. 00259 //////////////////////////////////////////////////////////////////// 00260 int TextEncoder:: 00261 expand_amp_sequence(StringDecoder &decoder) const { 00262 int result = 0; 00263 00264 int character = decoder.get_next_character(); 00265 if (!decoder.is_eof() && character == '#') { 00266 // An explicit numeric sequence: &#nnn; 00267 result = 0; 00268 character = decoder.get_next_character(); 00269 while (!decoder.is_eof() && character < 128 && isdigit((unsigned int)character)) { 00270 result = (result * 10) + (character - '0'); 00271 character = decoder.get_next_character(); 00272 } 00273 if (character != ';') { 00274 // Invalid sequence. 00275 return 0; 00276 } 00277 00278 return result; 00279 } 00280 00281 string sequence; 00282 00283 // Some non-numeric sequence. 00284 while (!decoder.is_eof() && character < 128 && isalpha((unsigned int)character)) { 00285 sequence += character; 00286 character = decoder.get_next_character(); 00287 } 00288 if (character != ';') { 00289 // Invalid sequence. 
00290 return 0; 00291 } 00292 00293 static const struct { 00294 const char *name; 00295 int code; 00296 } tokens[] = { 00297 { "amp", '&' }, { "lt", '<' }, { "gt", '>' }, { "quot", '"' }, 00298 { "nbsp", ' ' }, 00299 00300 { "iexcl", 161 }, { "cent", 162 }, { "pound", 163 }, { "curren", 164 }, 00301 { "yen", 165 }, { "brvbar", 166 }, { "brkbar", 166 }, { "sect", 167 }, 00302 { "uml", 168 }, { "die", 168 }, { "copy", 169 }, { "ordf", 170 }, 00303 { "laquo", 171 }, { "not", 172 }, { "shy", 173 }, { "reg", 174 }, 00304 { "macr", 175 }, { "hibar", 175 }, { "deg", 176 }, { "plusmn", 177 }, 00305 { "sup2", 178 }, { "sup3", 179 }, { "acute", 180 }, { "micro", 181 }, 00306 { "para", 182 }, { "middot", 183 }, { "cedil", 184 }, { "sup1", 185 }, 00307 { "ordm", 186 }, { "raquo", 187 }, { "frac14", 188 }, { "frac12", 189 }, 00308 { "frac34", 190 }, { "iquest", 191 }, { "Agrave", 192 }, { "Aacute", 193 }, 00309 { "Acirc", 194 }, { "Atilde", 195 }, { "Auml", 196 }, { "Aring", 197 }, 00310 { "AElig", 198 }, { "Ccedil", 199 }, { "Egrave", 200 }, { "Eacute", 201 }, 00311 { "Ecirc", 202 }, { "Euml", 203 }, { "Igrave", 204 }, { "Iacute", 205 }, 00312 { "Icirc", 206 }, { "Iuml", 207 }, { "ETH", 208 }, { "Dstrok", 208 }, 00313 { "Ntilde", 209 }, { "Ograve", 210 }, { "Oacute", 211 }, { "Ocirc", 212 }, 00314 { "Otilde", 213 }, { "Ouml", 214 }, { "times", 215 }, { "Oslash", 216 }, 00315 { "Ugrave", 217 }, { "Uacute", 218 }, { "Ucirc", 219 }, { "Uuml", 220 }, 00316 { "Yacute", 221 }, { "THORN", 222 }, { "szlig", 223 }, { "agrave", 224 }, 00317 { "aacute", 225 }, { "acirc", 226 }, { "atilde", 227 }, { "auml", 228 }, 00318 { "aring", 229 }, { "aelig", 230 }, { "ccedil", 231 }, { "egrave", 232 }, 00319 { "eacute", 233 }, { "ecirc", 234 }, { "euml", 235 }, { "igrave", 236 }, 00320 { "iacute", 237 }, { "icirc", 238 }, { "iuml", 239 }, { "eth", 240 }, 00321 { "ntilde", 241 }, { "ograve", 242 }, { "oacute", 243 }, { "ocirc", 244 }, 00322 { "otilde", 245 }, { "ouml", 246 }, { "divide", 247 }, { 
"oslash", 248 }, 00323 { "ugrave", 249 }, { "uacute", 250 }, { "ucirc", 251 }, { "uuml", 252 }, 00324 { "yacute", 253 }, { "thorn", 254 }, { "yuml", 255 }, 00325 00326 { NULL, 0 }, 00327 }; 00328 00329 for (int i = 0; tokens[i].name != NULL; i++) { 00330 if (sequence == tokens[i].name) { 00331 // Here's a match. 00332 return tokens[i].code; 00333 } 00334 } 00335 00336 // Some unrecognized sequence. 00337 return 0; 00338 } 00339 */ 00340 00341 00342 //////////////////////////////////////////////////////////////////// 00343 // Function: TextEncoder::Encoding ostream operator 00344 // Description: 00345 //////////////////////////////////////////////////////////////////// 00346 ostream & 00347 operator << (ostream &out, TextEncoder::Encoding encoding) { 00348 switch (encoding) { 00349 case TextEncoder::E_iso8859: 00350 return out << "iso8859"; 00351 00352 case TextEncoder::E_utf8: 00353 return out << "utf8"; 00354 00355 case TextEncoder::E_unicode: 00356 return out << "unicode"; 00357 }; 00358 00359 return out << "**invalid TextEncoder::Encoding(" << (int)encoding << ")**"; 00360 } 00361 00362 //////////////////////////////////////////////////////////////////// 00363 // Function: TextEncoder::Encoding istream operator 00364 // Description: 00365 //////////////////////////////////////////////////////////////////// 00366 istream & 00367 operator >> (istream &in, TextEncoder::Encoding &encoding) { 00368 string word; 00369 in >> word; 00370 00371 if (word == "iso8859") { 00372 encoding = TextEncoder::E_iso8859; 00373 } else if (word == "utf8" || word == "utf-8") { 00374 encoding = TextEncoder::E_utf8; 00375 } else if (word == "unicode") { 00376 encoding = TextEncoder::E_unicode; 00377 } else { 00378 ostream *notify_ptr = StringDecoder::get_notify_ptr(); 00379 if (notify_ptr != (ostream *)NULL) { 00380 (*notify_ptr) 00381 << "Invalid TextEncoder::Encoding: " << word << "\n"; 00382 } 00383 encoding = TextEncoder::E_iso8859; 00384 } 00385 00386 return in; 00387 }