Panda3D
|
00001 // Filename: textEncoder.I 00002 // Created by: drose (26Mar03) 00003 // 00004 //////////////////////////////////////////////////////////////////// 00005 // 00006 // PANDA 3D SOFTWARE 00007 // Copyright (c) Carnegie Mellon University. All rights reserved. 00008 // 00009 // All use of this software is subject to the terms of the revised BSD 00010 // license. You should have received a copy of this license along 00011 // with this source code in a file named "LICENSE." 00012 // 00013 //////////////////////////////////////////////////////////////////// 00014 00015 00016 //////////////////////////////////////////////////////////////////// 00017 // Function: TextEncoder::Constructor 00018 // Access: Published 00019 // Description: 00020 //////////////////////////////////////////////////////////////////// 00021 INLINE TextEncoder:: 00022 TextEncoder() { 00023 _encoding = _default_encoding; 00024 00025 // Initially, since the text string is empty, we know that both 00026 // _text and _wtext accurately reflect the empty state; so we "got" 00027 // both of them. 00028 _flags = (F_got_text | F_got_wtext); 00029 } 00030 00031 //////////////////////////////////////////////////////////////////// 00032 // Function: TextEncoder::Copy Constructor 00033 // Access: Published 00034 // Description: 00035 //////////////////////////////////////////////////////////////////// 00036 INLINE TextEncoder:: 00037 TextEncoder(const TextEncoder ©) : 00038 _flags(copy._flags), 00039 _encoding(copy._encoding), 00040 _text(copy._text), 00041 _wtext(copy._wtext) 00042 { 00043 } 00044 00045 //////////////////////////////////////////////////////////////////// 00046 // Function: TextEncoder::set_encoding 00047 // Access: Published 00048 // Description: Specifies how the string set via set_text() is to be 00049 // interpreted. The default, E_iso8859, means a 00050 // standard string with one-byte characters 00051 // (i.e. ASCII). Other encodings are possible to take 00052 // advantage of character sets with more than 256 00053 // characters. 00054 // 00055 // This affects only future calls to set_text(); it does 00056 // not change text that was set previously. 00057 //////////////////////////////////////////////////////////////////// 00058 INLINE void TextEncoder:: 00059 set_encoding(TextEncoder::Encoding encoding) { 00060 // Force the previously-set strings to be encoded or decoded now. 00061 get_text(); 00062 get_wtext(); 00063 _encoding = encoding; 00064 } 00065 00066 //////////////////////////////////////////////////////////////////// 00067 // Function: TextEncoder::get_encoding 00068 // Access: Published 00069 // Description: Returns the encoding by which the string set via 00070 // set_text() is to be interpreted. See set_encoding(). 00071 //////////////////////////////////////////////////////////////////// 00072 INLINE TextEncoder::Encoding TextEncoder:: 00073 get_encoding() const { 00074 return _encoding; 00075 } 00076 00077 //////////////////////////////////////////////////////////////////// 00078 // Function: TextEncoder::set_default_encoding 00079 // Access: Published, Static 00080 // Description: Specifies the default encoding to be used for all 00081 // subsequently created TextEncoder objects. See 00082 // set_encoding(). 00083 //////////////////////////////////////////////////////////////////// 00084 INLINE void TextEncoder:: 00085 set_default_encoding(TextEncoder::Encoding encoding) { 00086 _default_encoding = encoding; 00087 } 00088 00089 //////////////////////////////////////////////////////////////////// 00090 // Function: TextEncoder::get_default_encoding 00091 // Access: Published, Static 00092 // Description: Specifies the default encoding to be used for all 00093 // subsequently created TextEncoder objects. See 00094 // set_encoding(). 00095 //////////////////////////////////////////////////////////////////// 00096 INLINE TextEncoder::Encoding TextEncoder:: 00097 get_default_encoding() { 00098 return _default_encoding; 00099 } 00100 00101 //////////////////////////////////////////////////////////////////// 00102 // Function: TextEncoder::set_text 00103 // Access: Published 00104 // Description: Changes the text that is stored in the encoder. The 00105 // text should be encoded according to the method 00106 // indicated by set_encoding(). Subsequent calls to 00107 // get_text() will return this same string, while 00108 // get_wtext() will return the decoded version of the 00109 // string. 00110 //////////////////////////////////////////////////////////////////// 00111 INLINE void TextEncoder:: 00112 set_text(const string &text) { 00113 if (!has_text() || _text != text) { 00114 _text = text; 00115 _flags = (_flags | F_got_text) & ~F_got_wtext; 00116 } 00117 } 00118 00119 //////////////////////////////////////////////////////////////////// 00120 // Function: TextEncoder::set_text 00121 // Access: Published 00122 // Description: The two-parameter version of set_text() accepts an 00123 // explicit encoding; the text is immediately decoded 00124 // and stored as a wide-character string. Subsequent 00125 // calls to get_text() will return the same text 00126 // re-encoded using whichever encoding is specified by 00127 // set_encoding(). 00128 //////////////////////////////////////////////////////////////////// 00129 INLINE void TextEncoder:: 00130 set_text(const string &text, TextEncoder::Encoding encoding) { 00131 set_wtext(decode_text(text, encoding)); 00132 } 00133 00134 //////////////////////////////////////////////////////////////////// 00135 // Function: TextEncoder::clear_text 00136 // Access: Published 00137 // Description: Removes the text from the TextEncoder. 00138 //////////////////////////////////////////////////////////////////// 00139 INLINE void TextEncoder:: 00140 clear_text() { 00141 _text = string(); 00142 _wtext = wstring(); 00143 _flags |= (F_got_text | F_got_wtext); 00144 } 00145 00146 //////////////////////////////////////////////////////////////////// 00147 // Function: TextEncoder::has_text 00148 // Access: Published 00149 // Description: 00150 //////////////////////////////////////////////////////////////////// 00151 INLINE bool TextEncoder:: 00152 has_text() const { 00153 if (_flags & F_got_wtext) { 00154 return !_wtext.empty(); 00155 } else { 00156 return !_text.empty(); 00157 } 00158 } 00159 00160 //////////////////////////////////////////////////////////////////// 00161 // Function: TextEncoder::get_text 00162 // Access: Published 00163 // Description: Returns the current text, as encoded via the current 00164 // encoding system. 00165 //////////////////////////////////////////////////////////////////// 00166 INLINE string TextEncoder:: 00167 get_text() const { 00168 if ((_flags & F_got_text) == 0) { 00169 ((TextEncoder *)this)->_text = encode_wtext(_wtext); 00170 ((TextEncoder *)this)->_flags |= F_got_text; 00171 } 00172 return _text; 00173 } 00174 00175 //////////////////////////////////////////////////////////////////// 00176 // Function: TextEncoder::get_text 00177 // Access: Published 00178 // Description: Returns the current text, as encoded via the indicated 00179 // encoding system. 00180 //////////////////////////////////////////////////////////////////// 00181 INLINE string TextEncoder:: 00182 get_text(TextEncoder::Encoding encoding) const { 00183 return encode_wtext(get_wtext(), encoding); 00184 } 00185 00186 //////////////////////////////////////////////////////////////////// 00187 // Function: TextEncoder::append_text 00188 // Access: Published 00189 // Description: Appends the indicates string to the end of the stored 00190 // text. 00191 //////////////////////////////////////////////////////////////////// 00192 INLINE void TextEncoder:: 00193 append_text(const string &text) { 00194 _text = get_text() + text; 00195 _flags = (_flags | F_got_text) & ~F_got_wtext; 00196 } 00197 00198 //////////////////////////////////////////////////////////////////// 00199 // Function: TextEncoder::append_unicode_char 00200 // Access: Published 00201 // Description: Appends a single character to the end of the stored 00202 // text. This may be a wide character, up to 16 bits in 00203 // Unicode. 00204 //////////////////////////////////////////////////////////////////// 00205 INLINE void TextEncoder:: 00206 append_unicode_char(int character) { 00207 _wtext = get_wtext() + wstring(1, (wchar_t)character); 00208 _flags = (_flags | F_got_wtext) & ~F_got_text; 00209 } 00210 00211 //////////////////////////////////////////////////////////////////// 00212 // Function: TextEncoder::get_num_chars 00213 // Access: Published 00214 // Description: Returns the number of characters in the stored text. 00215 // This is a count of wide characters, after the string 00216 // has been decoded according to set_encoding(). 00217 //////////////////////////////////////////////////////////////////// 00218 INLINE int TextEncoder:: 00219 get_num_chars() const { 00220 return get_wtext().length(); 00221 } 00222 00223 //////////////////////////////////////////////////////////////////// 00224 // Function: TextEncoder::get_unicode_char 00225 // Access: Published 00226 // Description: Returns the Unicode value of the nth character in the 00227 // stored text. This may be a wide character (greater 00228 // than 255), after the string has been decoded 00229 // according to set_encoding(). 00230 //////////////////////////////////////////////////////////////////// 00231 INLINE int TextEncoder:: 00232 get_unicode_char(int index) const { 00233 get_wtext(); 00234 if (index >= 0 && index < (int)_wtext.length()) { 00235 return _wtext[index]; 00236 } 00237 return 0; 00238 } 00239 00240 //////////////////////////////////////////////////////////////////// 00241 // Function: TextEncoder::set_unicode_char 00242 // Access: Published 00243 // Description: Sets the Unicode value of the nth character in the 00244 // stored text. This may be a wide character (greater 00245 // than 255), after the string has been decoded 00246 // according to set_encoding(). 00247 //////////////////////////////////////////////////////////////////// 00248 INLINE void TextEncoder:: 00249 set_unicode_char(int index, int character) { 00250 get_wtext(); 00251 if (index >= 0 && index < (int)_wtext.length()) { 00252 _wtext[index] = character; 00253 _flags &= ~F_got_text; 00254 } 00255 } 00256 00257 //////////////////////////////////////////////////////////////////// 00258 // Function: TextEncoder::get_encoded_char 00259 // Access: Published 00260 // Description: Returns the nth char of the stored text, as a one-, 00261 // two-, or three-byte encoded string. 00262 //////////////////////////////////////////////////////////////////// 00263 INLINE string TextEncoder:: 00264 get_encoded_char(int index) const { 00265 return get_encoded_char(index, get_encoding()); 00266 } 00267 00268 //////////////////////////////////////////////////////////////////// 00269 // Function: TextEncoder::get_encoded_char 00270 // Access: Published 00271 // Description: Returns the nth char of the stored text, as a one-, 00272 // two-, or three-byte encoded string. 00273 //////////////////////////////////////////////////////////////////// 00274 INLINE string TextEncoder:: 00275 get_encoded_char(int index, TextEncoder::Encoding encoding) const { 00276 wstring wch(1, (wchar_t)get_unicode_char(index)); 00277 return encode_wtext(wch, encoding); 00278 } 00279 00280 //////////////////////////////////////////////////////////////////// 00281 // Function: TextEncoder::get_text_as_ascii 00282 // Access: Published 00283 // Description: Returns the text associated with the node, converted 00284 // as nearly as possible to a fully-ASCII 00285 // representation. This means replacing accented 00286 // letters with their unaccented ASCII equivalents. 00287 // 00288 // It is possible that some characters in the string 00289 // cannot be converted to ASCII. (The string may 00290 // involve symbols like the copyright symbol, for 00291 // instance, or it might involve letters in some other 00292 // alphabet such as Greek or Cyrillic, or even Latin 00293 // letters like thorn or eth that are not part of the 00294 // ASCII character set.) In this case, as much of the 00295 // string as possible will be converted to ASCII, and 00296 // the nonconvertible characters will remain encoded in 00297 // the encoding specified by set_encoding(). 00298 //////////////////////////////////////////////////////////////////// 00299 INLINE string TextEncoder:: 00300 get_text_as_ascii() const { 00301 return encode_wtext(get_wtext_as_ascii()); 00302 } 00303 00304 //////////////////////////////////////////////////////////////////// 00305 // Function: TextEncoder::reencode_text 00306 // Access: Published, Static 00307 // Description: Given the indicated text string, which is assumed to 00308 // be encoded via the encoding "from", decodes it and 00309 // then reencodes it into the encoding "to", and returns 00310 // the newly encoded string. This does not change or 00311 // affect any properties on the TextEncoder itself. 00312 //////////////////////////////////////////////////////////////////// 00313 INLINE string TextEncoder:: 00314 reencode_text(const string &text, TextEncoder::Encoding from, 00315 TextEncoder::Encoding to) { 00316 return encode_wtext(decode_text(text, from), to); 00317 } 00318 00319 //////////////////////////////////////////////////////////////////// 00320 // Function: TextEncoder::unicode_isalpha 00321 // Access: Published, Static 00322 // Description: Returns true if the indicated character is an 00323 // alphabetic letter, false otherwise. This is akin to 00324 // ctype's isalpha(), extended to Unicode. 00325 //////////////////////////////////////////////////////////////////// 00326 INLINE bool TextEncoder:: 00327 unicode_isalpha(int character) { 00328 const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character); 00329 if (entry == (const UnicodeLatinMap::Entry *)NULL) { 00330 return false; 00331 } 00332 return entry->_char_type == UnicodeLatinMap::CT_upper || 00333 entry->_char_type == UnicodeLatinMap::CT_lower; 00334 } 00335 00336 //////////////////////////////////////////////////////////////////// 00337 // Function: TextEncoder::unicode_isdigit 00338 // Access: Published, Static 00339 // Description: Returns true if the indicated character is a 00340 // numeric digit, false otherwise. This is akin to 00341 // ctype's isdigit(), extended to Unicode. 00342 //////////////////////////////////////////////////////////////////// 00343 INLINE bool TextEncoder:: 00344 unicode_isdigit(int character) { 00345 const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character); 00346 if (entry == (const UnicodeLatinMap::Entry *)NULL) { 00347 // The digits aren't actually listed in the map. 00348 return (character >= '0' && character <= '9'); 00349 } 00350 // This silly test (!= 0) is necessary to prevent a VC++ warning. 00351 return (isdigit(entry->_ascii_equiv) != 0); 00352 } 00353 00354 //////////////////////////////////////////////////////////////////// 00355 // Function: TextEncoder::unicode_ispunct 00356 // Access: Published, Static 00357 // Description: Returns true if the indicated character is a 00358 // punctuation mark, false otherwise. This is akin to 00359 // ctype's ispunct(), extended to Unicode. 00360 //////////////////////////////////////////////////////////////////// 00361 INLINE bool TextEncoder:: 00362 unicode_ispunct(int character) { 00363 const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character); 00364 if (entry == (const UnicodeLatinMap::Entry *)NULL) { 00365 // Some punctuation marks aren't listed in the map. 00366 return (character >= 0 && character < 128 && ispunct(character)); 00367 } 00368 return entry->_char_type == UnicodeLatinMap::CT_punct; 00369 } 00370 00371 //////////////////////////////////////////////////////////////////// 00372 // Function: TextEncoder::unicode_isupper 00373 // Access: Published, Static 00374 // Description: Returns true if the indicated character is an 00375 // uppercase letter, false otherwise. This is akin to 00376 // ctype's isupper(), extended to Unicode. 00377 //////////////////////////////////////////////////////////////////// 00378 INLINE bool TextEncoder:: 00379 unicode_isupper(int character) { 00380 const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character); 00381 if (entry == (const UnicodeLatinMap::Entry *)NULL) { 00382 return false; 00383 } 00384 return entry->_char_type == UnicodeLatinMap::CT_upper; 00385 } 00386 00387 //////////////////////////////////////////////////////////////////// 00388 // Function: TextEncoder::unicode_isspace 00389 // Access: Published, Static 00390 // Description: Returns true if the indicated character is a 00391 // whitespace letter, false otherwise. This is akin to 00392 // ctype's isspace(), extended to Unicode. 00393 //////////////////////////////////////////////////////////////////// 00394 INLINE bool TextEncoder:: 00395 unicode_isspace(int character) { 00396 switch (character) { 00397 case ' ': 00398 case '\t': 00399 case '\n': 00400 return true; 00401 00402 default: 00403 return false; 00404 } 00405 } 00406 00407 //////////////////////////////////////////////////////////////////// 00408 // Function: TextEncoder::unicode_islower 00409 // Access: Published, Static 00410 // Description: Returns true if the indicated character is a 00411 // lowercase letter, false otherwise. This is akin to 00412 // ctype's islower(), extended to Unicode. 00413 //////////////////////////////////////////////////////////////////// 00414 INLINE bool TextEncoder:: 00415 unicode_islower(int character) { 00416 const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character); 00417 if (entry == (const UnicodeLatinMap::Entry *)NULL) { 00418 return false; 00419 } 00420 return entry->_char_type == UnicodeLatinMap::CT_lower; 00421 } 00422 00423 //////////////////////////////////////////////////////////////////// 00424 // Function: TextEncoder::unicode_toupper 00425 // Access: Published, Static 00426 // Description: Returns the uppercase equivalent of the given Unicode 00427 // character. This is akin to ctype's toupper(), 00428 // extended to Unicode. 00429 //////////////////////////////////////////////////////////////////// 00430 INLINE int TextEncoder:: 00431 unicode_toupper(int character) { 00432 const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character); 00433 if (entry == (const UnicodeLatinMap::Entry *)NULL) { 00434 return character; 00435 } 00436 return entry->_toupper_character; 00437 } 00438 00439 //////////////////////////////////////////////////////////////////// 00440 // Function: TextEncoder::unicode_tolower 00441 // Access: Published, Static 00442 // Description: Returns the uppercase equivalent of the given Unicode 00443 // character. This is akin to ctype's tolower(), 00444 // extended to Unicode. 00445 //////////////////////////////////////////////////////////////////// 00446 INLINE int TextEncoder:: 00447 unicode_tolower(int character) { 00448 const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character); 00449 if (entry == (const UnicodeLatinMap::Entry *)NULL) { 00450 return character; 00451 } 00452 return entry->_tolower_character; 00453 } 00454 00455 //////////////////////////////////////////////////////////////////// 00456 // Function: TextEncoder::upper 00457 // Access: Published, Static 00458 // Description: Converts the string to uppercase, assuming the string 00459 // is encoded in the default encoding. 00460 //////////////////////////////////////////////////////////////////// 00461 INLINE string TextEncoder:: 00462 upper(const string &source) { 00463 return upper(source, get_default_encoding()); 00464 } 00465 00466 //////////////////////////////////////////////////////////////////// 00467 // Function: TextEncoder::upper 00468 // Access: Published, Static 00469 // Description: Converts the string to uppercase, assuming the string 00470 // is encoded in the indicated encoding. 00471 //////////////////////////////////////////////////////////////////// 00472 INLINE string TextEncoder:: 00473 upper(const string &source, TextEncoder::Encoding encoding) { 00474 TextEncoder encoder; 00475 encoder.set_encoding(encoding); 00476 encoder.set_text(source); 00477 encoder.make_upper(); 00478 return encoder.get_text(); 00479 } 00480 00481 //////////////////////////////////////////////////////////////////// 00482 // Function: TextEncoder::lower 00483 // Access: Published, Static 00484 // Description: Converts the string to lowercase, assuming the string 00485 // is encoded in the default encoding. 00486 //////////////////////////////////////////////////////////////////// 00487 INLINE string TextEncoder:: 00488 lower(const string &source) { 00489 return lower(source, get_default_encoding()); 00490 } 00491 00492 //////////////////////////////////////////////////////////////////// 00493 // Function: TextEncoder::lower 00494 // Access: Published, Static 00495 // Description: Converts the string to lowercase, assuming the string 00496 // is encoded in the indicated encoding. 00497 //////////////////////////////////////////////////////////////////// 00498 INLINE string TextEncoder:: 00499 lower(const string &source, TextEncoder::Encoding encoding) { 00500 TextEncoder encoder; 00501 encoder.set_encoding(encoding); 00502 encoder.set_text(source); 00503 encoder.make_lower(); 00504 return encoder.get_text(); 00505 } 00506 00507 //////////////////////////////////////////////////////////////////// 00508 // Function: TextEncoder::set_wtext 00509 // Access: Published 00510 // Description: Changes the text that is stored in the encoder. 00511 // Subsequent calls to get_wtext() will return this same 00512 // string, while get_text() will return the encoded 00513 // version of the string. 00514 //////////////////////////////////////////////////////////////////// 00515 INLINE void TextEncoder:: 00516 set_wtext(const wstring &wtext) { 00517 if (!has_text() || _wtext != wtext) { 00518 _wtext = wtext; 00519 _flags = (_flags | F_got_wtext) & ~F_got_text; 00520 } 00521 } 00522 00523 //////////////////////////////////////////////////////////////////// 00524 // Function: TextEncoder::get_wtext 00525 // Access: Published 00526 // Description: Returns the text associated with the TextEncoder, as 00527 // a wide-character string. 00528 //////////////////////////////////////////////////////////////////// 00529 INLINE const wstring &TextEncoder:: 00530 get_wtext() const { 00531 if ((_flags & F_got_wtext) == 0) { 00532 ((TextEncoder *)this)->_wtext = decode_text(_text); 00533 ((TextEncoder *)this)->_flags |= F_got_wtext; 00534 } 00535 return _wtext; 00536 } 00537 00538 //////////////////////////////////////////////////////////////////// 00539 // Function: TextEncoder::append_wtext 00540 // Access: Published 00541 // Description: Appends the indicates string to the end of the stored 00542 // wide-character text. 00543 //////////////////////////////////////////////////////////////////// 00544 INLINE void TextEncoder:: 00545 append_wtext(const wstring &wtext) { 00546 _wtext = get_wtext() + wtext; 00547 _flags = (_flags | F_got_wtext) & ~F_got_text; 00548 } 00549 00550 //////////////////////////////////////////////////////////////////// 00551 // Function: TextEncoder::encode_wtext 00552 // Access: Published 00553 // Description: Encodes a wide-text string into a single-char string, 00554 // according to the current encoding. 00555 //////////////////////////////////////////////////////////////////// 00556 INLINE string TextEncoder:: 00557 encode_wtext(const wstring &wtext) const { 00558 return encode_wtext(wtext, _encoding); 00559 } 00560 00561 //////////////////////////////////////////////////////////////////// 00562 // Function: TextEncoder::decode_text 00563 // Access: Published 00564 // Description: Returns the given wstring decoded to a single-byte 00565 // string, via the current encoding system. 00566 //////////////////////////////////////////////////////////////////// 00567 INLINE wstring TextEncoder:: 00568 decode_text(const string &text) const { 00569 return decode_text(text, _encoding); 00570 } 00571 00572 //////////////////////////////////////////////////////////////////// 00573 // Function: wstring ostream operator 00574 // Description: Uses the current default encoding to output the 00575 // wstring. 00576 //////////////////////////////////////////////////////////////////// 00577 INLINE ostream & 00578 operator << (ostream &out, const wstring &str) { 00579 TextEncoder encoder; 00580 encoder.set_wtext(str); 00581 out << encoder.get_text(); 00582 return out; 00583 }