Panda3D
 All Classes Functions Variables Enumerations
textEncoder.I
00001 // Filename: textEncoder.I
00002 // Created by:  drose (26Mar03)
00003 //
00004 ////////////////////////////////////////////////////////////////////
00005 //
00006 // PANDA 3D SOFTWARE
00007 // Copyright (c) Carnegie Mellon University.  All rights reserved.
00008 //
00009 // All use of this software is subject to the terms of the revised BSD
00010 // license.  You should have received a copy of this license along
00011 // with this source code in a file named "LICENSE."
00012 //
00013 ////////////////////////////////////////////////////////////////////
00014 
00015 
00016 ////////////////////////////////////////////////////////////////////
00017 //     Function: TextEncoder::Constructor
00018 //       Access: Published
00019 //  Description:
00020 ////////////////////////////////////////////////////////////////////
00021 INLINE TextEncoder::
00022 TextEncoder() {
00023   _encoding = _default_encoding;
00024   
00025   // Initially, since the text string is empty, we know that both
00026   // _text and _wtext accurately reflect the empty state; so we "got"
00027   // both of them.
00028   _flags = (F_got_text | F_got_wtext);
00029 }
00030 
00031 ////////////////////////////////////////////////////////////////////
00032 //     Function: TextEncoder::Copy Constructor
00033 //       Access: Published
00034 //  Description:
00035 ////////////////////////////////////////////////////////////////////
00036 INLINE TextEncoder::
00037 TextEncoder(const TextEncoder &copy) :
00038   _flags(copy._flags),
00039   _encoding(copy._encoding),
00040   _text(copy._text),
00041   _wtext(copy._wtext)
00042 {
00043 }
00044 
00045 ////////////////////////////////////////////////////////////////////
00046 //     Function: TextEncoder::set_encoding
00047 //       Access: Published
00048 //  Description: Specifies how the string set via set_text() is to be
00049 //               interpreted.  The default, E_iso8859, means a
00050 //               standard string with one-byte characters
00051 //               (i.e. ASCII).  Other encodings are possible to take
00052 //               advantage of character sets with more than 256
00053 //               characters.
00054 //
00055 //               This affects only future calls to set_text(); it does
00056 //               not change text that was set previously.
00057 ////////////////////////////////////////////////////////////////////
00058 INLINE void TextEncoder::
00059 set_encoding(TextEncoder::Encoding encoding) {
00060   // Force the previously-set strings to be encoded or decoded now.
00061   get_text();
00062   get_wtext();
00063   _encoding = encoding;
00064 }
00065 
00066 ////////////////////////////////////////////////////////////////////
00067 //     Function: TextEncoder::get_encoding
00068 //       Access: Published
00069 //  Description: Returns the encoding by which the string set via
00070 //               set_text() is to be interpreted.  See set_encoding().
00071 ////////////////////////////////////////////////////////////////////
00072 INLINE TextEncoder::Encoding TextEncoder::
00073 get_encoding() const {
00074   return _encoding;
00075 }
00076 
00077 ////////////////////////////////////////////////////////////////////
00078 //     Function: TextEncoder::set_default_encoding
00079 //       Access: Published, Static
00080 //  Description: Specifies the default encoding to be used for all
00081 //               subsequently created TextEncoder objects.  See
00082 //               set_encoding().
00083 ////////////////////////////////////////////////////////////////////
00084 INLINE void TextEncoder::
00085 set_default_encoding(TextEncoder::Encoding encoding) {
00086   _default_encoding = encoding;
00087 }
00088 
00089 ////////////////////////////////////////////////////////////////////
00090 //     Function: TextEncoder::get_default_encoding
00091 //       Access: Published, Static
00092 //  Description: Specifies the default encoding to be used for all
00093 //               subsequently created TextEncoder objects.  See
00094 //               set_encoding().
00095 ////////////////////////////////////////////////////////////////////
00096 INLINE TextEncoder::Encoding TextEncoder::
00097 get_default_encoding() {
00098   return _default_encoding;
00099 }
00100 
00101 ////////////////////////////////////////////////////////////////////
00102 //     Function: TextEncoder::set_text
00103 //       Access: Published
00104 //  Description: Changes the text that is stored in the encoder.  The
00105 //               text should be encoded according to the method
00106 //               indicated by set_encoding().  Subsequent calls to
00107 //               get_text() will return this same string, while
00108 //               get_wtext() will return the decoded version of the
00109 //               string.
00110 ////////////////////////////////////////////////////////////////////
00111 INLINE void TextEncoder::
00112 set_text(const string &text) {
00113   if (!has_text() || _text != text) {
00114     _text = text;
00115     _flags = (_flags | F_got_text) & ~F_got_wtext;
00116   }
00117 }
00118 
00119 ////////////////////////////////////////////////////////////////////
00120 //     Function: TextEncoder::set_text
00121 //       Access: Published
00122 //  Description: The two-parameter version of set_text() accepts an
00123 //               explicit encoding; the text is immediately decoded
00124 //               and stored as a wide-character string.  Subsequent
00125 //               calls to get_text() will return the same text
00126 //               re-encoded using whichever encoding is specified by
00127 //               set_encoding().
00128 ////////////////////////////////////////////////////////////////////
00129 INLINE void TextEncoder::
00130 set_text(const string &text, TextEncoder::Encoding encoding) {
00131   set_wtext(decode_text(text, encoding));
00132 }
00133 
00134 ////////////////////////////////////////////////////////////////////
00135 //     Function: TextEncoder::clear_text
00136 //       Access: Published
00137 //  Description: Removes the text from the TextEncoder.
00138 ////////////////////////////////////////////////////////////////////
00139 INLINE void TextEncoder::
00140 clear_text() {
00141   _text = string();
00142   _wtext = wstring();
00143   _flags |= (F_got_text | F_got_wtext);
00144 }
00145 
00146 ////////////////////////////////////////////////////////////////////
00147 //     Function: TextEncoder::has_text
00148 //       Access: Published
00149 //  Description:
00150 ////////////////////////////////////////////////////////////////////
00151 INLINE bool TextEncoder::
00152 has_text() const {
00153   if (_flags & F_got_wtext) {
00154     return !_wtext.empty();
00155   } else {
00156     return !_text.empty();
00157   }
00158 }
00159 
00160 ////////////////////////////////////////////////////////////////////
00161 //     Function: TextEncoder::get_text
00162 //       Access: Published
00163 //  Description: Returns the current text, as encoded via the current
00164 //               encoding system.
00165 ////////////////////////////////////////////////////////////////////
00166 INLINE string TextEncoder::
00167 get_text() const {
00168   if ((_flags & F_got_text) == 0) {
00169     ((TextEncoder *)this)->_text = encode_wtext(_wtext);
00170     ((TextEncoder *)this)->_flags |= F_got_text;
00171   }
00172   return _text;
00173 }
00174 
00175 ////////////////////////////////////////////////////////////////////
00176 //     Function: TextEncoder::get_text
00177 //       Access: Published
00178 //  Description: Returns the current text, as encoded via the indicated
00179 //               encoding system.
00180 ////////////////////////////////////////////////////////////////////
00181 INLINE string TextEncoder::
00182 get_text(TextEncoder::Encoding encoding) const {
00183   return encode_wtext(get_wtext(), encoding);
00184 }
00185 
00186 ////////////////////////////////////////////////////////////////////
00187 //     Function: TextEncoder::append_text
00188 //       Access: Published
00189 //  Description: Appends the indicates string to the end of the stored
00190 //               text.
00191 ////////////////////////////////////////////////////////////////////
00192 INLINE void TextEncoder::
00193 append_text(const string &text) {
00194   _text = get_text() + text;
00195   _flags = (_flags | F_got_text) & ~F_got_wtext;
00196 }
00197 
00198 ////////////////////////////////////////////////////////////////////
00199 //     Function: TextEncoder::append_unicode_char
00200 //       Access: Published
00201 //  Description: Appends a single character to the end of the stored
00202 //               text.  This may be a wide character, up to 16 bits in
00203 //               Unicode.
00204 ////////////////////////////////////////////////////////////////////
00205 INLINE void TextEncoder::
00206 append_unicode_char(int character) {
00207   _wtext = get_wtext() + wstring(1, (wchar_t)character);
00208   _flags = (_flags | F_got_wtext) & ~F_got_text;
00209 }
00210 
00211 ////////////////////////////////////////////////////////////////////
00212 //     Function: TextEncoder::get_num_chars
00213 //       Access: Published
00214 //  Description: Returns the number of characters in the stored text.
00215 //               This is a count of wide characters, after the string
00216 //               has been decoded according to set_encoding().
00217 ////////////////////////////////////////////////////////////////////
00218 INLINE int TextEncoder::
00219 get_num_chars() const {
00220   return get_wtext().length();
00221 }
00222 
00223 ////////////////////////////////////////////////////////////////////
00224 //     Function: TextEncoder::get_unicode_char
00225 //       Access: Published
00226 //  Description: Returns the Unicode value of the nth character in the
00227 //               stored text.  This may be a wide character (greater
00228 //               than 255), after the string has been decoded
00229 //               according to set_encoding().
00230 ////////////////////////////////////////////////////////////////////
00231 INLINE int TextEncoder::
00232 get_unicode_char(int index) const {
00233   get_wtext();
00234   if (index >= 0 && index < (int)_wtext.length()) {
00235     return _wtext[index];
00236   }
00237   return 0;
00238 }
00239 
00240 ////////////////////////////////////////////////////////////////////
00241 //     Function: TextEncoder::set_unicode_char
00242 //       Access: Published
00243 //  Description: Sets the Unicode value of the nth character in the
00244 //               stored text.  This may be a wide character (greater
00245 //               than 255), after the string has been decoded
00246 //               according to set_encoding().
00247 ////////////////////////////////////////////////////////////////////
00248 INLINE void TextEncoder::
00249 set_unicode_char(int index, int character) {
00250   get_wtext();
00251   if (index >= 0 && index < (int)_wtext.length()) {
00252     _wtext[index] = character;
00253     _flags &= ~F_got_text;
00254   }
00255 }
00256 
00257 ////////////////////////////////////////////////////////////////////
00258 //     Function: TextEncoder::get_encoded_char
00259 //       Access: Published
00260 //  Description: Returns the nth char of the stored text, as a one-,
00261 //               two-, or three-byte encoded string.
00262 ////////////////////////////////////////////////////////////////////
00263 INLINE string TextEncoder::
00264 get_encoded_char(int index) const {
00265   return get_encoded_char(index, get_encoding());
00266 }
00267 
00268 ////////////////////////////////////////////////////////////////////
00269 //     Function: TextEncoder::get_encoded_char
00270 //       Access: Published
00271 //  Description: Returns the nth char of the stored text, as a one-,
00272 //               two-, or three-byte encoded string.
00273 ////////////////////////////////////////////////////////////////////
00274 INLINE string TextEncoder::
00275 get_encoded_char(int index, TextEncoder::Encoding encoding) const {
00276   wstring wch(1, (wchar_t)get_unicode_char(index));
00277   return encode_wtext(wch, encoding);
00278 }
00279 
00280 ////////////////////////////////////////////////////////////////////
00281 //     Function: TextEncoder::get_text_as_ascii
00282 //       Access: Published
00283 //  Description: Returns the text associated with the node, converted
00284 //               as nearly as possible to a fully-ASCII
00285 //               representation.  This means replacing accented
00286 //               letters with their unaccented ASCII equivalents.
00287 //
00288 //               It is possible that some characters in the string
00289 //               cannot be converted to ASCII.  (The string may
00290 //               involve symbols like the copyright symbol, for
00291 //               instance, or it might involve letters in some other
00292 //               alphabet such as Greek or Cyrillic, or even Latin
00293 //               letters like thorn or eth that are not part of the
00294 //               ASCII character set.)  In this case, as much of the
00295 //               string as possible will be converted to ASCII, and
00296 //               the nonconvertible characters will remain encoded in
00297 //               the encoding specified by set_encoding().
00298 ////////////////////////////////////////////////////////////////////
00299 INLINE string TextEncoder::
00300 get_text_as_ascii() const {
00301   return encode_wtext(get_wtext_as_ascii());
00302 }
00303 
00304 ////////////////////////////////////////////////////////////////////
00305 //     Function: TextEncoder::reencode_text
00306 //       Access: Published, Static
00307 //  Description: Given the indicated text string, which is assumed to
00308 //               be encoded via the encoding "from", decodes it and
00309 //               then reencodes it into the encoding "to", and returns
00310 //               the newly encoded string.  This does not change or
00311 //               affect any properties on the TextEncoder itself.
00312 ////////////////////////////////////////////////////////////////////
00313 INLINE string TextEncoder::
00314 reencode_text(const string &text, TextEncoder::Encoding from, 
00315               TextEncoder::Encoding to) {
00316   return encode_wtext(decode_text(text, from), to);
00317 }
00318 
00319 ////////////////////////////////////////////////////////////////////
00320 //     Function: TextEncoder::unicode_isalpha
00321 //       Access: Published, Static
00322 //  Description: Returns true if the indicated character is an
00323 //               alphabetic letter, false otherwise.  This is akin to
00324 //               ctype's isalpha(), extended to Unicode.
00325 ////////////////////////////////////////////////////////////////////
00326 INLINE bool TextEncoder::
00327 unicode_isalpha(int character) {
00328   const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
00329   if (entry == (const UnicodeLatinMap::Entry *)NULL) {
00330     return false;
00331   }
00332   return entry->_char_type == UnicodeLatinMap::CT_upper ||
00333     entry->_char_type == UnicodeLatinMap::CT_lower;
00334 }
00335 
00336 ////////////////////////////////////////////////////////////////////
00337 //     Function: TextEncoder::unicode_isdigit
00338 //       Access: Published, Static
00339 //  Description: Returns true if the indicated character is a
00340 //               numeric digit, false otherwise.  This is akin to
00341 //               ctype's isdigit(), extended to Unicode.
00342 ////////////////////////////////////////////////////////////////////
00343 INLINE bool TextEncoder::
00344 unicode_isdigit(int character) {
00345   const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
00346   if (entry == (const UnicodeLatinMap::Entry *)NULL) {
00347     // The digits aren't actually listed in the map.
00348     return (character >= '0' && character <= '9');
00349   }
00350   // This silly test (!= 0) is necessary to prevent a VC++ warning.
00351   return (isdigit(entry->_ascii_equiv) != 0);
00352 }
00353 
00354 ////////////////////////////////////////////////////////////////////
00355 //     Function: TextEncoder::unicode_ispunct
00356 //       Access: Published, Static
00357 //  Description: Returns true if the indicated character is a
00358 //               punctuation mark, false otherwise.  This is akin to
00359 //               ctype's ispunct(), extended to Unicode.
00360 ////////////////////////////////////////////////////////////////////
00361 INLINE bool TextEncoder::
00362 unicode_ispunct(int character) {
00363   const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
00364   if (entry == (const UnicodeLatinMap::Entry *)NULL) {
00365     // Some punctuation marks aren't listed in the map.
00366     return (character >= 0 && character < 128 && ispunct(character));
00367   }
00368   return entry->_char_type == UnicodeLatinMap::CT_punct;
00369 }
00370 
00371 ////////////////////////////////////////////////////////////////////
00372 //     Function: TextEncoder::unicode_isupper
00373 //       Access: Published, Static
00374 //  Description: Returns true if the indicated character is an
00375 //               uppercase letter, false otherwise.  This is akin to
00376 //               ctype's isupper(), extended to Unicode.
00377 ////////////////////////////////////////////////////////////////////
00378 INLINE bool TextEncoder::
00379 unicode_isupper(int character) {
00380   const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
00381   if (entry == (const UnicodeLatinMap::Entry *)NULL) {
00382     return false;
00383   }
00384   return entry->_char_type == UnicodeLatinMap::CT_upper;
00385 }
00386 
00387 ////////////////////////////////////////////////////////////////////
00388 //     Function: TextEncoder::unicode_isspace
00389 //       Access: Published, Static
00390 //  Description: Returns true if the indicated character is a
00391 //               whitespace letter, false otherwise.  This is akin to
00392 //               ctype's isspace(), extended to Unicode.
00393 ////////////////////////////////////////////////////////////////////
00394 INLINE bool TextEncoder::
00395 unicode_isspace(int character) {
00396   switch (character) {
00397   case ' ':
00398   case '\t':
00399   case '\n':
00400     return true;
00401 
00402   default:
00403     return false;
00404   }
00405 }
00406 
00407 ////////////////////////////////////////////////////////////////////
00408 //     Function: TextEncoder::unicode_islower
00409 //       Access: Published, Static
00410 //  Description: Returns true if the indicated character is a
00411 //               lowercase letter, false otherwise.  This is akin to
00412 //               ctype's islower(), extended to Unicode.
00413 ////////////////////////////////////////////////////////////////////
00414 INLINE bool TextEncoder::
00415 unicode_islower(int character) {
00416   const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
00417   if (entry == (const UnicodeLatinMap::Entry *)NULL) {
00418     return false;
00419   }
00420   return entry->_char_type == UnicodeLatinMap::CT_lower;
00421 }
00422 
00423 ////////////////////////////////////////////////////////////////////
00424 //     Function: TextEncoder::unicode_toupper
00425 //       Access: Published, Static
00426 //  Description: Returns the uppercase equivalent of the given Unicode
00427 //               character.  This is akin to ctype's toupper(),
00428 //               extended to Unicode.
00429 ////////////////////////////////////////////////////////////////////
00430 INLINE int TextEncoder::
00431 unicode_toupper(int character) {
00432   const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
00433   if (entry == (const UnicodeLatinMap::Entry *)NULL) {
00434     return character;
00435   } 
00436   return entry->_toupper_character;
00437 }
00438 
00439 ////////////////////////////////////////////////////////////////////
00440 //     Function: TextEncoder::unicode_tolower
00441 //       Access: Published, Static
00442 //  Description: Returns the uppercase equivalent of the given Unicode
00443 //               character.  This is akin to ctype's tolower(),
00444 //               extended to Unicode.
00445 ////////////////////////////////////////////////////////////////////
00446 INLINE int TextEncoder::
00447 unicode_tolower(int character) {
00448   const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
00449   if (entry == (const UnicodeLatinMap::Entry *)NULL) {
00450     return character;
00451   } 
00452   return entry->_tolower_character;
00453 }
00454 
00455 ////////////////////////////////////////////////////////////////////
00456 //     Function: TextEncoder::upper
00457 //       Access: Published, Static
00458 //  Description: Converts the string to uppercase, assuming the string
00459 //               is encoded in the default encoding.
00460 ////////////////////////////////////////////////////////////////////
00461 INLINE string TextEncoder::
00462 upper(const string &source) {
00463   return upper(source, get_default_encoding());
00464 }
00465 
00466 ////////////////////////////////////////////////////////////////////
00467 //     Function: TextEncoder::upper
00468 //       Access: Published, Static
00469 //  Description: Converts the string to uppercase, assuming the string
00470 //               is encoded in the indicated encoding.
00471 ////////////////////////////////////////////////////////////////////
00472 INLINE string TextEncoder::
00473 upper(const string &source, TextEncoder::Encoding encoding) {
00474   TextEncoder encoder;
00475   encoder.set_encoding(encoding);
00476   encoder.set_text(source);
00477   encoder.make_upper();
00478   return encoder.get_text();
00479 }
00480 
00481 ////////////////////////////////////////////////////////////////////
00482 //     Function: TextEncoder::lower
00483 //       Access: Published, Static
00484 //  Description: Converts the string to lowercase, assuming the string
00485 //               is encoded in the default encoding.
00486 ////////////////////////////////////////////////////////////////////
00487 INLINE string TextEncoder::
00488 lower(const string &source) {
00489   return lower(source, get_default_encoding());
00490 }
00491 
00492 ////////////////////////////////////////////////////////////////////
00493 //     Function: TextEncoder::lower
00494 //       Access: Published, Static
00495 //  Description: Converts the string to lowercase, assuming the string
00496 //               is encoded in the indicated encoding.
00497 ////////////////////////////////////////////////////////////////////
00498 INLINE string TextEncoder::
00499 lower(const string &source, TextEncoder::Encoding encoding) {
00500   TextEncoder encoder;
00501   encoder.set_encoding(encoding);
00502   encoder.set_text(source);
00503   encoder.make_lower();
00504   return encoder.get_text();
00505 }
00506 
00507 ////////////////////////////////////////////////////////////////////
00508 //     Function: TextEncoder::set_wtext
00509 //       Access: Published
00510 //  Description: Changes the text that is stored in the encoder.
00511 //               Subsequent calls to get_wtext() will return this same
00512 //               string, while get_text() will return the encoded
00513 //               version of the string.
00514 ////////////////////////////////////////////////////////////////////
00515 INLINE void TextEncoder::
00516 set_wtext(const wstring &wtext) {
00517   if (!has_text() || _wtext != wtext) {
00518     _wtext = wtext;
00519     _flags = (_flags | F_got_wtext) & ~F_got_text;
00520   }
00521 }
00522 
00523 ////////////////////////////////////////////////////////////////////
00524 //     Function: TextEncoder::get_wtext
00525 //       Access: Published
00526 //  Description: Returns the text associated with the TextEncoder, as
00527 //               a wide-character string.
00528 ////////////////////////////////////////////////////////////////////
00529 INLINE const wstring &TextEncoder::
00530 get_wtext() const {
00531   if ((_flags & F_got_wtext) == 0) {
00532     ((TextEncoder *)this)->_wtext = decode_text(_text);
00533     ((TextEncoder *)this)->_flags |= F_got_wtext;
00534   }
00535   return _wtext;
00536 }
00537 
00538 ////////////////////////////////////////////////////////////////////
00539 //     Function: TextEncoder::append_wtext
00540 //       Access: Published
00541 //  Description: Appends the indicates string to the end of the stored
00542 //               wide-character text.
00543 ////////////////////////////////////////////////////////////////////
00544 INLINE void TextEncoder::
00545 append_wtext(const wstring &wtext) {
00546   _wtext = get_wtext() + wtext;
00547   _flags = (_flags | F_got_wtext) & ~F_got_text;
00548 }
00549 
00550 ////////////////////////////////////////////////////////////////////
00551 //     Function: TextEncoder::encode_wtext
00552 //       Access: Published
00553 //  Description: Encodes a wide-text string into a single-char string,
00554 //               according to the current encoding.
00555 ////////////////////////////////////////////////////////////////////
00556 INLINE string TextEncoder::
00557 encode_wtext(const wstring &wtext) const {
00558   return encode_wtext(wtext, _encoding);
00559 }
00560 
00561 ////////////////////////////////////////////////////////////////////
00562 //     Function: TextEncoder::decode_text
00563 //       Access: Published
00564 //  Description: Returns the given wstring decoded to a single-byte
00565 //               string, via the current encoding system.
00566 ////////////////////////////////////////////////////////////////////
00567 INLINE wstring TextEncoder::
00568 decode_text(const string &text) const {
00569   return decode_text(text, _encoding);
00570 }
00571 
00572 ////////////////////////////////////////////////////////////////////
00573 //     Function: wstring ostream operator
00574 //  Description: Uses the current default encoding to output the
00575 //               wstring.
00576 ////////////////////////////////////////////////////////////////////
00577 INLINE ostream &
00578 operator << (ostream &out, const wstring &str) {
00579   TextEncoder encoder;
00580   encoder.set_wtext(str);
00581   out << encoder.get_text();
00582   return out;
00583 }
 All Classes Functions Variables Enumerations