Panda3D
textEncoder.I
Go to the documentation of this file.
1 /**
2  * PANDA 3D SOFTWARE
3  * Copyright (c) Carnegie Mellon University. All rights reserved.
4  *
5  * All use of this software is subject to the terms of the revised BSD
6  * license. You should have received a copy of this license along
7  * with this source code in a file named "LICENSE."
8  *
9  * @file textEncoder.I
10  * @author drose
11  * @date 2003-03-26
12  */
13 
14 /**
15  *
16  */
17 INLINE TextEncoder::
18 TextEncoder() {
19  _encoding = _default_encoding;
20 
21  // Initially, since the text string is empty, we know that both _text and
22  // _wtext accurately reflect the empty state; so we "got" both of them.
23  _flags = (F_got_text | F_got_wtext);
24 }
25 
26 /**
27  *
28  */
29 INLINE TextEncoder::
30 TextEncoder(const TextEncoder &copy) :
31  _flags(copy._flags),
32  _encoding(copy._encoding),
33  _text(copy._text),
34  _wtext(copy._wtext)
35 {
36 }
37 
38 /**
39  * Specifies how the string set via set_text() is to be interpreted. The
40  * default, E_iso8859, means a standard string with one-byte characters (i.e.
41  * ASCII). Other encodings are possible to take advantage of character sets
42  * with more than 256 characters.
43  *
44  * This affects only future calls to set_text(); it does not change text that
45  * was set previously.
46  */
47 INLINE void TextEncoder::
48 set_encoding(TextEncoder::Encoding encoding) {
49  // Force the previously-set strings to be encoded or decoded now.
50  get_text();
51  get_wtext();
52  _encoding = encoding;
53 }
54 
55 /**
56  * Returns the encoding by which the string set via set_text() is to be
57  * interpreted. See set_encoding().
58  */
59 INLINE TextEncoder::Encoding TextEncoder::
60 get_encoding() const {
61  return _encoding;
62 }
63 
64 /**
65  * Specifies the default encoding to be used for all subsequently created
66  * TextEncoder objects. See set_encoding().
67  */
68 INLINE void TextEncoder::
69 set_default_encoding(TextEncoder::Encoding encoding) {
70  _default_encoding = encoding;
71 }
72 
73 /**
74  * Specifies the default encoding to be used for all subsequently created
75  * TextEncoder objects. See set_encoding().
76  */
77 INLINE TextEncoder::Encoding TextEncoder::
78 get_default_encoding() {
79  return _default_encoding;
80 }
81 
82 /**
83  * Changes the text that is stored in the encoder. The text should be encoded
84  * according to the method indicated by set_encoding(). Subsequent calls to
85  * get_text() will return this same string, while get_wtext() will return the
86  * decoded version of the string.
87  */
88 INLINE void TextEncoder::
89 set_text(const std::string &text) {
90  if (!has_text() || _text != text) {
91  _text = text;
92  _flags = (_flags | F_got_text) & ~F_got_wtext;
93  text_changed();
94  }
95 }
96 
97 /**
98  * The two-parameter version of set_text() accepts an explicit encoding; the
99  * text is immediately decoded and stored as a wide-character string.
100  * Subsequent calls to get_text() will return the same text re-encoded using
101  * whichever encoding is specified by set_encoding().
102  */
103 INLINE void TextEncoder::
104 set_text(const std::string &text, TextEncoder::Encoding encoding) {
105  if (encoding == _encoding) {
106  set_text(text);
107  } else {
108  set_wtext(decode_text(text, encoding));
109  }
110 }
111 
112 /**
113  * Removes the text from the TextEncoder.
114  */
115 INLINE void TextEncoder::
117  _text = std::string();
118  _wtext = std::wstring();
119  _flags |= (F_got_text | F_got_wtext);
120  text_changed();
121 }
122 
123 /**
124  *
125  */
126 INLINE bool TextEncoder::
127 has_text() const {
128  if (_flags & F_got_wtext) {
129  return !_wtext.empty();
130  } else {
131  return !_text.empty();
132  }
133 }
134 
135 /**
136  * Returns the current text, as encoded via the current encoding system.
137  */
138 INLINE std::string TextEncoder::
139 get_text() const {
140  if ((_flags & F_got_text) == 0) {
141  ((TextEncoder *)this)->_text = encode_wtext(_wtext);
142  ((TextEncoder *)this)->_flags |= F_got_text;
143  }
144  return _text;
145 }
146 
147 /**
148  * Returns the current text, as encoded via the indicated encoding system.
149  */
150 INLINE std::string TextEncoder::
151 get_text(TextEncoder::Encoding encoding) const {
152  return encode_wtext(get_wtext(), encoding);
153 }
154 
155 /**
156  * Appends the indicates string to the end of the stored text.
157  */
158 INLINE void TextEncoder::
159 append_text(const std::string &text) {
160  if (!text.empty()) {
161  _text = get_text() + text;
162  _flags = (_flags | F_got_text) & ~F_got_wtext;
163  text_changed();
164  }
165 }
166 
167 /**
168  * Appends a single character to the end of the stored text. This may be a
169  * wide character, up to 16 bits in Unicode.
170  */
171 INLINE void TextEncoder::
172 append_unicode_char(char32_t character) {
173 #if WCHAR_MAX >= 0x10FFFF
174  // wchar_t might be UTF-32.
175  _wtext = get_wtext() + std::wstring(1, (wchar_t)character);
176 #else
177  if ((character & ~0xffff) == 0) {
178  _wtext = get_wtext() + std::wstring(1, (wchar_t)character);
179  } else {
180  // Encode as a surrogate pair.
181  uint32_t v = (uint32_t)character - 0x10000u;
182  wchar_t wstr[2] = {
183  (wchar_t)((v >> 10u) | 0xd800u),
184  (wchar_t)((v & 0x3ffu) | 0xdc00u),
185  };
186  _wtext = get_wtext() + std::wstring(wstr, 2);
187  }
188 #endif
189  _flags = (_flags | F_got_wtext) & ~F_got_text;
190  text_changed();
191 }
192 
193 /**
194  * Returns the number of characters in the stored text. This is a count of
195  * wide characters, after the string has been decoded according to
196  * set_encoding().
197  */
198 INLINE size_t TextEncoder::
199 get_num_chars() const {
200  return get_wtext().length();
201 }
202 
203 /**
204  * Returns the Unicode value of the nth character in the stored text. This
205  * may be a wide character (greater than 255), after the string has been
206  * decoded according to set_encoding().
207  */
208 INLINE int TextEncoder::
209 get_unicode_char(size_t index) const {
210  get_wtext();
211  if (index < _wtext.length()) {
212  return _wtext[index];
213  }
214  return 0;
215 }
216 
217 /**
218  * Sets the Unicode value of the nth character in the stored text. This may
219  * be a wide character (greater than 255), after the string has been decoded
220  * according to set_encoding().
221  */
222 INLINE void TextEncoder::
223 set_unicode_char(size_t index, char32_t character) {
224  get_wtext();
225  if (index < _wtext.length()) {
226  _wtext[index] = character;
227  _flags &= ~F_got_text;
228  text_changed();
229  }
230 }
231 
232 /**
233  * Returns the nth char of the stored text, as a one-, two-, or three-byte
234  * encoded string.
235  */
236 INLINE std::string TextEncoder::
237 get_encoded_char(size_t index) const {
238  return get_encoded_char(index, get_encoding());
239 }
240 
241 /**
242  * Returns the nth char of the stored text, as a one-, two-, or three-byte
243  * encoded string.
244  */
245 INLINE std::string TextEncoder::
246 get_encoded_char(size_t index, TextEncoder::Encoding encoding) const {
247  std::wstring wch(1, (wchar_t)get_unicode_char(index));
248  return encode_wtext(wch, encoding);
249 }
250 
251 /**
252  * Returns the text associated with the node, converted as nearly as possible
253  * to a fully-ASCII representation. This means replacing accented letters
254  * with their unaccented ASCII equivalents.
255  *
256  * It is possible that some characters in the string cannot be converted to
257  * ASCII. (The string may involve symbols like the copyright symbol, for
258  * instance, or it might involve letters in some other alphabet such as Greek
259  * or Cyrillic, or even Latin letters like thorn or eth that are not part of
260  * the ASCII character set.) In this case, as much of the string as possible
261  * will be converted to ASCII, and the nonconvertible characters will remain
262  * encoded in the encoding specified by set_encoding().
263  */
264 INLINE std::string TextEncoder::
267 }
268 
269 /**
270  * Given the indicated text string, which is assumed to be encoded via the
271  * encoding "from", decodes it and then reencodes it into the encoding "to",
272  * and returns the newly encoded string. This does not change or affect any
273  * properties on the TextEncoder itself.
274  */
275 INLINE std::string TextEncoder::
276 reencode_text(const std::string &text, TextEncoder::Encoding from,
277  TextEncoder::Encoding to) {
278  return encode_wtext(decode_text(text, from), to);
279 }
280 
281 /**
282  * Returns true if the indicated character is an alphabetic letter, false
283  * otherwise. This is akin to ctype's isalpha(), extended to Unicode.
284  */
285 INLINE bool TextEncoder::
286 unicode_isalpha(char32_t character) {
287  const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
288  if (entry == nullptr) {
289  return false;
290  }
291  return entry->_char_type == UnicodeLatinMap::CT_upper ||
292  entry->_char_type == UnicodeLatinMap::CT_lower;
293 }
294 
295 /**
296  * Returns true if the indicated character is a numeric digit, false
297  * otherwise. This is akin to ctype's isdigit(), extended to Unicode.
298  */
299 INLINE bool TextEncoder::
300 unicode_isdigit(char32_t character) {
301  const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
302  if (entry == nullptr) {
303  // The digits aren't actually listed in the map.
304  return (character >= '0' && character <= '9');
305  }
306  // This silly test (!= 0) is necessary to prevent a VC++ warning.
307  return (isdigit(entry->_ascii_equiv) != 0);
308 }
309 
310 /**
311  * Returns true if the indicated character is a punctuation mark, false
312  * otherwise. This is akin to ctype's ispunct(), extended to Unicode.
313  */
314 INLINE bool TextEncoder::
315 unicode_ispunct(char32_t character) {
316  const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
317  if (entry == nullptr) {
318  // Some punctuation marks aren't listed in the map.
319  return (character < 128 && ispunct(character));
320  }
321  return entry->_char_type == UnicodeLatinMap::CT_punct;
322 }
323 
324 /**
325  * Returns true if the indicated character is an uppercase letter, false
326  * otherwise. This is akin to ctype's isupper(), extended to Unicode.
327  */
328 INLINE bool TextEncoder::
329 unicode_isupper(char32_t character) {
330  const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
331  if (entry == nullptr) {
332  return false;
333  }
334  return entry->_char_type == UnicodeLatinMap::CT_upper;
335 }
336 
337 /**
338  * Returns true if the indicated character is a whitespace letter, false
339  * otherwise. This is akin to ctype's isspace(), extended to Unicode.
340  */
341 INLINE bool TextEncoder::
342 unicode_isspace(char32_t character) {
343  switch (character) {
344  case ' ':
345  case '\t':
346  case '\n':
347  return true;
348 
349  default:
350  return false;
351  }
352 }
353 
354 /**
355  * Returns true if the indicated character is a lowercase letter, false
356  * otherwise. This is akin to ctype's islower(), extended to Unicode.
357  */
358 INLINE bool TextEncoder::
359 unicode_islower(char32_t character) {
360  const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
361  if (entry == nullptr) {
362  return false;
363  }
364  return entry->_char_type == UnicodeLatinMap::CT_lower;
365 }
366 
367 /**
368  * Returns the uppercase equivalent of the given Unicode character. This is
369  * akin to ctype's toupper(), extended to Unicode.
370  */
371 INLINE int TextEncoder::
372 unicode_toupper(char32_t character) {
373  const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
374  if (entry == nullptr) {
375  return character;
376  }
377  return entry->_toupper_character;
378 }
379 
380 /**
381  * Returns the uppercase equivalent of the given Unicode character. This is
382  * akin to ctype's tolower(), extended to Unicode.
383  */
384 INLINE int TextEncoder::
385 unicode_tolower(char32_t character) {
386  const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
387  if (entry == nullptr) {
388  return character;
389  }
390  return entry->_tolower_character;
391 }
392 
393 /**
394  * Converts the string to uppercase, assuming the string is encoded in the
395  * default encoding.
396  */
397 INLINE std::string TextEncoder::
398 upper(const std::string &source) {
399  return upper(source, get_default_encoding());
400 }
401 
402 /**
403  * Converts the string to uppercase, assuming the string is encoded in the
404  * indicated encoding.
405  */
406 INLINE std::string TextEncoder::
407 upper(const std::string &source, TextEncoder::Encoding encoding) {
408  TextEncoder encoder;
409  encoder.set_encoding(encoding);
410  encoder.set_text(source);
411  encoder.make_upper();
412  return encoder.get_text();
413 }
414 
415 /**
416  * Converts the string to lowercase, assuming the string is encoded in the
417  * default encoding.
418  */
419 INLINE std::string TextEncoder::
420 lower(const std::string &source) {
421  return lower(source, get_default_encoding());
422 }
423 
424 /**
425  * Converts the string to lowercase, assuming the string is encoded in the
426  * indicated encoding.
427  */
428 INLINE std::string TextEncoder::
429 lower(const std::string &source, TextEncoder::Encoding encoding) {
430  TextEncoder encoder;
431  encoder.set_encoding(encoding);
432  encoder.set_text(source);
433  encoder.make_lower();
434  return encoder.get_text();
435 }
436 
437 /**
438  * Changes the text that is stored in the encoder. Subsequent calls to
439  * get_wtext() will return this same string, while get_text() will return the
440  * encoded version of the string.
441  */
442 INLINE void TextEncoder::
443 set_wtext(const std::wstring &wtext) {
444  if (!has_text() || _wtext != wtext) {
445  _wtext = wtext;
446  _flags = (_flags | F_got_wtext) & ~F_got_text;
447  text_changed();
448  }
449 }
450 
451 /**
452  * Returns the text associated with the TextEncoder, as a wide-character
453  * string.
454  */
455 INLINE const std::wstring &TextEncoder::
456 get_wtext() const {
457  if ((_flags & F_got_wtext) == 0) {
458  ((TextEncoder *)this)->_wtext = decode_text(_text);
459  ((TextEncoder *)this)->_flags |= F_got_wtext;
460  }
461  return _wtext;
462 }
463 
464 /**
465  * Appends the indicates string to the end of the stored wide-character text.
466  */
467 INLINE void TextEncoder::
468 append_wtext(const std::wstring &wtext) {
469  if (!wtext.empty()) {
470  _wtext = get_wtext() + wtext;
471  _flags = (_flags | F_got_wtext) & ~F_got_text;
472  text_changed();
473  }
474 }
475 
476 /**
477  * Encodes a wide-text string into a single-char string, according to the
478  * current encoding.
479  */
480 INLINE std::string TextEncoder::
481 encode_wtext(const std::wstring &wtext) const {
482  return encode_wtext(wtext, _encoding);
483 }
484 
485 /**
486  * Returns the given wstring decoded to a single-byte string, via the current
487  * encoding system.
488  */
489 INLINE std::wstring TextEncoder::
490 decode_text(const std::string &text) const {
491  return decode_text(text, _encoding);
492 }
493 
494 /**
495  * Uses the current default encoding to output the wstring.
496  */
497 INLINE std::ostream &
498 operator << (std::ostream &out, const std::wstring &str) {
499  TextEncoder encoder;
500  encoder.set_wtext(str);
501  out << encoder.get_text();
502  return out;
503 }
void append_text(const std::string &text)
Appends the indicated string to the end of the stored text.
Definition: textEncoder.I:159
std::ostream & operator<<(std::ostream &out, const std::wstring &str)
Uses the current default encoding to output the wstring.
Definition: textEncoder.I:498
static std::string reencode_text(const std::string &text, Encoding from, Encoding to)
Given the indicated text string, which is assumed to be encoded via the encoding "from", decodes it and then reencodes it into the encoding "to", and returns the newly encoded string.
Definition: textEncoder.I:276
static int unicode_toupper(char32_t character)
Returns the uppercase equivalent of the given Unicode character.
Definition: textEncoder.I:372
int get_unicode_char(size_t index) const
Returns the Unicode value of the nth character in the stored text.
Definition: textEncoder.I:209
This class can be used to convert text between multiple representations, e.g.
Definition: textEncoder.h:33
static bool unicode_ispunct(char32_t character)
Returns true if the indicated character is a punctuation mark, false otherwise.
Definition: textEncoder.I:315
static std::string upper(const std::string &source)
Converts the string to uppercase, assuming the string is encoded in the default encoding.
Definition: textEncoder.I:398
set_default_encoding
Specifies the default encoding to be used for all subsequently created TextEncoder objects...
Definition: textEncoder.h:54
void make_lower()
Adjusts the text stored within the encoder to all lowercase letters (preserving accent marks correctl...
Definition: textEncoder.cxx:46
void clear_text()
Removes the text from the TextEncoder.
Definition: textEncoder.I:116
void append_wtext(const std::wstring &text)
Appends the indicated string to the end of the stored wide-character text.
Definition: textEncoder.I:468
set_text
Changes the text that is stored in the encoder.
Definition: textEncoder.h:124
std::string get_text_as_ascii() const
Returns the text associated with the node, converted as nearly as possible to a fully-ASCII represent...
Definition: textEncoder.I:265
static const Entry * look_up(char32_t character)
Returns the Entry associated with the indicated character, if there is one.
void set_unicode_char(size_t index, char32_t character)
Sets the Unicode value of the nth character in the stored text.
Definition: textEncoder.I:223
get_default_encoding
Returns the default encoding to be used for all subsequently created TextEncoder objects...
Definition: textEncoder.h:54
static int unicode_tolower(char32_t character)
Returns the lowercase equivalent of the given Unicode character.
Definition: textEncoder.I:385
static bool unicode_isalpha(char32_t character)
Returns true if the indicated character is an alphabetic letter, false otherwise. ...
Definition: textEncoder.I:286
std::string get_encoded_char(size_t index) const
Returns the nth char of the stored text, as a one-, two-, or three-byte encoded string.
Definition: textEncoder.I:237
std::wstring get_wtext_as_ascii() const
Returns the text associated with the node, converted as nearly as possible to a fully-ASCII represent...
Definition: textEncoder.cxx:70
get_text
Returns the current text, as encoded via the current encoding system.
Definition: textEncoder.h:124
std::string encode_wtext(const std::wstring &wtext) const
Encodes a wide-text string into a single-char string, according to the current encoding.
Definition: textEncoder.I:481
static bool unicode_isdigit(char32_t character)
Returns true if the indicated character is a numeric digit, false otherwise.
Definition: textEncoder.I:300
void set_encoding(Encoding encoding)
Specifies how the string set via set_text() is to be interpreted.
Definition: textEncoder.I:48
static std::string lower(const std::string &source)
Converts the string to lowercase, assuming the string is encoded in the default encoding.
Definition: textEncoder.I:420
const std::wstring & get_wtext() const
Returns the text associated with the TextEncoder, as a wide-character string.
Definition: textEncoder.I:456
Encoding get_encoding() const
Returns the encoding by which the string set via set_text() is to be interpreted. ...
Definition: textEncoder.I:60
void append_unicode_char(char32_t character)
Appends a single character to the end of the stored text.
Definition: textEncoder.I:172
std::wstring decode_text(const std::string &text) const
Returns the given single-byte string decoded to a wstring, via the current encoding system...
Definition: textEncoder.I:490
size_t get_num_chars() const
Returns the number of characters in the stored text.
Definition: textEncoder.I:199
void make_upper()
Adjusts the text stored within the encoder to all uppercase letters (preserving accent marks correctl...
Definition: textEncoder.cxx:31
static bool unicode_islower(char32_t character)
Returns true if the indicated character is a lowercase letter, false otherwise.
Definition: textEncoder.I:359
void set_wtext(const std::wstring &wtext)
Changes the text that is stored in the encoder.
Definition: textEncoder.I:443
static bool unicode_isupper(char32_t character)
Returns true if the indicated character is an uppercase letter, false otherwise.
Definition: textEncoder.I:329
static bool unicode_isspace(char32_t character)
Returns true if the indicated character is a whitespace letter, false otherwise.
Definition: textEncoder.I:342