Panda3D
Loading...
Searching...
No Matches
textEncoder.I
Go to the documentation of this file.
1/**
2 * PANDA 3D SOFTWARE
3 * Copyright (c) Carnegie Mellon University. All rights reserved.
4 *
5 * All use of this software is subject to the terms of the revised BSD
6 * license. You should have received a copy of this license along
7 * with this source code in a file named "LICENSE."
8 *
9 * @file textEncoder.I
10 * @author drose
11 * @date 2003-03-26
12 */
13
14/**
15 *
16 */
17INLINE TextEncoder::
18TextEncoder() {
19 _encoding = _default_encoding;
20
21 // Initially, since the text string is empty, we know that both _text and
22 // _wtext accurately reflect the empty state; so we "got" both of them.
23 _flags = (F_got_text | F_got_wtext);
24}
25
26/**
27 *
28 */
29INLINE TextEncoder::
30TextEncoder(const TextEncoder &copy) :
31 _flags(copy._flags),
32 _encoding(copy._encoding),
33 _text(copy._text),
34 _wtext(copy._wtext)
35{
36}
37
38/**
39 * Specifies how the string set via set_text() is to be interpreted. The
40 * default, E_iso8859, means a standard string with one-byte characters (i.e.
41 * ASCII). Other encodings are possible to take advantage of character sets
42 * with more than 256 characters.
43 *
44 * This affects only future calls to set_text(); it does not change text that
45 * was set previously.
46 */
47INLINE void TextEncoder::
48set_encoding(TextEncoder::Encoding encoding) {
49 // Force the previously-set strings to be encoded or decoded now.
50 get_text();
51 get_wtext();
52 _encoding = encoding;
53}
54
55/**
56 * Returns the encoding by which the string set via set_text() is to be
57 * interpreted. See set_encoding().
58 */
59INLINE TextEncoder::Encoding TextEncoder::
60get_encoding() const {
61 return _encoding;
62}
63
64/**
65 * Specifies the default encoding to be used for all subsequently created
66 * TextEncoder objects. See set_encoding().
67 */
68INLINE void TextEncoder::
69set_default_encoding(TextEncoder::Encoding encoding) {
70 _default_encoding = encoding;
71}
72
73/**
74 * Specifies the default encoding to be used for all subsequently created
75 * TextEncoder objects. See set_encoding().
76 */
77INLINE TextEncoder::Encoding TextEncoder::
79 return _default_encoding;
80}
81
82/**
83 * Changes the text that is stored in the encoder. The text should be encoded
84 * according to the method indicated by set_encoding(). Subsequent calls to
85 * get_text() will return this same string, while get_wtext() will return the
86 * decoded version of the string.
87 */
88INLINE void TextEncoder::
89set_text(const std::string &text) {
90 if (!has_text() || _text != text) {
91 _text = text;
92 _flags = (_flags | F_got_text) & ~F_got_wtext;
93 text_changed();
94 }
95}
96
97/**
98 * The two-parameter version of set_text() accepts an explicit encoding; the
99 * text is immediately decoded and stored as a wide-character string.
100 * Subsequent calls to get_text() will return the same text re-encoded using
101 * whichever encoding is specified by set_encoding().
102 */
103INLINE void TextEncoder::
104set_text(const std::string &text, TextEncoder::Encoding encoding) {
105 if (encoding == _encoding) {
106 set_text(text);
107 } else {
108 set_wtext(decode_text(text, encoding));
109 }
110}
111
112/**
113 * Removes the text from the TextEncoder.
114 */
115INLINE void TextEncoder::
116clear_text() {
117 _text = std::string();
118 _wtext = std::wstring();
119 _flags |= (F_got_text | F_got_wtext);
120 text_changed();
121}
122
123/**
124 *
125 */
126INLINE bool TextEncoder::
127has_text() const {
128 if (_flags & F_got_wtext) {
129 return !_wtext.empty();
130 } else {
131 return !_text.empty();
132 }
133}
134
135/**
136 * Returns the current text, as encoded via the current encoding system.
137 */
138INLINE std::string TextEncoder::
139get_text() const {
140 if ((_flags & F_got_text) == 0) {
141 ((TextEncoder *)this)->_text = encode_wtext(_wtext);
142 ((TextEncoder *)this)->_flags |= F_got_text;
143 }
144 return _text;
145}
146
147/**
148 * Returns the current text, as encoded via the indicated encoding system.
149 */
150INLINE std::string TextEncoder::
151get_text(TextEncoder::Encoding encoding) const {
152 return encode_wtext(get_wtext(), encoding);
153}
154
155/**
156 * Appends the indicates string to the end of the stored text.
157 */
158INLINE void TextEncoder::
159append_text(const std::string &text) {
160 if (!text.empty()) {
161 _text = get_text() + text;
162 _flags = (_flags | F_got_text) & ~F_got_wtext;
163 text_changed();
164 }
165}
166
167/**
168 * Appends a single character to the end of the stored text. This may be a
169 * wide character, up to 16 bits in Unicode.
170 */
171INLINE void TextEncoder::
172append_unicode_char(char32_t character) {
173#if WCHAR_MAX >= 0x10FFFF
174 // wchar_t might be UTF-32.
175 _wtext = get_wtext() + std::wstring(1, (wchar_t)character);
176#else
177 if ((character & ~0xffff) == 0) {
178 _wtext = get_wtext() + std::wstring(1, (wchar_t)character);
179 } else {
180 // Encode as a surrogate pair.
181 uint32_t v = (uint32_t)character - 0x10000u;
182 wchar_t wstr[2] = {
183 (wchar_t)((v >> 10u) | 0xd800u),
184 (wchar_t)((v & 0x3ffu) | 0xdc00u),
185 };
186 _wtext = get_wtext() + std::wstring(wstr, 2);
187 }
188#endif
189 _flags = (_flags | F_got_wtext) & ~F_got_text;
190 text_changed();
191}
192
193/**
194 * Returns the number of characters in the stored text. This is a count of
195 * wide characters, after the string has been decoded according to
196 * set_encoding().
197 */
198INLINE size_t TextEncoder::
199get_num_chars() const {
200 return get_wtext().length();
201}
202
203/**
204 * Returns the Unicode value of the nth character in the stored text. This
205 * may be a wide character (greater than 255), after the string has been
206 * decoded according to set_encoding().
207 */
209get_unicode_char(size_t index) const {
210 get_wtext();
211 if (index < _wtext.length()) {
212 return _wtext[index];
213 }
214 return 0;
215}
216
217/**
218 * Sets the Unicode value of the nth character in the stored text. This may
219 * be a wide character (greater than 255), after the string has been decoded
220 * according to set_encoding().
221 */
222INLINE void TextEncoder::
223set_unicode_char(size_t index, char32_t character) {
224 get_wtext();
225 if (index < _wtext.length()) {
226 _wtext[index] = character;
227 _flags &= ~F_got_text;
228 text_changed();
229 }
230}
231
232/**
233 * Returns the nth char of the stored text, as a one-, two-, or three-byte
234 * encoded string.
235 */
236INLINE std::string TextEncoder::
237get_encoded_char(size_t index) const {
238 return get_encoded_char(index, get_encoding());
239}
240
241/**
242 * Returns the nth char of the stored text, as a one-, two-, or three-byte
243 * encoded string.
244 */
245INLINE std::string TextEncoder::
246get_encoded_char(size_t index, TextEncoder::Encoding encoding) const {
247 std::wstring wch(1, (wchar_t)get_unicode_char(index));
248 return encode_wtext(wch, encoding);
249}
250
251/**
252 * Returns the text associated with the node, converted as nearly as possible
253 * to a fully-ASCII representation. This means replacing accented letters
254 * with their unaccented ASCII equivalents.
255 *
256 * It is possible that some characters in the string cannot be converted to
257 * ASCII. (The string may involve symbols like the copyright symbol, for
258 * instance, or it might involve letters in some other alphabet such as Greek
259 * or Cyrillic, or even Latin letters like thorn or eth that are not part of
260 * the ASCII character set.) In this case, as much of the string as possible
261 * will be converted to ASCII, and the nonconvertible characters will remain
262 * encoded in the encoding specified by set_encoding().
263 */
264INLINE std::string TextEncoder::
265get_text_as_ascii() const {
267}
268
269/**
270 * Given the indicated text string, which is assumed to be encoded via the
271 * encoding "from", decodes it and then reencodes it into the encoding "to",
272 * and returns the newly encoded string. This does not change or affect any
273 * properties on the TextEncoder itself.
274 */
275INLINE std::string TextEncoder::
276reencode_text(const std::string &text, TextEncoder::Encoding from,
277 TextEncoder::Encoding to) {
278 return encode_wtext(decode_text(text, from), to);
279}
280
281/**
282 * Returns true if the indicated character is an alphabetic letter, false
283 * otherwise. This is akin to ctype's isalpha(), extended to Unicode.
284 */
285INLINE bool TextEncoder::
286unicode_isalpha(char32_t character) {
287 const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
288 if (entry == nullptr) {
289 return false;
290 }
291 return entry->_char_type == UnicodeLatinMap::CT_upper ||
292 entry->_char_type == UnicodeLatinMap::CT_lower;
293}
294
295/**
296 * Returns true if the indicated character is a numeric digit, false
297 * otherwise. This is akin to ctype's isdigit(), extended to Unicode.
298 */
299INLINE bool TextEncoder::
300unicode_isdigit(char32_t character) {
301 const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
302 if (entry == nullptr) {
303 // The digits aren't actually listed in the map.
304 return (character >= '0' && character <= '9');
305 }
306 // This silly test (!= 0) is necessary to prevent a VC++ warning.
307 return (isdigit(entry->_ascii_equiv) != 0);
308}
309
310/**
311 * Returns true if the indicated character is a punctuation mark, false
312 * otherwise. This is akin to ctype's ispunct(), extended to Unicode.
313 */
314INLINE bool TextEncoder::
315unicode_ispunct(char32_t character) {
316 const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
317 if (entry == nullptr) {
318 // Some punctuation marks aren't listed in the map.
319 return (character < 128 && ispunct(character));
320 }
321 return entry->_char_type == UnicodeLatinMap::CT_punct;
322}
323
324/**
325 * Returns true if the indicated character is an uppercase letter, false
326 * otherwise. This is akin to ctype's isupper(), extended to Unicode.
327 */
328INLINE bool TextEncoder::
329unicode_isupper(char32_t character) {
330 const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
331 if (entry == nullptr) {
332 return false;
333 }
334 return entry->_char_type == UnicodeLatinMap::CT_upper;
335}
336
337/**
338 * Returns true if the indicated character is a whitespace letter, false
339 * otherwise. This is akin to ctype's isspace(), extended to Unicode.
340 */
341INLINE bool TextEncoder::
342unicode_isspace(char32_t character) {
343 switch (character) {
344 case ' ':
345 case '\t':
346 case '\n':
347 return true;
348
349 default:
350 return false;
351 }
352}
353
354/**
355 * Returns true if the indicated character is a lowercase letter, false
356 * otherwise. This is akin to ctype's islower(), extended to Unicode.
357 */
358INLINE bool TextEncoder::
359unicode_islower(char32_t character) {
360 const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
361 if (entry == nullptr) {
362 return false;
363 }
364 return entry->_char_type == UnicodeLatinMap::CT_lower;
365}
366
367/**
368 * Returns the uppercase equivalent of the given Unicode character. This is
369 * akin to ctype's toupper(), extended to Unicode.
370 */
372unicode_toupper(char32_t character) {
373 const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
374 if (entry == nullptr) {
375 return character;
376 }
377 return entry->_toupper_character;
378}
379
380/**
381 * Returns the uppercase equivalent of the given Unicode character. This is
382 * akin to ctype's tolower(), extended to Unicode.
383 */
385unicode_tolower(char32_t character) {
386 const UnicodeLatinMap::Entry *entry = UnicodeLatinMap::look_up(character);
387 if (entry == nullptr) {
388 return character;
389 }
390 return entry->_tolower_character;
391}
392
393/**
394 * Converts the string to uppercase, assuming the string is encoded in the
395 * default encoding.
396 */
397INLINE std::string TextEncoder::
398upper(const std::string &source) {
399 return upper(source, get_default_encoding());
400}
401
402/**
403 * Converts the string to uppercase, assuming the string is encoded in the
404 * indicated encoding.
405 */
406INLINE std::string TextEncoder::
407upper(const std::string &source, TextEncoder::Encoding encoding) {
408 TextEncoder encoder;
409 encoder.set_encoding(encoding);
410 encoder.set_text(source);
411 encoder.make_upper();
412 return encoder.get_text();
413}
414
415/**
416 * Converts the string to lowercase, assuming the string is encoded in the
417 * default encoding.
418 */
419INLINE std::string TextEncoder::
420lower(const std::string &source) {
421 return lower(source, get_default_encoding());
422}
423
424/**
425 * Converts the string to lowercase, assuming the string is encoded in the
426 * indicated encoding.
427 */
428INLINE std::string TextEncoder::
429lower(const std::string &source, TextEncoder::Encoding encoding) {
430 TextEncoder encoder;
431 encoder.set_encoding(encoding);
432 encoder.set_text(source);
433 encoder.make_lower();
434 return encoder.get_text();
435}
436
437/**
438 * Changes the text that is stored in the encoder. Subsequent calls to
439 * get_wtext() will return this same string, while get_text() will return the
440 * encoded version of the string.
441 */
442INLINE void TextEncoder::
443set_wtext(const std::wstring &wtext) {
444 if (!has_text() || _wtext != wtext) {
445 _wtext = wtext;
446 _flags = (_flags | F_got_wtext) & ~F_got_text;
447 text_changed();
448 }
449}
450
451/**
452 * Returns the text associated with the TextEncoder, as a wide-character
453 * string.
454 */
455INLINE const std::wstring &TextEncoder::
456get_wtext() const {
457 if ((_flags & F_got_wtext) == 0) {
458 ((TextEncoder *)this)->_wtext = decode_text(_text);
459 ((TextEncoder *)this)->_flags |= F_got_wtext;
460 }
461 return _wtext;
462}
463
464/**
465 * Appends the indicates string to the end of the stored wide-character text.
466 */
467INLINE void TextEncoder::
468append_wtext(const std::wstring &wtext) {
469 if (!wtext.empty()) {
470 _wtext = get_wtext() + wtext;
471 _flags = (_flags | F_got_wtext) & ~F_got_text;
472 text_changed();
473 }
474}
475
476/**
477 * Encodes a wide-text string into a single-char string, according to the
478 * current encoding.
479 */
480INLINE std::string TextEncoder::
481encode_wtext(const std::wstring &wtext) const {
482 return encode_wtext(wtext, _encoding);
483}
484
485/**
486 * Returns the given wstring decoded to a single-byte string, via the current
487 * encoding system.
488 */
489INLINE std::wstring TextEncoder::
490decode_text(const std::string &text) const {
491 return decode_text(text, _encoding);
492}
493
494/**
495 * Uses the current default encoding to output the wstring.
496 */
497INLINE std::ostream &
498operator << (std::ostream &out, const std::wstring &str) {
499 TextEncoder encoder;
500 encoder.set_wtext(str);
501 out << encoder.get_text();
502 return out;
503}
This class can be used to convert text between multiple representations, e.g.
Definition textEncoder.h:33
std::wstring decode_text(const std::string &text) const
Returns the given single-byte string decoded to a wide-character string, via the current encoding system.
void append_text(const std::string &text)
Appends the indicated string to the end of the stored text.
set_text
Changes the text that is stored in the encoder.
static std::string upper(const std::string &source)
Converts the string to uppercase, assuming the string is encoded in the default encoding.
void append_wtext(const std::wstring &text)
Appends the indicated string to the end of the stored wide-character text.
void set_unicode_char(size_t index, char32_t character)
Sets the Unicode value of the nth character in the stored text.
static std::string lower(const std::string &source)
Converts the string to lowercase, assuming the string is encoded in the default encoding.
std::string get_text_as_ascii() const
Returns the text associated with the node, converted as nearly as possible to a fully-ASCII represent...
static std::string reencode_text(const std::string &text, Encoding from, Encoding to)
Given the indicated text string, which is assumed to be encoded via the encoding "from",...
static bool unicode_ispunct(char32_t character)
Returns true if the indicated character is a punctuation mark, false otherwise.
static int unicode_tolower(char32_t character)
Returns the lowercase equivalent of the given Unicode character.
static bool unicode_isupper(char32_t character)
Returns true if the indicated character is an uppercase letter, false otherwise.
get_default_encoding
Returns the default encoding to be used for all subsequently created TextEncoder objects.
Definition textEncoder.h:54
static int unicode_toupper(char32_t character)
Returns the uppercase equivalent of the given Unicode character.
std::wstring get_wtext_as_ascii() const
Returns the text associated with the node, converted as nearly as possible to a fully-ASCII represent...
Encoding get_encoding() const
Returns the encoding by which the string set via set_text() is to be interpreted.
Definition textEncoder.I:60
static bool unicode_isdigit(char32_t character)
Returns true if the indicated character is a numeric digit, false otherwise.
void set_encoding(Encoding encoding)
Specifies how the string set via set_text() is to be interpreted.
Definition textEncoder.I:48
get_text
Returns the current text, as encoded via the current encoding system.
const std::wstring & get_wtext() const
Returns the text associated with the TextEncoder, as a wide-character string.
size_t get_num_chars() const
Returns the number of characters in the stored text.
void make_lower()
Adjusts the text stored within the encoder to all lowercase letters (preserving accent marks correctl...
void clear_text()
Removes the text from the TextEncoder.
void make_upper()
Adjusts the text stored within the encoder to all uppercase letters (preserving accent marks correctl...
std::string get_encoded_char(size_t index) const
Returns the nth char of the stored text, as a one-, two-, or three-byte encoded string.
static bool unicode_isspace(char32_t character)
Returns true if the indicated character is a whitespace character, false otherwise.
int get_unicode_char(size_t index) const
Returns the Unicode value of the nth character in the stored text.
static bool unicode_islower(char32_t character)
Returns true if the indicated character is a lowercase letter, false otherwise.
set_default_encoding
Specifies the default encoding to be used for all subsequently created TextEncoder objects.
Definition textEncoder.h:54
std::string encode_wtext(const std::wstring &wtext) const
Encodes a wide-text string into a single-char string, according to the current encoding.
static bool unicode_isalpha(char32_t character)
Returns true if the indicated character is an alphabetic letter, false otherwise.
void append_unicode_char(char32_t character)
Appends a single character to the end of the stored text.
void set_wtext(const std::wstring &wtext)
Changes the text that is stored in the encoder.
static const Entry * look_up(char32_t character)
Returns the Entry associated with the indicated character, if there is one.
std::ostream & operator<<(std::ostream &out, const std::wstring &str)
Uses the current default encoding to output the wstring.