Panda3D
Loading...
Searching...
No Matches
textEncoder.cxx
Go to the documentation of this file.
1/**
2 * PANDA 3D SOFTWARE
3 * Copyright (c) Carnegie Mellon University. All rights reserved.
4 *
5 * All use of this software is subject to the terms of the revised BSD
6 * license. You should have received a copy of this license along
7 * with this source code in a file named "LICENSE."
8 *
9 * @file textEncoder.cxx
10 * @author drose
11 * @date 2003-03-26
12 */
13
14#include "textEncoder.h"
15#include "stringDecoder.h"
16#include "unicodeLatinMap.h"
17#include "config_dtoolutil.h"
18
19using std::istream;
20using std::ostream;
21using std::string;
22using std::wstring;
23
24TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_utf8;
25
26/**
27 * Adjusts the text stored within the encoder to all uppercase letters
28 * (preserving accent marks correctly).
29 */
31make_upper() {
32 get_wtext();
33 wstring::iterator si;
34 for (si = _wtext.begin(); si != _wtext.end(); ++si) {
35 (*si) = unicode_toupper(*si);
36 }
37 _flags &= ~F_got_text;
38 text_changed();
39}
40
41/**
42 * Adjusts the text stored within the encoder to all lowercase letters
43 * (preserving accent marks correctly).
44 */
46make_lower() {
47 get_wtext();
48 wstring::iterator si;
49 for (si = _wtext.begin(); si != _wtext.end(); ++si) {
50 (*si) = unicode_tolower(*si);
51 }
52 _flags &= ~F_got_text;
53 text_changed();
54}
55
56/**
57 * Returns the text associated with the node, converted as nearly as possible
58 * to a fully-ASCII representation. This means replacing accented letters
59 * with their unaccented ASCII equivalents.
60 *
61 * It is possible that some characters in the string cannot be converted to
62 * ASCII. (The string may involve symbols like the copyright symbol, for
63 * instance, or it might involve letters in some other alphabet such as Greek
64 * or Cyrillic, or even Latin letters like thorn or eth that are not part of
65 * the ASCII character set.) In this case, as much of the string as possible
66 * will be converted to ASCII, and the nonconvertible characters will remain
67 * in their original form.
68 */
70get_wtext_as_ascii() const {
71 get_wtext();
72 wstring result;
73 wstring::const_iterator si;
74 for (si = _wtext.begin(); si != _wtext.end(); ++si) {
75 wchar_t character = (*si);
76
77 const UnicodeLatinMap::Entry *map_entry =
78 UnicodeLatinMap::look_up(character);
79 if (map_entry != nullptr && map_entry->_ascii_equiv != 0) {
80 result += (wchar_t)map_entry->_ascii_equiv;
81 if (map_entry->_ascii_additional != 0) {
82 result += (wchar_t)map_entry->_ascii_additional;
83 }
84
85 } else {
86 result += character;
87 }
88 }
89
90 return result;
91}
92
93/**
94 * Returns true if any of the characters in the string returned by get_wtext()
95 * are out of the range of an ASCII character (and, therefore, get_wtext()
96 * should be called in preference to get_text()).
97 */
99is_wtext() const {
100 get_wtext();
101 wstring::const_iterator ti;
102 for (ti = _wtext.begin(); ti != _wtext.end(); ++ti) {
103 if (((*ti) & ~0x7f) != 0) {
104 return true;
105 }
106 }
107
108 return false;
109}
110
111/**
112 * Encodes a single Unicode character into a one-, two-, three-, or four-byte
113 * string, according to the given encoding system.
114 */
116encode_wchar(char32_t ch, TextEncoder::Encoding encoding) {
117 switch (encoding) {
118 case E_iso8859:
119 if ((ch & ~0xff) == 0) {
120 return string(1, (char)ch);
121 } else {
122 // The character won't fit in the 8-bit ISO 8859. See if we can make it
123 // fit by reducing it to its ascii equivalent (essentially stripping off
124 // an unusual accent mark).
125 const UnicodeLatinMap::Entry *map_entry =
127 if (map_entry != nullptr && map_entry->_ascii_equiv != 0) {
128 // Yes, it has an ascii equivalent.
129 if (map_entry->_ascii_additional != 0) {
130 // In fact, it has two of them.
131 return
132 string(1, map_entry->_ascii_equiv) +
133 string(1, map_entry->_ascii_additional);
134 }
135 return string(1, map_entry->_ascii_equiv);
136 }
137 // Nope; return "." for lack of anything better.
138 return ".";
139 }
140
141 case E_utf8:
142 if ((ch & ~0x7f) == 0) {
143 return string(1, (char)ch);
144 } else if ((ch & ~0x7ff) == 0) {
145 return
146 string(1, (char)((ch >> 6) | 0xc0)) +
147 string(1, (char)((ch & 0x3f) | 0x80));
148 } else if ((ch & ~0xffff) == 0) {
149 return
150 string(1, (char)((ch >> 12) | 0xe0)) +
151 string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
152 string(1, (char)((ch & 0x3f) | 0x80));
153 } else {
154 return
155 string(1, (char)((ch >> 18) | 0xf0)) +
156 string(1, (char)(((ch >> 12) & 0x3f) | 0x80)) +
157 string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
158 string(1, (char)((ch & 0x3f) | 0x80));
159 }
160
161 case E_utf16be:
162 if ((ch & ~0xffff) == 0) {
163 // Note that this passes through surrogates and BOMs unharmed.
164 return
165 string(1, (char)(ch >> 8)) +
166 string(1, (char)(ch & 0xff));
167 } else {
168 // Use a surrogate pair.
169 uint32_t v = (uint32_t)ch - 0x10000u;
170 uint16_t hi = (v >> 10u) | 0xd800u;
171 uint16_t lo = (v & 0x3ffu) | 0xdc00u;
172 char encoded[4] = {
173 (char)(hi >> 8),
174 (char)(hi & 0xff),
175 (char)(lo >> 8),
176 (char)(lo & 0xff),
177 };
178 return string(encoded, 4);
179 }
180 }
181
182 return "";
183}
184
185/**
186 * Encodes a wide-text string into a single-char string, according to the
187 * given encoding.
188 */
190encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) {
191 string result;
192
193 for (size_t i = 0; i < wtext.size(); ++i) {
194 wchar_t ch = wtext[i];
195
196 // On some systems, wstring may be UTF-16, and contain surrogate pairs.
197#if WCHAR_MAX < 0x10FFFF
198 if (ch >= 0xd800 && ch < 0xdc00 && (i + 1) < wtext.size()) {
199 // This is a high surrogate. Look for a subsequent low surrogate.
200 wchar_t ch2 = wtext[i + 1];
201 if (ch2 >= 0xdc00 && ch2 < 0xe000) {
202 // Yes, this is a low surrogate.
203 char32_t code_point = 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
204 result += encode_wchar(code_point, encoding);
205 i++;
206 continue;
207 }
208 }
209#endif
210
211 result += encode_wchar(ch, encoding);
212 }
213
214 return result;
215}
216
217/**
218 * Returns the given wstring decoded to a single-byte string, via the given
219 * encoding system.
220 */
222decode_text(const string &text, TextEncoder::Encoding encoding) {
223 switch (encoding) {
224 case E_utf8:
225 {
226 StringUtf8Decoder decoder(text);
227 return decode_text_impl(decoder);
228 }
229
230 case E_utf16be:
231 {
232 StringUtf16Decoder decoder(text);
233 return decode_text_impl(decoder);
234 }
235
236 case E_iso8859:
237 default:
238 {
239 StringDecoder decoder(text);
240 return decode_text_impl(decoder);
241 }
242 };
243}
244
245/**
246 * Decodes the eight-bit stream from the indicated decoder, returning the
247 * decoded wide-char string.
248 */
249wstring TextEncoder::
250decode_text_impl(StringDecoder &decoder) {
251 wstring result;
252 // bool expand_amp = get_expand_amp();
253
254 char32_t character = decoder.get_next_character();
255 while (!decoder.is_eof()) {
256 /*
257 if (character == '&' && expand_amp) {
258 // An ampersand in expand_amp mode is treated as an escape character.
259 character = expand_amp_sequence(decoder);
260 }
261 */
262 if (character <= WCHAR_MAX) {
263 result += character;
264 } else {
265 // We need to encode this as a surrogate pair.
266 uint32_t v = (uint32_t)character - 0x10000u;
267 result += (wchar_t)((v >> 10u) | 0xd800u);
268 result += (wchar_t)((v & 0x3ffu) | 0xdc00u);
269 }
270 character = decoder.get_next_character();
271 }
272
273 return result;
274}
275
276/**
277 * Given that we have just read an ampersand from the StringDecoder, and that
278 * we have expand_amp in effect and are therefore expected to expand the
279 * sequence that this ampersand begins into a single unicode character, do the
280 * expansion and return the character.
281 */
282/*
283int TextEncoder::
284expand_amp_sequence(StringDecoder &decoder) const {
285 int result = 0;
286
287 int character = decoder.get_next_character();
288 if (!decoder.is_eof() && character == '#') {
289 // An explicit numeric sequence: &#nnn;
290 result = 0;
291 character = decoder.get_next_character();
292 while (!decoder.is_eof() && character < 128 && isdigit((unsigned int)character)) {
293 result = (result * 10) + (character - '0');
294 character = decoder.get_next_character();
295 }
296 if (character != ';') {
297 // Invalid sequence.
298 return 0;
299 }
300
301 return result;
302 }
303
304 string sequence;
305
306 // Some non-numeric sequence.
307 while (!decoder.is_eof() && character < 128 && isalpha((unsigned int)character)) {
308 sequence += character;
309 character = decoder.get_next_character();
310 }
311 if (character != ';') {
312 // Invalid sequence.
313 return 0;
314 }
315
316 static const struct {
317 const char *name;
318 int code;
319 } tokens[] = {
320 { "amp", '&' }, { "lt", '<' }, { "gt", '>' }, { "quot", '"' },
321 { "nbsp", ' ' },
322
323 { "iexcl", 161 }, { "cent", 162 }, { "pound", 163 }, { "curren", 164 },
324 { "yen", 165 }, { "brvbar", 166 }, { "brkbar", 166 }, { "sect", 167 },
325 { "uml", 168 }, { "die", 168 }, { "copy", 169 }, { "ordf", 170 },
326 { "laquo", 171 }, { "not", 172 }, { "shy", 173 }, { "reg", 174 },
327 { "macr", 175 }, { "hibar", 175 }, { "deg", 176 }, { "plusmn", 177 },
328 { "sup2", 178 }, { "sup3", 179 }, { "acute", 180 }, { "micro", 181 },
329 { "para", 182 }, { "middot", 183 }, { "cedil", 184 }, { "sup1", 185 },
330 { "ordm", 186 }, { "raquo", 187 }, { "frac14", 188 }, { "frac12", 189 },
331 { "frac34", 190 }, { "iquest", 191 }, { "Agrave", 192 }, { "Aacute", 193 },
332 { "Acirc", 194 }, { "Atilde", 195 }, { "Auml", 196 }, { "Aring", 197 },
333 { "AElig", 198 }, { "Ccedil", 199 }, { "Egrave", 200 }, { "Eacute", 201 },
334 { "Ecirc", 202 }, { "Euml", 203 }, { "Igrave", 204 }, { "Iacute", 205 },
335 { "Icirc", 206 }, { "Iuml", 207 }, { "ETH", 208 }, { "Dstrok", 208 },
336 { "Ntilde", 209 }, { "Ograve", 210 }, { "Oacute", 211 }, { "Ocirc", 212 },
337 { "Otilde", 213 }, { "Ouml", 214 }, { "times", 215 }, { "Oslash", 216 },
338 { "Ugrave", 217 }, { "Uacute", 218 }, { "Ucirc", 219 }, { "Uuml", 220 },
339 { "Yacute", 221 }, { "THORN", 222 }, { "szlig", 223 }, { "agrave", 224 },
340 { "aacute", 225 }, { "acirc", 226 }, { "atilde", 227 }, { "auml", 228 },
341 { "aring", 229 }, { "aelig", 230 }, { "ccedil", 231 }, { "egrave", 232 },
342 { "eacute", 233 }, { "ecirc", 234 }, { "euml", 235 }, { "igrave", 236 },
343 { "iacute", 237 }, { "icirc", 238 }, { "iuml", 239 }, { "eth", 240 },
344 { "ntilde", 241 }, { "ograve", 242 }, { "oacute", 243 }, { "ocirc", 244 },
345 { "otilde", 245 }, { "ouml", 246 }, { "divide", 247 }, { "oslash", 248 },
346 { "ugrave", 249 }, { "uacute", 250 }, { "ucirc", 251 }, { "uuml", 252 },
347 { "yacute", 253 }, { "thorn", 254 }, { "yuml", 255 },
348
349 { NULL, 0 },
350 };
351
352 for (int i = 0; tokens[i].name != NULL; i++) {
353 if (sequence == tokens[i].name) {
354 // Here's a match.
355 return tokens[i].code;
356 }
357 }
358
359 // Some unrecognized sequence.
360 return 0;
361}
362*/
363
364/**
365 * Called whenever the text has been changed.
366 */
367void TextEncoder::
368text_changed() {
369}
370
371/**
372 *
373 */
374ostream &
375operator << (ostream &out, TextEncoder::Encoding encoding) {
376 switch (encoding) {
377 case TextEncoder::E_iso8859:
378 return out << "iso8859";
379
380 case TextEncoder::E_utf8:
381 return out << "utf8";
382
383 case TextEncoder::E_utf16be:
384 return out << "utf16be";
385 };
386
387 return out << "**invalid TextEncoder::Encoding(" << (int)encoding << ")**";
388}
389
390/**
391 *
392 */
393istream &
394operator >> (istream &in, TextEncoder::Encoding &encoding) {
395 string word;
396 in >> word;
397
398 if (word == "iso8859") {
399 encoding = TextEncoder::E_iso8859;
400 } else if (word == "utf8" || word == "utf-8") {
401 encoding = TextEncoder::E_utf8;
402 } else if (word == "unicode" || word == "utf16be" || word == "utf-16be" ||
403 word == "utf16-be" || word == "utf-16-be") {
404 encoding = TextEncoder::E_utf16be;
405 } else {
406 ostream *notify_ptr = StringDecoder::get_notify_ptr();
407 if (notify_ptr != nullptr) {
408 (*notify_ptr)
409 << "Invalid TextEncoder::Encoding: " << word << "\n";
410 }
411 encoding = TextEncoder::E_iso8859;
412 }
413
414 return in;
415}
The base class to a family of classes that decode various kinds of encoded byte streams.
bool is_eof()
Returns true if the decoder has returned the last character in the string, false if there are more to...
virtual char32_t get_next_character()
Returns the next character in sequence.
static std::ostream * get_notify_ptr()
Returns the ostream that is used to write error messages to.
This decoder extracts characters two at a time to get a plain wide character sequence.
This decoder extracts utf-8 sequences.
std::wstring decode_text(const std::string &text) const
Returns the given single-byte string decoded to a wide-character string, via the current encoding system.
static std::string encode_wchar(char32_t ch, Encoding encoding)
Encodes a single Unicode character into a one-, two-, three-, or four-byte string,...
static int unicode_tolower(char32_t character)
Returns the lowercase equivalent of the given Unicode character.
bool is_wtext() const
Returns true if any of the characters in the string returned by get_wtext() are out of the range of a...
static int unicode_toupper(char32_t character)
Returns the uppercase equivalent of the given Unicode character.
std::wstring get_wtext_as_ascii() const
Returns the text associated with the node, converted as nearly as possible to a fully-ASCII represent...
const std::wstring & get_wtext() const
Returns the text associated with the TextEncoder, as a wide-character string.
void make_lower()
Adjusts the text stored within the encoder to all lowercase letters (preserving accent marks correctl...
void make_upper()
Adjusts the text stored within the encoder to all uppercase letters (preserving accent marks correctl...
std::string encode_wtext(const std::wstring &wtext) const
Encodes a wide-text string into a single-char string, according to the current encoding.
static const Entry * look_up(char32_t character)
Returns the Entry associated with the indicated character, if there is one.
PANDA 3D SOFTWARE Copyright (c) Carnegie Mellon University.
PANDA 3D SOFTWARE Copyright (c) Carnegie Mellon University.
PANDA 3D SOFTWARE Copyright (c) Carnegie Mellon University.
PANDA 3D SOFTWARE Copyright (c) Carnegie Mellon University.