Panda3D
Loading...
Searching...
No Matches
stringDecoder.cxx
Go to the documentation of this file.
1/**
2 * PANDA 3D SOFTWARE
3 * Copyright (c) Carnegie Mellon University. All rights reserved.
4 *
5 * All use of this software is subject to the terms of the revised BSD
6 * license. You should have received a copy of this license along
7 * with this source code in a file named "LICENSE."
8 *
9 * @file stringDecoder.cxx
10 * @author drose
11 * @date 2002-02-11
12 */
13
14#include "stringDecoder.h"
15#include "config_dtoolutil.h"
16
17std::ostream *StringDecoder::_notify_ptr = &std::cerr;
18
19/**
20 *
21 */
22StringDecoder::
23~StringDecoder() {
24}
25
26/**
27 * Returns the next character in sequence.
28 */
31 if (test_eof()) {
32 return -1;
33 }
34 return (unsigned char)_input[_p++];
35}
36
37/**
38 * Sets the ostream that is used to write error messages to. This is
39 * necessary because of the low-level placement of this class, before the
40 * definition of the NotifyCategory class, so it cannot specify its own
41 * notify.
42 */
44set_notify_ptr(std::ostream *notify_ptr) {
45 _notify_ptr = notify_ptr;
46}
47
48/**
49 * Returns the ostream that is used to write error messages to. See
50 * set_notify_ptr().
51 */
52std::ostream *StringDecoder::
54 return _notify_ptr;
55}
56
57
/*
In UTF-8, each Unicode code point is encoded as a sequence of one, two,
three or four 8-bit bytes, depending on the value of the code point.  The
following table shows the format of such UTF-8 byte sequences (where the
"free bits" shown by x's in the table are combined in the order shown, and
interpreted from most significant to least significant):

 Binary format of bytes in sequence:
                                              Number of     Maximum expressible
 1st byte   2nd byte   3rd byte   4th byte    free bits:    Unicode value:

 0xxxxxxx                                     7             007F hex (127)
 110xxxxx   10xxxxxx                          (5+6)=11      07FF hex (2047)
 1110xxxx   10xxxxxx   10xxxxxx               (4+6+6)=16    FFFF hex (65535)
 11110xxx   10xxxxxx   10xxxxxx   10xxxxxx    (3+6*3)=21    10FFFF hex (1114111)

The value of each individual byte indicates its UTF-8 function, as follows:

 00 to 7F hex (0 to 127): first and only byte of a sequence.
 80 to BF hex (128 to 191): continuing byte in a multi-byte sequence.
 C2 to DF hex (194 to 223): first byte of a two-byte sequence.
 E0 to EF hex (224 to 239): first byte of a three-byte sequence.
 F0 to F7 hex (240 to 247): first byte of a four-byte sequence.
*/
83
84/**
85 * Returns the next character in sequence.
86 */
89 unsigned int result;
90 while (!test_eof()) {
91 result = (unsigned char)_input[_p++];
92 if ((result & 0x80) == 0) {
93 // A 7-bit ascii value in one byte.
94 return result;
95
96 } if ((result & 0xe0) == 0xc0) {
97 // First byte of two.
98 unsigned int two = 0;
99 if (test_eof()) {
100 if (_notify_ptr != nullptr) {
101 (*_notify_ptr)
102 << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
103 }
104 return -1;
105 }
106 two = (unsigned char)_input[_p++];
107 result = ((result & 0x1f) << 6) | (two & 0x3f);
108 return result;
109
110 } else if ((result & 0xf0) == 0xe0) {
111 // First byte of three.
112 if (test_eof()) {
113 if (_notify_ptr != nullptr) {
114 (*_notify_ptr)
115 << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
116 }
117 return -1;
118 }
119 unsigned int two = (unsigned char)_input[_p++];
120 if (test_eof()) {
121 if (_notify_ptr != nullptr) {
122 (*_notify_ptr)
123 << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
124 }
125 return -1;
126 }
127 unsigned int three = (unsigned char)_input[_p++];
128 result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
129 return result;
130
131 } else if ((result & 0xf8) == 0xf0) {
132 // First byte of four.
133 if (test_eof()) {
134 if (_notify_ptr != nullptr) {
135 (*_notify_ptr)
136 << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
137 }
138 return -1;
139 }
140 unsigned int two = (unsigned char)_input[_p++];
141 if (test_eof()) {
142 if (_notify_ptr != nullptr) {
143 (*_notify_ptr)
144 << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
145 }
146 return -1;
147 }
148 unsigned int three = (unsigned char)_input[_p++];
149 if (test_eof()) {
150 if (_notify_ptr != nullptr) {
151 (*_notify_ptr)
152 << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
153 }
154 return -1;
155 }
156 unsigned int four = (unsigned char)_input[_p++];
157 result = ((result & 0x07) << 18) | ((two & 0x3f) << 12) | ((three & 0x3f) << 6) | (four & 0x3f);
158 return result;
159 }
160
161 // Otherwise--the high bit is set but it is not one of the introductory
162 // utf-8 bytes--we have an error.
163 if (_notify_ptr != nullptr) {
164 (*_notify_ptr)
165 << "Non utf-8 byte in string: 0x" << std::hex << result << std::dec
166 << ", string is '" << _input << "'\n";
167 }
168 return -1;
169 }
170
171 // End of string reached.
172 return -1;
173}
174
175/**
176 * Returns the next character in sequence.
177 */
180 if (test_eof()) {
181 return -1;
182 }
183
184 unsigned int high = (unsigned char)_input[_p++];
185 if (test_eof()) {
186 if (_notify_ptr != nullptr) {
187 (*_notify_ptr)
188 << "Unicode-encoded string has odd number of bytes.\n";
189 }
190 return -1;
191 }
192 unsigned int low = (unsigned char)_input[_p++];
193 int ch = ((high << 8) | low);
194
195 /*
196 using std::swap;
197
198 if (ch == 0xfffe) {
199 // This is a byte-swapped byte-order-marker. That means we need to swap
200 // the endianness of the rest of the stream.
201 char *data = (char *)_input.data();
202 for (size_t p = _p; p < _input.size() - 1; p += 2) {
203 std::swap(data[p], data[p + 1]);
204 }
205 ch = 0xfeff;
206 }
207 */
208
209 if (ch >= 0xd800 && ch < 0xdc00 && (_p + 1) < _input.size()) {
210 // This is a high surrogate. Look for a subsequent low surrogate.
211 unsigned int high = (unsigned char)_input[_p];
212 unsigned int low = (unsigned char)_input[_p + 1];
213 int ch2 = ((high << 8) | low);
214 if (ch2 >= 0xdc00 && ch2 < 0xe000) {
215 // Yes, this is a low surrogate.
216 _p += 2;
217 return 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
218 }
219 }
220 // No, this is just a regular character, or an unpaired surrogate.
221 return ch;
222}
static void set_notify_ptr(std::ostream *ptr)
Sets the ostream that is used to write error messages to.
virtual char32_t get_next_character()
Returns the next character in sequence.
static std::ostream * get_notify_ptr()
Returns the ostream that is used to write error messages to.
virtual char32_t get_next_character()
Returns the next character in sequence.
virtual char32_t get_next_character()
Returns the next character in sequence.
PANDA 3D SOFTWARE Copyright (c) Carnegie Mellon University.
PANDA 3D SOFTWARE Copyright (c) Carnegie Mellon University.