Panda3D
stringDecoder.cxx
Go to the documentation of this file.
1 /**
2  * PANDA 3D SOFTWARE
3  * Copyright (c) Carnegie Mellon University. All rights reserved.
4  *
5  * All use of this software is subject to the terms of the revised BSD
6  * license. You should have received a copy of this license along
7  * with this source code in a file named "LICENSE."
8  *
9  * @file stringDecoder.cxx
10  * @author drose
11  * @date 2002-02-11
12  */
13 
14 #include "stringDecoder.h"
15 #include "config_dtoolutil.h"
16 
// Stream that the decoders write their error messages to.  It starts out
// pointing at std::cerr and can be replaced through set_notify_ptr().
17 std::ostream *StringDecoder::_notify_ptr = &std::cerr;
18 
19 /**
20  *
21  */
22 StringDecoder::
23 ~StringDecoder() {
24 }
25 
26 /**
27  * Returns the next character in sequence.
28  */
29 char32_t StringDecoder::
31  if (test_eof()) {
32  return -1;
33  }
34  return (unsigned char)_input[_p++];
35 }
36 
37 /**
38  * Sets the ostream that is used to write error messages to. This is
39  * necessary because of the low-level placement of this class, before the
40  * definition of the NotifyCategory class, so it cannot specify its own
41  * notify.
42  */
43 void StringDecoder::
44 set_notify_ptr(std::ostream *notify_ptr) {
45  _notify_ptr = notify_ptr;
46 }
47 
48 /**
49  * Returns the ostream that is used to write error messages to. See
50  * set_notify_ptr().
51  */
52 std::ostream *StringDecoder::
54  return _notify_ptr;
55 }
56 
57 
/*
In UTF-8, each Unicode code point is encoded as a sequence of one, two,
three or four 8-bit bytes, depending on the value of the code point.  The
following table shows the format of such UTF-8 byte sequences (where the
"free bits" shown by x's in the table are combined in the order shown, and
interpreted from most significant to least significant):

 Binary format of bytes in sequence:
                                         Number of    Maximum expressible
 1st byte  2nd byte  3rd byte  4th byte  free bits:   Unicode value:

 0xxxxxxx                                7            007F hex (127)
 110xxxxx  10xxxxxx                      (5+6)=11     07FF hex (2047)
 1110xxxx  10xxxxxx  10xxxxxx            (4+6+6)=16   FFFF hex (65535)
 11110xxx  10xxxxxx  10xxxxxx  10xxxxxx  (3+6*3)=21   10FFFF hex (1114111)

The value of each individual byte indicates its UTF-8 function, as follows:

 00 to 7F hex (0 to 127): first and only byte of a sequence.
 80 to BF hex (128 to 191): continuing byte in a multi-byte sequence.
 C2 to DF hex (194 to 223): first byte of a two-byte sequence.
 E0 to EF hex (224 to 239): first byte of a three-byte sequence.
 F0 to F7 hex (240 to 247): first byte of a four-byte sequence.
*/
83 
84 /**
85  * Returns the next character in sequence.
86  */
87 char32_t StringUtf8Decoder::
89  unsigned int result;
90  while (!test_eof()) {
91  result = (unsigned char)_input[_p++];
92  if ((result & 0x80) == 0) {
93  // A 7-bit ascii value in one byte.
94  return result;
95 
96  } if ((result & 0xe0) == 0xc0) {
97  // First byte of two.
98  unsigned int two = 0;
99  if (test_eof()) {
100  if (_notify_ptr != nullptr) {
101  (*_notify_ptr)
102  << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
103  }
104  return -1;
105  }
106  two = (unsigned char)_input[_p++];
107  result = ((result & 0x1f) << 6) | (two & 0x3f);
108  return result;
109 
110  } else if ((result & 0xf0) == 0xe0) {
111  // First byte of three.
112  if (test_eof()) {
113  if (_notify_ptr != nullptr) {
114  (*_notify_ptr)
115  << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
116  }
117  return -1;
118  }
119  unsigned int two = (unsigned char)_input[_p++];
120  if (test_eof()) {
121  if (_notify_ptr != nullptr) {
122  (*_notify_ptr)
123  << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
124  }
125  return -1;
126  }
127  unsigned int three = (unsigned char)_input[_p++];
128  result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
129  return result;
130 
131  } else if ((result & 0xf8) == 0xf0) {
132  // First byte of four.
133  if (test_eof()) {
134  if (_notify_ptr != nullptr) {
135  (*_notify_ptr)
136  << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
137  }
138  return -1;
139  }
140  unsigned int two = (unsigned char)_input[_p++];
141  if (test_eof()) {
142  if (_notify_ptr != nullptr) {
143  (*_notify_ptr)
144  << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
145  }
146  return -1;
147  }
148  unsigned int three = (unsigned char)_input[_p++];
149  if (test_eof()) {
150  if (_notify_ptr != nullptr) {
151  (*_notify_ptr)
152  << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
153  }
154  return -1;
155  }
156  unsigned int four = (unsigned char)_input[_p++];
157  result = ((result & 0x07) << 18) | ((two & 0x3f) << 12) | ((three & 0x3f) << 6) | (four & 0x3f);
158  return result;
159  }
160 
161  // Otherwise--the high bit is set but it is not one of the introductory
162  // utf-8 bytes--we have an error.
163  if (_notify_ptr != nullptr) {
164  (*_notify_ptr)
165  << "Non utf-8 byte in string: 0x" << std::hex << result << std::dec
166  << ", string is '" << _input << "'\n";
167  }
168  return -1;
169  }
170 
171  // End of string reached.
172  return -1;
173 }
174 
175 /**
176  * Returns the next character in sequence.
177  */
178 char32_t StringUtf16Decoder::
180  if (test_eof()) {
181  return -1;
182  }
183 
184  unsigned int high = (unsigned char)_input[_p++];
185  if (test_eof()) {
186  if (_notify_ptr != nullptr) {
187  (*_notify_ptr)
188  << "Unicode-encoded string has odd number of bytes.\n";
189  }
190  return -1;
191  }
192  unsigned int low = (unsigned char)_input[_p++];
193  int ch = ((high << 8) | low);
194 
195  /*
196  using std::swap;
197 
198  if (ch == 0xfffe) {
199  // This is a byte-swapped byte-order-marker. That means we need to swap
200  // the endianness of the rest of the stream.
201  char *data = (char *)_input.data();
202  for (size_t p = _p; p < _input.size() - 1; p += 2) {
203  std::swap(data[p], data[p + 1]);
204  }
205  ch = 0xfeff;
206  }
207  */
208 
209  if (ch >= 0xd800 && ch < 0xdc00 && (_p + 1) < _input.size()) {
210  // This is a high surrogate. Look for a subsequent low surrogate.
211  unsigned int high = (unsigned char)_input[_p];
212  unsigned int low = (unsigned char)_input[_p + 1];
213  int ch2 = ((high << 8) | low);
214  if (ch2 >= 0xdc00 && ch2 < 0xe000) {
215  // Yes, this is a low surrogate.
216  _p += 2;
217  return 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
218  }
219  }
220  // No, this is just a regular character, or an unpaired surrogate.
221  return ch;
222 }
virtual char32_t get_next_character()
Returns the next character in sequence.
virtual char32_t get_next_character()
Returns the next character in sequence.
PANDA 3D SOFTWARE Copyright (c) Carnegie Mellon University.
virtual char32_t get_next_character()
Returns the next character in sequence.
static void set_notify_ptr(std::ostream *ptr)
Sets the ostream that is used to write error messages to.
static std::ostream * get_notify_ptr()
Returns the ostream that is used to write error messages to.
PANDA 3D SOFTWARE Copyright (c) Carnegie Mellon University.