Panda3D

stringDecoder.cxx

00001 // Filename: stringDecoder.cxx
00002 // Created by:  drose (11Feb02)
00003 //
00004 ////////////////////////////////////////////////////////////////////
00005 //
00006 // PANDA 3D SOFTWARE
00007 // Copyright (c) Carnegie Mellon University.  All rights reserved.
00008 //
00009 // All use of this software is subject to the terms of the revised BSD
00010 // license.  You should have received a copy of this license along
00011 // with this source code in a file named "LICENSE."
00012 //
00013 ////////////////////////////////////////////////////////////////////
00014 
00015 #include "stringDecoder.h"
00016 #include "config_express.h"
00017 
00018 ////////////////////////////////////////////////////////////////////
00019 //     Function: StringDecoder::Destructor
00020 //       Access: Public, Virtual
00021 //  Description: 
00022 ////////////////////////////////////////////////////////////////////
00023 StringDecoder::
00024 ~StringDecoder() {
00025 }
00026 
00027 ////////////////////////////////////////////////////////////////////
00028 //     Function: StringDecoder::get_next_character
00029 //       Access: Public, Virtual
00030 //  Description: Returns the next character in sequence.
00031 ////////////////////////////////////////////////////////////////////
00032 int StringDecoder::
00033 get_next_character() {
00034   if (test_eof()) {
00035     return -1;
00036   }
00037   return (unsigned char)_input[_p++];
00038 }
00039 
00040 /*
00041 In UTF-8, each 16-bit Unicode character is encoded as a sequence of
00042 one, two, or three 8-bit bytes, depending on the value of the
00043 character. The following table shows the format of such UTF-8 byte
00044 sequences (where the "free bits" shown by x's in the table are
00045 combined in the order shown, and interpreted from most significant to
00046 least significant):
00047 
00048  Binary format of bytes in sequence:
00049                                         Number of    Maximum expressible
00050  1st byte     2nd byte    3rd byte      free bits:      Unicode value:
00051 
00052  0xxxxxxx                                  7           007F hex   (127)
00053  110xxxxx     10xxxxxx                  (5+6)=11       07FF hex  (2047)
00054  1110xxxx     10xxxxxx    10xxxxxx     (4+6+6)=16      FFFF hex (65535)
00055 
00056 The value of each individual byte indicates its UTF-8 function, as follows:
00057 
00058  00 to 7F hex   (0 to 127):  first and only byte of a sequence.
00059  80 to BF hex (128 to 191):  continuing byte in a multi-byte sequence.
00060  C2 to DF hex (194 to 223):  first byte of a two-byte sequence.
00061  E0 to EF hex (224 to 239):  first byte of a three-byte sequence.
00062 */
00063 
00064 ////////////////////////////////////////////////////////////////////
00065 //     Function: StringUtf8Decoder::get_next_character
00066 //       Access: Public, Virtual
00067 //  Description: Returns the next character in sequence.
00068 ////////////////////////////////////////////////////////////////////
00069 int StringUtf8Decoder::
00070 get_next_character() {
00071   unsigned int result;
00072   while (!test_eof()) {
00073     result = (unsigned char)_input[_p++];
00074     if ((result & 0x80) == 0) {
00075       // A 7-bit ascii value in one byte.
00076       return result;
00077 
00078     } if ((result & 0xe0) == 0xc0) {
00079       // First byte of two.
00080       unsigned int two = 0;
00081       if (test_eof()) {
00082         express_cat.warning()
00083           << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
00084         return -1;
00085       }
00086       two = (unsigned char)_input[_p++];
00087       result = ((result & 0x1f) << 6) | (two & 0x3f);
00088       return result;
00089       
00090     } else if ((result & 0xf0) == 0xe0) {
00091       // First byte of three.
00092       if (test_eof()) {
00093         express_cat.warning()
00094           << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
00095         return -1;
00096       }
00097       unsigned int two = (unsigned char)_input[_p++];
00098       if (test_eof()) {
00099         express_cat.warning()
00100           << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
00101         return -1;
00102       }
00103       unsigned int three = (unsigned char)_input[_p++];
00104       result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
00105       return result;
00106     }
00107 
00108     // Otherwise--the high bit is set but it is not one of the
00109     // introductory utf-8 bytes--we have an error.
00110     express_cat.warning()
00111       << "Non utf-8 byte in string: 0x" << hex << result << dec
00112       << ", string is '" << _input << "'\n";
00113     nassertr(false, -1);
00114   }
00115 
00116   // End of string reached.
00117   return -1;
00118 }
00119 
00120 ////////////////////////////////////////////////////////////////////
00121 //     Function: StringUnicodeDecoder::get_next_character
00122 //       Access: Public, Virtual
00123 //  Description: Returns the next character in sequence.
00124 ////////////////////////////////////////////////////////////////////
00125 int StringUnicodeDecoder::
00126 get_next_character() {
00127   if (test_eof()) {
00128     return -1;
00129   }
00130 
00131   unsigned int high = (unsigned char)_input[_p++];
00132   if (test_eof()) {
00133     express_cat.warning()
00134       << "Unicode-encoded string has odd number of bytes.\n";
00135     return -1;
00136   }
00137   unsigned int low = (unsigned char)_input[_p++];
00138   return ((high << 8) | low);
00139 }
 All Classes Functions Variables Enumerations