Panda3D
 All Classes Functions Variables Enumerations
stringDecoder.cxx
00001 // Filename: stringDecoder.cxx
00002 // Created by:  drose (11Feb02)
00003 //
00004 ////////////////////////////////////////////////////////////////////
00005 //
00006 // PANDA 3D SOFTWARE
00007 // Copyright (c) Carnegie Mellon University.  All rights reserved.
00008 //
00009 // All use of this software is subject to the terms of the revised BSD
00010 // license.  You should have received a copy of this license along
00011 // with this source code in a file named "LICENSE."
00012 //
00013 ////////////////////////////////////////////////////////////////////
00014 
00015 #include "stringDecoder.h"
00016 #include "config_dtoolutil.h"
00017 
00018 ostream *StringDecoder::_notify_ptr = &cerr;
00019 
00020 ////////////////////////////////////////////////////////////////////
00021 //     Function: StringDecoder::Destructor
00022 //       Access: Public, Virtual
00023 //  Description: 
00024 ////////////////////////////////////////////////////////////////////
00025 StringDecoder::
00026 ~StringDecoder() {
00027 }
00028 
00029 ////////////////////////////////////////////////////////////////////
00030 //     Function: StringDecoder::get_next_character
00031 //       Access: Public, Virtual
00032 //  Description: Returns the next character in sequence.
00033 ////////////////////////////////////////////////////////////////////
00034 int StringDecoder::
00035 get_next_character() {
00036   if (test_eof()) {
00037     return -1;
00038   }
00039   return (unsigned char)_input[_p++];
00040 }
00041 
00042 ////////////////////////////////////////////////////////////////////
00043 //     Function: StringDecoder::set_notify_ptr
00044 //       Access: Public, Static
00045 //  Description: Sets the ostream that is used to write error messages
00046 //               to.  This is necessary because of the low-level
00047 //               placement of this class, before the definition of the
00048 //               NotifyCategory class, so it cannot specify its own
00049 //               notify.
00050 ////////////////////////////////////////////////////////////////////
00051 void StringDecoder::
00052 set_notify_ptr(ostream *notify_ptr) {
00053   _notify_ptr = notify_ptr;
00054 }
00055 
00056 ////////////////////////////////////////////////////////////////////
00057 //     Function: StringDecoder::get_notify_ptr
00058 //       Access: Public, Static
00059 //  Description: Returns the ostream that is used to write error messages
00060 //               to.  See set_notify_ptr().
00061 ////////////////////////////////////////////////////////////////////
00062 ostream *StringDecoder::
00063 get_notify_ptr() {
00064   return _notify_ptr;
00065 }
00066 
00067 
00068 /*
00069 In UTF-8, each 16-bit Unicode character is encoded as a sequence of
00070 one, two, or three 8-bit bytes, depending on the value of the
00071 character. The following table shows the format of such UTF-8 byte
00072 sequences (where the "free bits" shown by x's in the table are
00073 combined in the order shown, and interpreted from most significant to
00074 least significant):
00075 
00076  Binary format of bytes in sequence:
00077                                         Number of    Maximum expressible
00078  1st byte     2nd byte    3rd byte      free bits:      Unicode value:
00079 
00080  0xxxxxxx                                  7           007F hex   (127)
00081  110xxxxx     10xxxxxx                  (5+6)=11       07FF hex  (2047)
00082  1110xxxx     10xxxxxx    10xxxxxx     (4+6+6)=16      FFFF hex (65535)
00083 
00084 The value of each individual byte indicates its UTF-8 function, as follows:
00085 
00086  00 to 7F hex   (0 to 127):  first and only byte of a sequence.
00087  80 to BF hex (128 to 191):  continuing byte in a multi-byte sequence.
00088  C2 to DF hex (194 to 223):  first byte of a two-byte sequence.
00089  E0 to EF hex (224 to 239):  first byte of a three-byte sequence.
00090 */
00091 
00092 ////////////////////////////////////////////////////////////////////
00093 //     Function: StringUtf8Decoder::get_next_character
00094 //       Access: Public, Virtual
00095 //  Description: Returns the next character in sequence.
00096 ////////////////////////////////////////////////////////////////////
00097 int StringUtf8Decoder::
00098 get_next_character() {
00099   unsigned int result;
00100   while (!test_eof()) {
00101     result = (unsigned char)_input[_p++];
00102     if ((result & 0x80) == 0) {
00103       // A 7-bit ascii value in one byte.
00104       return result;
00105 
00106     } if ((result & 0xe0) == 0xc0) {
00107       // First byte of two.
00108       unsigned int two = 0;
00109       if (test_eof()) {
00110         if (_notify_ptr != NULL) {
00111           (*_notify_ptr)
00112             << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
00113         }
00114         return -1;
00115       }
00116       two = (unsigned char)_input[_p++];
00117       result = ((result & 0x1f) << 6) | (two & 0x3f);
00118       return result;
00119       
00120     } else if ((result & 0xf0) == 0xe0) {
00121       // First byte of three.
00122       if (test_eof()) {
00123         if (_notify_ptr != NULL) {
00124           (*_notify_ptr)
00125             << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
00126         }
00127         return -1;
00128       }
00129       unsigned int two = (unsigned char)_input[_p++];
00130       if (test_eof()) {
00131         if (_notify_ptr != NULL) {
00132           (*_notify_ptr)
00133             << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
00134         }
00135         return -1;
00136       }
00137       unsigned int three = (unsigned char)_input[_p++];
00138       result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
00139       return result;
00140     }
00141 
00142     // Otherwise--the high bit is set but it is not one of the
00143     // introductory utf-8 bytes--we have an error.
00144     if (_notify_ptr != NULL) {
00145       (*_notify_ptr)
00146         << "Non utf-8 byte in string: 0x" << hex << result << dec
00147         << ", string is '" << _input << "'\n";
00148     }
00149     return -1;
00150   }
00151 
00152   // End of string reached.
00153   return -1;
00154 }
00155 
00156 ////////////////////////////////////////////////////////////////////
00157 //     Function: StringUnicodeDecoder::get_next_character
00158 //       Access: Public, Virtual
00159 //  Description: Returns the next character in sequence.
00160 ////////////////////////////////////////////////////////////////////
00161 int StringUnicodeDecoder::
00162 get_next_character() {
00163   if (test_eof()) {
00164     return -1;
00165   }
00166 
00167   unsigned int high = (unsigned char)_input[_p++];
00168   if (test_eof()) {
00169     if (_notify_ptr != NULL) {
00170       (*_notify_ptr)
00171         << "Unicode-encoded string has odd number of bytes.\n";
00172     }
00173     return -1;
00174   }
00175   unsigned int low = (unsigned char)_input[_p++];
00176   return ((high << 8) | low);
00177 }
 All Classes Functions Variables Enumerations