mirror of https://github.com/python/cpython.git
Backport r57105 and r57145 from the py3k branch: UTF-32 codecs.
This commit is contained in:
parent
437e6a3b15
commit
6e39080649
|
@ -1301,6 +1301,79 @@ These are the UTF-8 codec APIs:
|
|||
object. Error handling is "strict". Return *NULL* if an exception was raised
|
||||
by the codec.
|
||||
|
||||
These are the UTF-32 codec APIs:
|
||||
|
||||
.. % --- UTF-32 Codecs ------------------------------------------------------ */
|
||||
|
||||
|
||||
.. cfunction:: PyObject* PyUnicode_DecodeUTF32(const char *s, Py_ssize_t size, const char *errors, int *byteorder)
|
||||
|
||||
Decode *length* bytes from a UTF-32 encoded buffer string and return the
|
||||
corresponding Unicode object. *errors* (if non-*NULL*) defines the error
|
||||
handling. It defaults to "strict".
|
||||
|
||||
If *byteorder* is non-*NULL*, the decoder starts decoding using the given byte
|
||||
order::
|
||||
|
||||
*byteorder == -1: little endian
|
||||
*byteorder == 0: native order
|
||||
*byteorder == 1: big endian
|
||||
|
||||
and then switches if the first four bytes of the input data are a byte order mark
|
||||
(BOM) and the specified byte order is native order. This BOM is not copied into
|
||||
the resulting Unicode string. After completion, *\*byteorder* is set to the
|
||||
current byte order at the end of input data.
|
||||
|
||||
In a narrow build codepoints outside the BMP will be decoded as surrogate pairs.
|
||||
|
||||
If *byteorder* is *NULL*, the codec starts in native order mode.
|
||||
|
||||
Return *NULL* if an exception was raised by the codec.
|
||||
|
||||
.. versionadded:: 2.6
|
||||
|
||||
|
||||
.. cfunction:: PyObject* PyUnicode_DecodeUTF32Stateful(const char *s, Py_ssize_t size, const char *errors, int *byteorder, Py_ssize_t *consumed)
|
||||
|
||||
If *consumed* is *NULL*, behave like :cfunc:`PyUnicode_DecodeUTF32`. If
|
||||
*consumed* is not *NULL*, :cfunc:`PyUnicode_DecodeUTF32Stateful` will not treat
|
||||
trailing incomplete UTF-32 byte sequences (such as a number of bytes not divisible
|
||||
by four) as an error. Those bytes will not be decoded and the number of bytes
|
||||
that have been decoded will be stored in *consumed*.
|
||||
|
||||
.. versionadded:: 2.6
|
||||
|
||||
|
||||
.. cfunction:: PyObject* PyUnicode_EncodeUTF32(const Py_UNICODE *s, Py_ssize_t size, const char *errors, int byteorder)
|
||||
|
||||
Return a Python bytes object holding the UTF-32 encoded value of the Unicode
|
||||
data in *s*. If *byteorder* is not ``0``, output is written according to the
|
||||
following byte order::
|
||||
|
||||
byteorder == -1: little endian
|
||||
byteorder == 0: native byte order (writes a BOM mark)
|
||||
byteorder == 1: big endian
|
||||
|
||||
If byteorder is ``0``, the output string will always start with the Unicode BOM
|
||||
mark (U+FEFF). In the other two modes, no BOM mark is prepended.
|
||||
|
||||
If *Py_UNICODE_WIDE* is not defined, surrogate pairs will be output
|
||||
as a single codepoint.
|
||||
|
||||
Return *NULL* if an exception was raised by the codec.
|
||||
|
||||
.. versionadded:: 2.6
|
||||
|
||||
|
||||
.. cfunction:: PyObject* PyUnicode_AsUTF32String(PyObject *unicode)
|
||||
|
||||
Return a Python string using the UTF-32 encoding in native byte order. The
|
||||
string always starts with a BOM mark. Error handling is "strict". Return
|
||||
*NULL* if an exception was raised by the codec.
|
||||
|
||||
.. versionadded:: 2.6
|
||||
|
||||
|
||||
These are the UTF-16 codec APIs:
|
||||
|
||||
.. % --- UTF-16 Codecs ------------------------------------------------------ */
|
||||
|
|
|
@ -1045,6 +1045,12 @@ particular, the following variants typically exist:
|
|||
| shift_jisx0213 | shiftjisx0213, sjisx0213, | Japanese |
|
||||
| | s_jisx0213 | |
|
||||
+-----------------+--------------------------------+--------------------------------+
|
||||
| utf_32 | U32, utf32 | all languages |
|
||||
+-----------------+--------------------------------+--------------------------------+
|
||||
| utf_32_be | UTF-32BE | all languages |
|
||||
+-----------------+--------------------------------+--------------------------------+
|
||||
| utf_32_le | UTF-32LE | all languages |
|
||||
+-----------------+--------------------------------+--------------------------------+
|
||||
| utf_16 | U16, utf16 | all languages |
|
||||
+-----------------+--------------------------------+--------------------------------+
|
||||
| utf_16_be | UTF-16BE | all languages (BMP only) |
|
||||
|
|
|
@ -145,6 +145,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
|||
# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
|
||||
# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
|
||||
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
|
||||
# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
|
||||
# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
|
||||
# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
|
||||
# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
|
||||
|
@ -159,6 +160,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
|||
# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
|
||||
# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
|
||||
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
|
||||
# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
|
||||
# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
|
||||
# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
|
||||
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
|
||||
# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
|
||||
|
@ -170,6 +173,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
|||
# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
|
||||
# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
|
||||
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
|
||||
# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
|
||||
# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
|
||||
# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
|
||||
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
|
||||
|
@ -223,6 +227,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
|||
# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
|
||||
# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
|
||||
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
|
||||
# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
|
||||
# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
|
||||
# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
|
||||
# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
|
||||
|
@ -237,6 +242,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
|||
# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
|
||||
# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
|
||||
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
|
||||
# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
|
||||
# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
|
||||
# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
|
||||
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
|
||||
# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
|
||||
|
@ -248,6 +255,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
|||
# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
|
||||
# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
|
||||
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
|
||||
# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
|
||||
# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
|
||||
# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
|
||||
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
|
||||
|
@ -701,6 +709,80 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
|
|||
const char *errors /* error handling */
|
||||
);
|
||||
|
||||
/* --- UTF-32 Codecs ------------------------------------------------------ */
|
||||
|
||||
/* Decodes length bytes from a UTF-32 encoded buffer string and returns
|
||||
the corresponding Unicode object.
|
||||
|
||||
errors (if non-NULL) defines the error handling. It defaults
|
||||
to "strict".
|
||||
|
||||
If byteorder is non-NULL, the decoder starts decoding using the
|
||||
given byte order:
|
||||
|
||||
*byteorder == -1: little endian
|
||||
*byteorder == 0: native order
|
||||
*byteorder == 1: big endian
|
||||
|
||||
In native mode, the first four bytes of the stream are checked for a
|
||||
BOM mark. If found, the BOM mark is analysed, the byte order
|
||||
adjusted and the BOM skipped. In the other modes, no BOM mark
|
||||
interpretation is done. After completion, *byteorder is set to the
|
||||
current byte order at the end of input data.
|
||||
|
||||
If byteorder is NULL, the codec starts in native order mode.
|
||||
|
||||
*/
|
||||
|
||||
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
|
||||
const char *string, /* UTF-32 encoded string */
|
||||
Py_ssize_t length, /* size of string */
|
||||
const char *errors, /* error handling */
|
||||
int *byteorder /* pointer to byteorder to use
|
||||
0=native;-1=LE,1=BE; updated on
|
||||
exit */
|
||||
);
|
||||
|
||||
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
|
||||
const char *string, /* UTF-32 encoded string */
|
||||
Py_ssize_t length, /* size of string */
|
||||
const char *errors, /* error handling */
|
||||
int *byteorder, /* pointer to byteorder to use
|
||||
0=native;-1=LE,1=BE; updated on
|
||||
exit */
|
||||
Py_ssize_t *consumed /* bytes consumed */
|
||||
);
|
||||
|
||||
/* Returns a Python string using the UTF-32 encoding in native byte
|
||||
order. The string always starts with a BOM mark. */
|
||||
|
||||
PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
|
||||
PyObject *unicode /* Unicode object */
|
||||
);
|
||||
|
||||
/* Returns a Python string object holding the UTF-32 encoded value of
|
||||
the Unicode data.
|
||||
|
||||
If byteorder is not 0, output is written according to the following
|
||||
byte order:
|
||||
|
||||
byteorder == -1: little endian
|
||||
byteorder == 0: native byte order (writes a BOM mark)
|
||||
byteorder == 1: big endian
|
||||
|
||||
If byteorder is 0, the output string will always start with the
|
||||
Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
|
||||
prepended.
|
||||
|
||||
*/
|
||||
|
||||
PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
|
||||
const Py_UNICODE *data, /* Unicode char buffer */
|
||||
Py_ssize_t length, /* number of Py_UNICODE chars to encode */
|
||||
const char *errors, /* error handling */
|
||||
int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
|
||||
);
|
||||
|
||||
/* --- UTF-16 Codecs ------------------------------------------------------ */
|
||||
|
||||
/* Decodes length bytes from a UTF-16 encoded buffer string and returns
|
||||
|
|
|
@ -490,6 +490,16 @@
|
|||
'unicodelittleunmarked' : 'utf_16_le',
|
||||
'utf_16le' : 'utf_16_le',
|
||||
|
||||
# utf_32 codec
|
||||
'u32' : 'utf_32',
|
||||
'utf32' : 'utf_32',
|
||||
|
||||
# utf_32_be codec
|
||||
'utf_32be' : 'utf_32_be',
|
||||
|
||||
# utf_32_le codec
|
||||
'utf_32le' : 'utf_32_le',
|
||||
|
||||
# utf_7 codec
|
||||
'u7' : 'utf_7',
|
||||
'utf7' : 'utf_7',
|
||||
|
|
|
@ -0,0 +1,144 @@
|
|||
"""
|
||||
Python 'utf-32' Codec
|
||||
"""
|
||||
import codecs, sys
|
||||
|
||||
### Codec APIs
|
||||
|
||||
encode = codecs.utf_32_encode
|
||||
|
||||
def decode(input, errors='strict'):
|
||||
return codecs.utf_32_decode(input, errors, True)
|
||||
|
||||
class IncrementalEncoder(codecs.IncrementalEncoder):
|
||||
def __init__(self, errors='strict'):
|
||||
codecs.IncrementalEncoder.__init__(self, errors)
|
||||
self.encoder = None
|
||||
|
||||
def encode(self, input, final=False):
|
||||
if self.encoder is None:
|
||||
result = codecs.utf_32_encode(input, self.errors)[0]
|
||||
if sys.byteorder == 'little':
|
||||
self.encoder = codecs.utf_32_le_encode
|
||||
else:
|
||||
self.encoder = codecs.utf_32_be_encode
|
||||
return result
|
||||
return self.encoder(input, self.errors)[0]
|
||||
|
||||
def reset(self):
|
||||
codecs.IncrementalEncoder.reset(self)
|
||||
self.encoder = None
|
||||
|
||||
def getstate(self):
|
||||
# state info we return to the caller:
|
||||
# 0: stream is in natural order for this platform
|
||||
# 2: endianness hasn't been determined yet
|
||||
# (we're never writing in unnatural order)
|
||||
return (2 if self.encoder is None else 0)
|
||||
|
||||
def setstate(self, state):
|
||||
if state:
|
||||
self.encoder = None
|
||||
else:
|
||||
if sys.byteorder == 'little':
|
||||
self.encoder = codecs.utf_32_le_encode
|
||||
else:
|
||||
self.encoder = codecs.utf_32_be_encode
|
||||
|
||||
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
|
||||
def __init__(self, errors='strict'):
|
||||
codecs.BufferedIncrementalDecoder.__init__(self, errors)
|
||||
self.decoder = None
|
||||
|
||||
def _buffer_decode(self, input, errors, final):
|
||||
if self.decoder is None:
|
||||
(output, consumed, byteorder) = \
|
||||
codecs.utf_32_ex_decode(input, errors, 0, final)
|
||||
if byteorder == -1:
|
||||
self.decoder = codecs.utf_32_le_decode
|
||||
elif byteorder == 1:
|
||||
self.decoder = codecs.utf_32_be_decode
|
||||
elif consumed >= 4:
|
||||
raise UnicodeError("UTF-32 stream does not start with BOM")
|
||||
return (output, consumed)
|
||||
return self.decoder(input, self.errors, final)
|
||||
|
||||
def reset(self):
|
||||
codecs.BufferedIncrementalDecoder.reset(self)
|
||||
self.decoder = None
|
||||
|
||||
def getstate(self):
|
||||
# additonal state info from the base class must be None here,
|
||||
# as it isn't passed along to the caller
|
||||
state = codecs.BufferedIncrementalDecoder.getstate(self)[0]
|
||||
# additional state info we pass to the caller:
|
||||
# 0: stream is in natural order for this platform
|
||||
# 1: stream is in unnatural order
|
||||
# 2: endianness hasn't been determined yet
|
||||
if self.decoder is None:
|
||||
return (state, 2)
|
||||
addstate = int((sys.byteorder == "big") !=
|
||||
(self.decoder is codecs.utf_32_be_decode))
|
||||
return (state, addstate)
|
||||
|
||||
def setstate(self, state):
|
||||
# state[1] will be ignored by BufferedIncrementalDecoder.setstate()
|
||||
codecs.BufferedIncrementalDecoder.setstate(self, state)
|
||||
state = state[1]
|
||||
if state == 0:
|
||||
self.decoder = (codecs.utf_32_be_decode
|
||||
if sys.byteorder == "big"
|
||||
else codecs.utf_32_le_decode)
|
||||
elif state == 1:
|
||||
self.decoder = (codecs.utf_32_le_decode
|
||||
if sys.byteorder == "big"
|
||||
else codecs.utf_32_be_decode)
|
||||
else:
|
||||
self.decoder = None
|
||||
|
||||
class StreamWriter(codecs.StreamWriter):
|
||||
def __init__(self, stream, errors='strict'):
|
||||
self.bom_written = False
|
||||
codecs.StreamWriter.__init__(self, stream, errors)
|
||||
|
||||
def encode(self, input, errors='strict'):
|
||||
self.bom_written = True
|
||||
result = codecs.utf_32_encode(input, errors)
|
||||
if sys.byteorder == 'little':
|
||||
self.encode = codecs.utf_32_le_encode
|
||||
else:
|
||||
self.encode = codecs.utf_32_be_encode
|
||||
return result
|
||||
|
||||
class StreamReader(codecs.StreamReader):
|
||||
|
||||
def reset(self):
|
||||
codecs.StreamReader.reset(self)
|
||||
try:
|
||||
del self.decode
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
def decode(self, input, errors='strict'):
|
||||
(object, consumed, byteorder) = \
|
||||
codecs.utf_32_ex_decode(input, errors, 0, False)
|
||||
if byteorder == -1:
|
||||
self.decode = codecs.utf_32_le_decode
|
||||
elif byteorder == 1:
|
||||
self.decode = codecs.utf_32_be_decode
|
||||
elif consumed>=4:
|
||||
raise UnicodeError,"UTF-32 stream does not start with BOM"
|
||||
return (object, consumed)
|
||||
|
||||
### encodings module API
|
||||
|
||||
def getregentry():
|
||||
return codecs.CodecInfo(
|
||||
name='utf-32',
|
||||
encode=encode,
|
||||
decode=decode,
|
||||
incrementalencoder=IncrementalEncoder,
|
||||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
)
|
|
@ -0,0 +1,37 @@
|
|||
"""
|
||||
Python 'utf-32-be' Codec
|
||||
"""
|
||||
import codecs
|
||||
|
||||
### Codec APIs
|
||||
|
||||
encode = codecs.utf_32_be_encode
|
||||
|
||||
def decode(input, errors='strict'):
|
||||
return codecs.utf_32_be_decode(input, errors, True)
|
||||
|
||||
class IncrementalEncoder(codecs.IncrementalEncoder):
|
||||
def encode(self, input, final=False):
|
||||
return codecs.utf_32_be_encode(input, self.errors)[0]
|
||||
|
||||
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
|
||||
_buffer_decode = codecs.utf_32_be_decode
|
||||
|
||||
class StreamWriter(codecs.StreamWriter):
|
||||
encode = codecs.utf_32_be_encode
|
||||
|
||||
class StreamReader(codecs.StreamReader):
|
||||
decode = codecs.utf_32_be_decode
|
||||
|
||||
### encodings module API
|
||||
|
||||
def getregentry():
|
||||
return codecs.CodecInfo(
|
||||
name='utf-32-be',
|
||||
encode=encode,
|
||||
decode=decode,
|
||||
incrementalencoder=IncrementalEncoder,
|
||||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
)
|
|
@ -0,0 +1,37 @@
|
|||
"""
|
||||
Python 'utf-32-le' Codec
|
||||
"""
|
||||
import codecs
|
||||
|
||||
### Codec APIs
|
||||
|
||||
encode = codecs.utf_32_le_encode
|
||||
|
||||
def decode(input, errors='strict'):
|
||||
return codecs.utf_32_le_decode(input, errors, True)
|
||||
|
||||
class IncrementalEncoder(codecs.IncrementalEncoder):
|
||||
def encode(self, input, final=False):
|
||||
return codecs.utf_32_le_encode(input, self.errors)[0]
|
||||
|
||||
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
|
||||
_buffer_decode = codecs.utf_32_le_decode
|
||||
|
||||
class StreamWriter(codecs.StreamWriter):
|
||||
encode = codecs.utf_32_le_encode
|
||||
|
||||
class StreamReader(codecs.StreamReader):
|
||||
decode = codecs.utf_32_le_decode
|
||||
|
||||
### encodings module API
|
||||
|
||||
def getregentry():
|
||||
return codecs.CodecInfo(
|
||||
name='utf-32-le',
|
||||
encode=encode,
|
||||
decode=decode,
|
||||
incrementalencoder=IncrementalEncoder,
|
||||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
)
|
|
@ -285,7 +285,8 @@ def handler2(exc):
|
|||
|
||||
def test_longstrings(self):
|
||||
# test long strings to check for memory overflow problems
|
||||
errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"]
|
||||
errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
|
||||
"backslashreplace"]
|
||||
# register the handlers under different names,
|
||||
# to prevent the codec from recognizing the name
|
||||
for err in errors:
|
||||
|
@ -293,7 +294,8 @@ def test_longstrings(self):
|
|||
l = 1000
|
||||
errors += [ "test." + err for err in errors ]
|
||||
for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]:
|
||||
for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"):
|
||||
for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
|
||||
"utf-8", "utf-7", "utf-16", "utf-32"):
|
||||
for err in errors:
|
||||
try:
|
||||
uni.encode(enc, err)
|
||||
|
|
|
@ -244,6 +244,137 @@ def test_bug1098990_b(self):
|
|||
self.assertEqual(reader.readline(), s5)
|
||||
self.assertEqual(reader.readline(), u"")
|
||||
|
||||
class UTF32Test(ReadTest):
|
||||
encoding = "utf-32"
|
||||
|
||||
spamle = ('\xff\xfe\x00\x00'
|
||||
's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
|
||||
's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
|
||||
spambe = ('\x00\x00\xfe\xff'
|
||||
'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
|
||||
'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')
|
||||
|
||||
def test_only_one_bom(self):
|
||||
_,_,reader,writer = codecs.lookup(self.encoding)
|
||||
# encode some stream
|
||||
s = StringIO.StringIO()
|
||||
f = writer(s)
|
||||
f.write(u"spam")
|
||||
f.write(u"spam")
|
||||
d = s.getvalue()
|
||||
# check whether there is exactly one BOM in it
|
||||
self.assert_(d == self.spamle or d == self.spambe)
|
||||
# try to read it back
|
||||
s = StringIO.StringIO(d)
|
||||
f = reader(s)
|
||||
self.assertEquals(f.read(), u"spamspam")
|
||||
|
||||
def test_badbom(self):
|
||||
s = StringIO.StringIO(4*"\xff")
|
||||
f = codecs.getreader(self.encoding)(s)
|
||||
self.assertRaises(UnicodeError, f.read)
|
||||
|
||||
s = StringIO.StringIO(8*"\xff")
|
||||
f = codecs.getreader(self.encoding)(s)
|
||||
self.assertRaises(UnicodeError, f.read)
|
||||
|
||||
def test_partial(self):
|
||||
self.check_partial(
|
||||
u"\x00\xff\u0100\uffff",
|
||||
[
|
||||
u"", # first byte of BOM read
|
||||
u"", # second byte of BOM read
|
||||
u"", # third byte of BOM read
|
||||
u"", # fourth byte of BOM read => byteorder known
|
||||
u"",
|
||||
u"",
|
||||
u"",
|
||||
u"\x00",
|
||||
u"\x00",
|
||||
u"\x00",
|
||||
u"\x00",
|
||||
u"\x00\xff",
|
||||
u"\x00\xff",
|
||||
u"\x00\xff",
|
||||
u"\x00\xff",
|
||||
u"\x00\xff\u0100",
|
||||
u"\x00\xff\u0100",
|
||||
u"\x00\xff\u0100",
|
||||
u"\x00\xff\u0100",
|
||||
u"\x00\xff\u0100\uffff",
|
||||
]
|
||||
)
|
||||
|
||||
def test_errors(self):
|
||||
self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
|
||||
"\xff", "strict", True)
|
||||
|
||||
class UTF32LETest(ReadTest):
|
||||
encoding = "utf-32-le"
|
||||
|
||||
def test_partial(self):
|
||||
self.check_partial(
|
||||
u"\x00\xff\u0100\uffff",
|
||||
[
|
||||
u"",
|
||||
u"",
|
||||
u"",
|
||||
u"\x00",
|
||||
u"\x00",
|
||||
u"\x00",
|
||||
u"\x00",
|
||||
u"\x00\xff",
|
||||
u"\x00\xff",
|
||||
u"\x00\xff",
|
||||
u"\x00\xff",
|
||||
u"\x00\xff\u0100",
|
||||
u"\x00\xff\u0100",
|
||||
u"\x00\xff\u0100",
|
||||
u"\x00\xff\u0100",
|
||||
u"\x00\xff\u0100\uffff",
|
||||
]
|
||||
)
|
||||
|
||||
def test_simple(self):
|
||||
self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")
|
||||
|
||||
def test_errors(self):
|
||||
self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
|
||||
"\xff", "strict", True)
|
||||
|
||||
class UTF32BETest(ReadTest):
|
||||
encoding = "utf-32-be"
|
||||
|
||||
def test_partial(self):
|
||||
self.check_partial(
|
||||
u"\x00\xff\u0100\uffff",
|
||||
[
|
||||
u"",
|
||||
u"",
|
||||
u"",
|
||||
u"\x00",
|
||||
u"\x00",
|
||||
u"\x00",
|
||||
u"\x00",
|
||||
u"\x00\xff",
|
||||
u"\x00\xff",
|
||||
u"\x00\xff",
|
||||
u"\x00\xff",
|
||||
u"\x00\xff\u0100",
|
||||
u"\x00\xff\u0100",
|
||||
u"\x00\xff\u0100",
|
||||
u"\x00\xff\u0100",
|
||||
u"\x00\xff\u0100\uffff",
|
||||
]
|
||||
)
|
||||
|
||||
def test_simple(self):
|
||||
self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")
|
||||
|
||||
def test_errors(self):
|
||||
self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
|
||||
"\xff", "strict", True)
|
||||
|
||||
class UTF16Test(ReadTest):
|
||||
encoding = "utf-16"
|
||||
|
||||
|
@ -1278,6 +1409,9 @@ def test_streamreaderwriter(self):
|
|||
|
||||
def test_main():
|
||||
test_support.run_unittest(
|
||||
UTF32Test,
|
||||
UTF32LETest,
|
||||
UTF32BETest,
|
||||
UTF16Test,
|
||||
UTF16LETest,
|
||||
UTF16BETest,
|
||||
|
|
|
@ -243,6 +243,8 @@ Library
|
|||
- GB18030 codec now can encode additional two-byte characters that
|
||||
are missing in GBK.
|
||||
|
||||
- Add new codecs for UTF-32, UTF-32-LE and UTF-32-BE.
|
||||
|
||||
- Bug #1704793: Return UTF-16 pair if unicodedata.lookup cannot
|
||||
represent the result in a single character.
|
||||
|
||||
|
|
|
@ -391,6 +391,126 @@ utf_16_ex_decode(PyObject *self,
|
|||
return tuple;
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
utf_32_decode(PyObject *self,
|
||||
PyObject *args)
|
||||
{
|
||||
const char *data;
|
||||
Py_ssize_t size;
|
||||
const char *errors = NULL;
|
||||
int byteorder = 0;
|
||||
int final = 0;
|
||||
Py_ssize_t consumed;
|
||||
PyObject *decoded;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "t#|zi:utf_32_decode",
|
||||
&data, &size, &errors, &final))
|
||||
return NULL;
|
||||
if (size < 0) {
|
||||
PyErr_SetString(PyExc_ValueError, "negative argument");
|
||||
return 0;
|
||||
}
|
||||
consumed = size; /* This is overwritten unless final is true. */
|
||||
decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder,
|
||||
final ? NULL : &consumed);
|
||||
if (decoded == NULL)
|
||||
return NULL;
|
||||
return codec_tuple(decoded, consumed);
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
utf_32_le_decode(PyObject *self,
|
||||
PyObject *args)
|
||||
{
|
||||
const char *data;
|
||||
Py_ssize_t size;
|
||||
const char *errors = NULL;
|
||||
int byteorder = -1;
|
||||
int final = 0;
|
||||
Py_ssize_t consumed;
|
||||
PyObject *decoded = NULL;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "t#|zi:utf_32_le_decode",
|
||||
&data, &size, &errors, &final))
|
||||
return NULL;
|
||||
|
||||
if (size < 0) {
|
||||
PyErr_SetString(PyExc_ValueError, "negative argument");
|
||||
return 0;
|
||||
}
|
||||
consumed = size; /* This is overwritten unless final is true. */
|
||||
decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors,
|
||||
&byteorder, final ? NULL : &consumed);
|
||||
if (decoded == NULL)
|
||||
return NULL;
|
||||
return codec_tuple(decoded, consumed);
|
||||
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
utf_32_be_decode(PyObject *self,
|
||||
PyObject *args)
|
||||
{
|
||||
const char *data;
|
||||
Py_ssize_t size;
|
||||
const char *errors = NULL;
|
||||
int byteorder = 1;
|
||||
int final = 0;
|
||||
Py_ssize_t consumed;
|
||||
PyObject *decoded = NULL;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "t#|zi:utf_32_be_decode",
|
||||
&data, &size, &errors, &final))
|
||||
return NULL;
|
||||
if (size < 0) {
|
||||
PyErr_SetString(PyExc_ValueError, "negative argument");
|
||||
return 0;
|
||||
}
|
||||
consumed = size; /* This is overwritten unless final is true. */
|
||||
decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors,
|
||||
&byteorder, final ? NULL : &consumed);
|
||||
if (decoded == NULL)
|
||||
return NULL;
|
||||
return codec_tuple(decoded, consumed);
|
||||
}
|
||||
|
||||
/* This non-standard version also provides access to the byteorder
|
||||
parameter of the builtin UTF-32 codec.
|
||||
|
||||
It returns a tuple (unicode, bytesread, byteorder) with byteorder
|
||||
being the value in effect at the end of data.
|
||||
|
||||
*/
|
||||
|
||||
static PyObject *
|
||||
utf_32_ex_decode(PyObject *self,
|
||||
PyObject *args)
|
||||
{
|
||||
const char *data;
|
||||
Py_ssize_t size;
|
||||
const char *errors = NULL;
|
||||
int byteorder = 0;
|
||||
PyObject *unicode, *tuple;
|
||||
int final = 0;
|
||||
Py_ssize_t consumed;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "t#|zii:utf_32_ex_decode",
|
||||
&data, &size, &errors, &byteorder, &final))
|
||||
return NULL;
|
||||
if (size < 0) {
|
||||
PyErr_SetString(PyExc_ValueError, "negative argument");
|
||||
return 0;
|
||||
}
|
||||
consumed = size; /* This is overwritten unless final is true. */
|
||||
unicode = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder,
|
||||
final ? NULL : &consumed);
|
||||
if (unicode == NULL)
|
||||
return NULL;
|
||||
tuple = Py_BuildValue("Oni", unicode, consumed, byteorder);
|
||||
Py_DECREF(unicode);
|
||||
return tuple;
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
unicode_escape_decode(PyObject *self,
|
||||
PyObject *args)
|
||||
|
@ -683,6 +803,83 @@ utf_16_be_encode(PyObject *self,
|
|||
return v;
|
||||
}
|
||||
|
||||
/* This version provides access to the byteorder parameter of the
|
||||
builtin UTF-32 codecs as optional third argument. It defaults to 0
|
||||
which means: use the native byte order and prepend the data with a
|
||||
BOM mark.
|
||||
|
||||
*/
|
||||
|
||||
static PyObject *
|
||||
utf_32_encode(PyObject *self,
|
||||
PyObject *args)
|
||||
{
|
||||
PyObject *str, *v;
|
||||
const char *errors = NULL;
|
||||
int byteorder = 0;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "O|zi:utf_32_encode",
|
||||
&str, &errors, &byteorder))
|
||||
return NULL;
|
||||
|
||||
str = PyUnicode_FromObject(str);
|
||||
if (str == NULL)
|
||||
return NULL;
|
||||
v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
|
||||
PyUnicode_GET_SIZE(str),
|
||||
errors,
|
||||
byteorder),
|
||||
PyUnicode_GET_SIZE(str));
|
||||
Py_DECREF(str);
|
||||
return v;
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
utf_32_le_encode(PyObject *self,
|
||||
PyObject *args)
|
||||
{
|
||||
PyObject *str, *v;
|
||||
const char *errors = NULL;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "O|z:utf_32_le_encode",
|
||||
&str, &errors))
|
||||
return NULL;
|
||||
|
||||
str = PyUnicode_FromObject(str);
|
||||
if (str == NULL)
|
||||
return NULL;
|
||||
v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
|
||||
PyUnicode_GET_SIZE(str),
|
||||
errors,
|
||||
-1),
|
||||
PyUnicode_GET_SIZE(str));
|
||||
Py_DECREF(str);
|
||||
return v;
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
utf_32_be_encode(PyObject *self,
|
||||
PyObject *args)
|
||||
{
|
||||
PyObject *str, *v;
|
||||
const char *errors = NULL;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "O|z:utf_32_be_encode",
|
||||
&str, &errors))
|
||||
return NULL;
|
||||
|
||||
str = PyUnicode_FromObject(str);
|
||||
if (str == NULL)
|
||||
return NULL;
|
||||
v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
|
||||
PyUnicode_GET_SIZE(str),
|
||||
errors,
|
||||
+1),
|
||||
PyUnicode_GET_SIZE(str));
|
||||
Py_DECREF(str);
|
||||
return v;
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
unicode_escape_encode(PyObject *self,
|
||||
PyObject *args)
|
||||
|
@ -901,6 +1098,13 @@ static PyMethodDef _codecs_functions[] = {
|
|||
{"utf_16_le_decode", utf_16_le_decode, METH_VARARGS},
|
||||
{"utf_16_be_decode", utf_16_be_decode, METH_VARARGS},
|
||||
{"utf_16_ex_decode", utf_16_ex_decode, METH_VARARGS},
|
||||
{"utf_32_encode", utf_32_encode, METH_VARARGS},
|
||||
{"utf_32_le_encode", utf_32_le_encode, METH_VARARGS},
|
||||
{"utf_32_be_encode", utf_32_be_encode, METH_VARARGS},
|
||||
{"utf_32_decode", utf_32_decode, METH_VARARGS},
|
||||
{"utf_32_le_decode", utf_32_le_decode, METH_VARARGS},
|
||||
{"utf_32_be_decode", utf_32_be_decode, METH_VARARGS},
|
||||
{"utf_32_ex_decode", utf_32_ex_decode, METH_VARARGS},
|
||||
{"unicode_escape_encode", unicode_escape_encode, METH_VARARGS},
|
||||
{"unicode_escape_decode", unicode_escape_decode, METH_VARARGS},
|
||||
{"unicode_internal_encode", unicode_internal_encode, METH_VARARGS},
|
||||
|
|
|
@ -1504,6 +1504,272 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
|
|||
NULL);
|
||||
}
|
||||
|
||||
/* --- UTF-32 Codec ------------------------------------------------------- */
|
||||
|
||||
PyObject *
|
||||
PyUnicode_DecodeUTF32(const char *s,
|
||||
Py_ssize_t size,
|
||||
const char *errors,
|
||||
int *byteorder)
|
||||
{
|
||||
return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
|
||||
}
|
||||
|
||||
PyObject *
|
||||
PyUnicode_DecodeUTF32Stateful(const char *s,
|
||||
Py_ssize_t size,
|
||||
const char *errors,
|
||||
int *byteorder,
|
||||
Py_ssize_t *consumed)
|
||||
{
|
||||
const char *starts = s;
|
||||
Py_ssize_t startinpos;
|
||||
Py_ssize_t endinpos;
|
||||
Py_ssize_t outpos;
|
||||
PyUnicodeObject *unicode;
|
||||
Py_UNICODE *p;
|
||||
#ifndef Py_UNICODE_WIDE
|
||||
int i, pairs;
|
||||
#else
|
||||
const int pairs = 0;
|
||||
#endif
|
||||
const unsigned char *q, *e;
|
||||
int bo = 0; /* assume native ordering by default */
|
||||
const char *errmsg = "";
|
||||
/* On narrow builds we split characters outside the BMP into two
|
||||
codepoints => count how much extra space we need. */
|
||||
#ifndef Py_UNICODE_WIDE
|
||||
for (i = pairs = 0; i < size/4; i++)
|
||||
if (((Py_UCS4 *)s)[i] >= 0x10000)
|
||||
pairs++;
|
||||
#endif
|
||||
/* Offsets from q for retrieving bytes in the right order. */
|
||||
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
|
||||
int iorder[] = {0, 1, 2, 3};
|
||||
#else
|
||||
int iorder[] = {3, 2, 1, 0};
|
||||
#endif
|
||||
PyObject *errorHandler = NULL;
|
||||
PyObject *exc = NULL;
|
||||
|
||||
/* This might be one to much, because of a BOM */
|
||||
unicode = _PyUnicode_New((size+3)/4+pairs);
|
||||
if (!unicode)
|
||||
return NULL;
|
||||
if (size == 0)
|
||||
return (PyObject *)unicode;
|
||||
|
||||
/* Unpack UTF-32 encoded data */
|
||||
p = unicode->str;
|
||||
q = (unsigned char *)s;
|
||||
e = q + size;
|
||||
|
||||
if (byteorder)
|
||||
bo = *byteorder;
|
||||
|
||||
/* Check for BOM marks (U+FEFF) in the input and adjust current
|
||||
byte order setting accordingly. In native mode, the leading BOM
|
||||
mark is skipped, in all other modes, it is copied to the output
|
||||
stream as-is (giving a ZWNBSP character). */
|
||||
if (bo == 0) {
|
||||
if (size >= 4) {
|
||||
const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
|
||||
(q[iorder[1]] << 8) | q[iorder[0]];
|
||||
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
|
||||
if (bom == 0x0000FEFF) {
|
||||
q += 4;
|
||||
bo = -1;
|
||||
}
|
||||
else if (bom == 0xFFFE0000) {
|
||||
q += 4;
|
||||
bo = 1;
|
||||
}
|
||||
#else
|
||||
if (bom == 0x0000FEFF) {
|
||||
q += 4;
|
||||
bo = 1;
|
||||
}
|
||||
else if (bom == 0xFFFE0000) {
|
||||
q += 4;
|
||||
bo = -1;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
if (bo == -1) {
|
||||
/* force LE */
|
||||
iorder[0] = 0;
|
||||
iorder[1] = 1;
|
||||
iorder[2] = 2;
|
||||
iorder[3] = 3;
|
||||
}
|
||||
else if (bo == 1) {
|
||||
/* force BE */
|
||||
iorder[0] = 3;
|
||||
iorder[1] = 2;
|
||||
iorder[2] = 1;
|
||||
iorder[3] = 0;
|
||||
}
|
||||
|
||||
while (q < e) {
|
||||
Py_UCS4 ch;
|
||||
/* remaining bytes at the end? (size should be divisible by 4) */
|
||||
if (e-q<4) {
|
||||
if (consumed)
|
||||
break;
|
||||
errmsg = "truncated data";
|
||||
startinpos = ((const char *)q)-starts;
|
||||
endinpos = ((const char *)e)-starts;
|
||||
goto utf32Error;
|
||||
/* The remaining input chars are ignored if the callback
|
||||
chooses to skip the input */
|
||||
}
|
||||
ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
|
||||
(q[iorder[1]] << 8) | q[iorder[0]];
|
||||
|
||||
if (ch >= 0x110000)
|
||||
{
|
||||
errmsg = "codepoint not in range(0x110000)";
|
||||
startinpos = ((const char *)q)-starts;
|
||||
endinpos = startinpos+4;
|
||||
goto utf32Error;
|
||||
}
|
||||
#ifndef Py_UNICODE_WIDE
|
||||
if (ch >= 0x10000)
|
||||
{
|
||||
*p++ = 0xD800 | ((ch-0x10000) >> 10);
|
||||
*p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
*p++ = ch;
|
||||
q += 4;
|
||||
continue;
|
||||
utf32Error:
|
||||
outpos = p-PyUnicode_AS_UNICODE(unicode);
|
||||
if (unicode_decode_call_errorhandler(
|
||||
errors, &errorHandler,
|
||||
"utf32", errmsg,
|
||||
starts, size, &startinpos, &endinpos, &exc, &s,
|
||||
(PyObject **)&unicode, &outpos, &p))
|
||||
goto onError;
|
||||
}
|
||||
|
||||
if (byteorder)
|
||||
*byteorder = bo;
|
||||
|
||||
if (consumed)
|
||||
*consumed = (const char *)q-starts;
|
||||
|
||||
/* Adjust length */
|
||||
if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
|
||||
goto onError;
|
||||
|
||||
Py_XDECREF(errorHandler);
|
||||
Py_XDECREF(exc);
|
||||
return (PyObject *)unicode;
|
||||
|
||||
onError:
|
||||
Py_DECREF(unicode);
|
||||
Py_XDECREF(errorHandler);
|
||||
Py_XDECREF(exc);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
PyObject *
|
||||
PyUnicode_EncodeUTF32(const Py_UNICODE *s,
|
||||
Py_ssize_t size,
|
||||
const char *errors,
|
||||
int byteorder)
|
||||
{
|
||||
PyObject *v;
|
||||
unsigned char *p;
|
||||
#ifndef Py_UNICODE_WIDE
|
||||
int i, pairs;
|
||||
#else
|
||||
const int pairs = 0;
|
||||
#endif
|
||||
/* Offsets from p for storing byte pairs in the right order. */
|
||||
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
|
||||
int iorder[] = {0, 1, 2, 3};
|
||||
#else
|
||||
int iorder[] = {3, 2, 1, 0};
|
||||
#endif
|
||||
|
||||
#define STORECHAR(CH) \
|
||||
do { \
|
||||
p[iorder[3]] = ((CH) >> 24) & 0xff; \
|
||||
p[iorder[2]] = ((CH) >> 16) & 0xff; \
|
||||
p[iorder[1]] = ((CH) >> 8) & 0xff; \
|
||||
p[iorder[0]] = (CH) & 0xff; \
|
||||
p += 4; \
|
||||
} while(0)
|
||||
|
||||
/* In narrow builds we can output surrogate pairs as one codepoint,
|
||||
so we need less space. */
|
||||
#ifndef Py_UNICODE_WIDE
|
||||
for (i = pairs = 0; i < size-1; i++)
|
||||
if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
|
||||
0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
|
||||
pairs++;
|
||||
#endif
|
||||
v = PyString_FromStringAndSize(NULL,
|
||||
4 * (size - pairs + (byteorder == 0)));
|
||||
if (v == NULL)
|
||||
return NULL;
|
||||
|
||||
p = (unsigned char *)PyString_AS_STRING(v);
|
||||
if (byteorder == 0)
|
||||
STORECHAR(0xFEFF);
|
||||
if (size == 0)
|
||||
return v;
|
||||
|
||||
if (byteorder == -1) {
|
||||
/* force LE */
|
||||
iorder[0] = 0;
|
||||
iorder[1] = 1;
|
||||
iorder[2] = 2;
|
||||
iorder[3] = 3;
|
||||
}
|
||||
else if (byteorder == 1) {
|
||||
/* force BE */
|
||||
iorder[0] = 3;
|
||||
iorder[1] = 2;
|
||||
iorder[2] = 1;
|
||||
iorder[3] = 0;
|
||||
}
|
||||
|
||||
while (size-- > 0) {
|
||||
Py_UCS4 ch = *s++;
|
||||
#ifndef Py_UNICODE_WIDE
|
||||
if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
|
||||
Py_UCS4 ch2 = *s;
|
||||
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
|
||||
ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
|
||||
s++;
|
||||
size--;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
STORECHAR(ch);
|
||||
}
|
||||
return v;
|
||||
#undef STORECHAR
|
||||
}
|
||||
|
||||
/* Encode a Unicode object as UTF-32 in native byte order (the encoder
   prepends a BOM for byteorder == 0); error handling is strict.
   Raises TypeError via PyErr_BadArgument() for non-Unicode input. */
PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    /* errors == NULL => "strict"; byteorder 0 => native order + BOM. */
    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
                                 PyUnicode_GET_SIZE(unicode),
                                 NULL,
                                 0);
}
|
||||
|
||||
/* --- UTF-16 Codec ------------------------------------------------------- */
|
||||
|
||||
PyObject *
|
||||
|
|
Loading…
Reference in New Issue