mirror of https://github.com/python/cpython.git
Backport r57105 and r57145 from the py3k branch: UTF-32 codecs.
This commit is contained in:
parent
437e6a3b15
commit
6e39080649
|
@ -1301,6 +1301,79 @@ These are the UTF-8 codec APIs:
|
||||||
object. Error handling is "strict". Return *NULL* if an exception was raised
|
object. Error handling is "strict". Return *NULL* if an exception was raised
|
||||||
by the codec.
|
by the codec.
|
||||||
|
|
||||||
|
These are the UTF-32 codec APIs:
|
||||||
|
|
||||||
|
.. % --- UTF-32 Codecs ------------------------------------------------------ */
|
||||||
|
|
||||||
|
|
||||||
|
.. cfunction:: PyObject* PyUnicode_DecodeUTF32(const char *s, Py_ssize_t size, const char *errors, int *byteorder)
|
||||||
|
|
||||||
|
Decode *size* bytes from a UTF-32 encoded buffer string and return the
|
||||||
|
corresponding Unicode object. *errors* (if non-*NULL*) defines the error
|
||||||
|
handling. It defaults to "strict".
|
||||||
|
|
||||||
|
If *byteorder* is non-*NULL*, the decoder starts decoding using the given byte
|
||||||
|
order::
|
||||||
|
|
||||||
|
*byteorder == -1: little endian
|
||||||
|
*byteorder == 0: native order
|
||||||
|
*byteorder == 1: big endian
|
||||||
|
|
||||||
|
and then switches if the first four bytes of the input data are a byte order mark
|
||||||
|
(BOM) and the specified byte order is native order. This BOM is not copied into
|
||||||
|
the resulting Unicode string. After completion, *\*byteorder* is set to the
|
||||||
|
current byte order at the end of input data.
|
||||||
|
|
||||||
|
In a narrow build codepoints outside the BMP will be decoded as surrogate pairs.
|
||||||
|
|
||||||
|
If *byteorder* is *NULL*, the codec starts in native order mode.
|
||||||
|
|
||||||
|
Return *NULL* if an exception was raised by the codec.
|
||||||
|
|
||||||
|
.. versionadded:: 2.6
|
||||||
|
|
||||||
|
|
||||||
|
.. cfunction:: PyObject* PyUnicode_DecodeUTF32Stateful(const char *s, Py_ssize_t size, const char *errors, int *byteorder, Py_ssize_t *consumed)
|
||||||
|
|
||||||
|
If *consumed* is *NULL*, behave like :cfunc:`PyUnicode_DecodeUTF32`. If
|
||||||
|
*consumed* is not *NULL*, :cfunc:`PyUnicode_DecodeUTF32Stateful` will not treat
|
||||||
|
trailing incomplete UTF-32 byte sequences (such as a number of bytes not divisible
|
||||||
|
by four) as an error. Those bytes will not be decoded and the number of bytes
|
||||||
|
that have been decoded will be stored in *consumed*.
|
||||||
|
|
||||||
|
.. versionadded:: 2.6
|
||||||
|
|
||||||
|
|
||||||
|
.. cfunction:: PyObject* PyUnicode_EncodeUTF32(const Py_UNICODE *s, Py_ssize_t size, const char *errors, int byteorder)
|
||||||
|
|
||||||
|
Return a Python string object holding the UTF-32 encoded value of the Unicode
|
||||||
|
data in *s*. If *byteorder* is not ``0``, output is written according to the
|
||||||
|
following byte order::
|
||||||
|
|
||||||
|
byteorder == -1: little endian
|
||||||
|
byteorder == 0: native byte order (writes a BOM mark)
|
||||||
|
byteorder == 1: big endian
|
||||||
|
|
||||||
|
If byteorder is ``0``, the output string will always start with the Unicode BOM
|
||||||
|
mark (U+FEFF). In the other two modes, no BOM mark is prepended.
|
||||||
|
|
||||||
|
If *Py_UNICODE_WIDE* is not defined, surrogate pairs will be output
|
||||||
|
as a single codepoint.
|
||||||
|
|
||||||
|
Return *NULL* if an exception was raised by the codec.
|
||||||
|
|
||||||
|
.. versionadded:: 2.6
|
||||||
|
|
||||||
|
|
||||||
|
.. cfunction:: PyObject* PyUnicode_AsUTF32String(PyObject *unicode)
|
||||||
|
|
||||||
|
Return a Python string using the UTF-32 encoding in native byte order. The
|
||||||
|
string always starts with a BOM mark. Error handling is "strict". Return
|
||||||
|
*NULL* if an exception was raised by the codec.
|
||||||
|
|
||||||
|
.. versionadded:: 2.6
|
||||||
|
|
||||||
|
|
||||||
These are the UTF-16 codec APIs:
|
These are the UTF-16 codec APIs:
|
||||||
|
|
||||||
.. % --- UTF-16 Codecs ------------------------------------------------------ */
|
.. % --- UTF-16 Codecs ------------------------------------------------------ */
|
||||||
|
|
|
@ -1045,6 +1045,12 @@ particular, the following variants typically exist:
|
||||||
| shift_jisx0213 | shiftjisx0213, sjisx0213, | Japanese |
|
| shift_jisx0213 | shiftjisx0213, sjisx0213, | Japanese |
|
||||||
| | s_jisx0213 | |
|
| | s_jisx0213 | |
|
||||||
+-----------------+--------------------------------+--------------------------------+
|
+-----------------+--------------------------------+--------------------------------+
|
||||||
|
| utf_32 | U32, utf32 | all languages |
|
||||||
|
+-----------------+--------------------------------+--------------------------------+
|
||||||
|
| utf_32_be | UTF-32BE | all languages |
|
||||||
|
+-----------------+--------------------------------+--------------------------------+
|
||||||
|
| utf_32_le | UTF-32LE | all languages |
|
||||||
|
+-----------------+--------------------------------+--------------------------------+
|
||||||
| utf_16 | U16, utf16 | all languages |
|
| utf_16 | U16, utf16 | all languages |
|
||||||
+-----------------+--------------------------------+--------------------------------+
|
+-----------------+--------------------------------+--------------------------------+
|
||||||
| utf_16_be | UTF-16BE | all languages (BMP only) |
|
| utf_16_be | UTF-16BE | all languages (BMP only) |
|
||||||
|
|
|
@ -145,6 +145,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
||||||
# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
|
# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
|
||||||
# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
|
# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
|
||||||
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
|
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
|
||||||
|
# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
|
||||||
# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
|
# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
|
||||||
# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
|
# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
|
||||||
# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
|
# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
|
||||||
|
@ -159,6 +160,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
||||||
# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
|
# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
|
||||||
# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
|
# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
|
||||||
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
|
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
|
||||||
|
# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
|
||||||
|
# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
|
||||||
# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
|
# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
|
||||||
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
|
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
|
||||||
# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
|
# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
|
||||||
|
@ -170,6 +173,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
||||||
# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
|
# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
|
||||||
# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
|
# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
|
||||||
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
|
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
|
||||||
|
# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
|
||||||
# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
|
# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
|
||||||
# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
|
# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
|
||||||
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
|
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
|
||||||
|
@ -223,6 +227,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
||||||
# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
|
# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
|
||||||
# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
|
# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
|
||||||
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
|
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
|
||||||
|
# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
|
||||||
# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
|
# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
|
||||||
# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
|
# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
|
||||||
# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
|
# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
|
||||||
|
@ -237,6 +242,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
||||||
# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
|
# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
|
||||||
# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
|
# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
|
||||||
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
|
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
|
||||||
|
# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
|
||||||
|
# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
|
||||||
# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
|
# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
|
||||||
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
|
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
|
||||||
# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
|
# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
|
||||||
|
@ -248,6 +255,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
||||||
# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
|
# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
|
||||||
# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
|
# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
|
||||||
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
|
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
|
||||||
|
# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
|
||||||
# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
|
# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
|
||||||
# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
|
# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
|
||||||
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
|
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
|
||||||
|
@ -701,6 +709,80 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
|
||||||
const char *errors /* error handling */
|
const char *errors /* error handling */
|
||||||
);
|
);
|
||||||
|
|
||||||
|
/* --- UTF-32 Codecs ------------------------------------------------------ */
|
||||||
|
|
||||||
|
/* Decodes length bytes from a UTF-32 encoded buffer string and returns
|
||||||
|
the corresponding Unicode object.
|
||||||
|
|
||||||
|
errors (if non-NULL) defines the error handling. It defaults
|
||||||
|
to "strict".
|
||||||
|
|
||||||
|
If byteorder is non-NULL, the decoder starts decoding using the
|
||||||
|
given byte order:
|
||||||
|
|
||||||
|
*byteorder == -1: little endian
|
||||||
|
*byteorder == 0: native order
|
||||||
|
*byteorder == 1: big endian
|
||||||
|
|
||||||
|
In native mode, the first four bytes of the stream are checked for a
|
||||||
|
BOM mark. If found, the BOM mark is analysed, the byte order
|
||||||
|
adjusted and the BOM skipped. In the other modes, no BOM mark
|
||||||
|
interpretation is done. After completion, *byteorder is set to the
|
||||||
|
current byte order at the end of input data.
|
||||||
|
|
||||||
|
If byteorder is NULL, the codec starts in native order mode.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
|
||||||
|
const char *string, /* UTF-32 encoded string */
|
||||||
|
Py_ssize_t length, /* size of string */
|
||||||
|
const char *errors, /* error handling */
|
||||||
|
int *byteorder /* pointer to byteorder to use
|
||||||
|
0=native;-1=LE,1=BE; updated on
|
||||||
|
exit */
|
||||||
|
);
|
||||||
|
|
||||||
|
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
|
||||||
|
const char *string, /* UTF-32 encoded string */
|
||||||
|
Py_ssize_t length, /* size of string */
|
||||||
|
const char *errors, /* error handling */
|
||||||
|
int *byteorder, /* pointer to byteorder to use
|
||||||
|
0=native;-1=LE,1=BE; updated on
|
||||||
|
exit */
|
||||||
|
Py_ssize_t *consumed /* bytes consumed */
|
||||||
|
);
|
||||||
|
|
||||||
|
/* Returns a Python string using the UTF-32 encoding in native byte
|
||||||
|
order. The string always starts with a BOM mark. */
|
||||||
|
|
||||||
|
PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
|
||||||
|
PyObject *unicode /* Unicode object */
|
||||||
|
);
|
||||||
|
|
||||||
|
/* Returns a Python string object holding the UTF-32 encoded value of
|
||||||
|
the Unicode data.
|
||||||
|
|
||||||
|
If byteorder is not 0, output is written according to the following
|
||||||
|
byte order:
|
||||||
|
|
||||||
|
byteorder == -1: little endian
|
||||||
|
byteorder == 0: native byte order (writes a BOM mark)
|
||||||
|
byteorder == 1: big endian
|
||||||
|
|
||||||
|
If byteorder is 0, the output string will always start with the
|
||||||
|
Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
|
||||||
|
prepended.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
|
||||||
|
const Py_UNICODE *data, /* Unicode char buffer */
|
||||||
|
Py_ssize_t length, /* number of Py_UNICODE chars to encode */
|
||||||
|
const char *errors, /* error handling */
|
||||||
|
int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
|
||||||
|
);
|
||||||
|
|
||||||
/* --- UTF-16 Codecs ------------------------------------------------------ */
|
/* --- UTF-16 Codecs ------------------------------------------------------ */
|
||||||
|
|
||||||
/* Decodes length bytes from a UTF-16 encoded buffer string and returns
|
/* Decodes length bytes from a UTF-16 encoded buffer string and returns
|
||||||
|
|
|
@ -490,6 +490,16 @@
|
||||||
'unicodelittleunmarked' : 'utf_16_le',
|
'unicodelittleunmarked' : 'utf_16_le',
|
||||||
'utf_16le' : 'utf_16_le',
|
'utf_16le' : 'utf_16_le',
|
||||||
|
|
||||||
|
# utf_32 codec
|
||||||
|
'u32' : 'utf_32',
|
||||||
|
'utf32' : 'utf_32',
|
||||||
|
|
||||||
|
# utf_32_be codec
|
||||||
|
'utf_32be' : 'utf_32_be',
|
||||||
|
|
||||||
|
# utf_32_le codec
|
||||||
|
'utf_32le' : 'utf_32_le',
|
||||||
|
|
||||||
# utf_7 codec
|
# utf_7 codec
|
||||||
'u7' : 'utf_7',
|
'u7' : 'utf_7',
|
||||||
'utf7' : 'utf_7',
|
'utf7' : 'utf_7',
|
||||||
|
|
|
@ -0,0 +1,144 @@
|
||||||
|
"""
|
||||||
|
Python 'utf-32' Codec
|
||||||
|
"""
|
||||||
|
import codecs, sys
|
||||||
|
|
||||||
|
### Codec APIs
|
||||||
|
|
||||||
|
encode = codecs.utf_32_encode
|
||||||
|
|
||||||
|
def decode(input, errors='strict'):
|
||||||
|
return codecs.utf_32_decode(input, errors, True)
|
||||||
|
|
||||||
|
class IncrementalEncoder(codecs.IncrementalEncoder):
|
||||||
|
def __init__(self, errors='strict'):
|
||||||
|
codecs.IncrementalEncoder.__init__(self, errors)
|
||||||
|
self.encoder = None
|
||||||
|
|
||||||
|
def encode(self, input, final=False):
|
||||||
|
if self.encoder is None:
|
||||||
|
result = codecs.utf_32_encode(input, self.errors)[0]
|
||||||
|
if sys.byteorder == 'little':
|
||||||
|
self.encoder = codecs.utf_32_le_encode
|
||||||
|
else:
|
||||||
|
self.encoder = codecs.utf_32_be_encode
|
||||||
|
return result
|
||||||
|
return self.encoder(input, self.errors)[0]
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
codecs.IncrementalEncoder.reset(self)
|
||||||
|
self.encoder = None
|
||||||
|
|
||||||
|
def getstate(self):
|
||||||
|
# state info we return to the caller:
|
||||||
|
# 0: stream is in natural order for this platform
|
||||||
|
# 2: endianness hasn't been determined yet
|
||||||
|
# (we're never writing in unnatural order)
|
||||||
|
return (2 if self.encoder is None else 0)
|
||||||
|
|
||||||
|
def setstate(self, state):
|
||||||
|
if state:
|
||||||
|
self.encoder = None
|
||||||
|
else:
|
||||||
|
if sys.byteorder == 'little':
|
||||||
|
self.encoder = codecs.utf_32_le_encode
|
||||||
|
else:
|
||||||
|
self.encoder = codecs.utf_32_be_encode
|
||||||
|
|
||||||
|
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
|
||||||
|
def __init__(self, errors='strict'):
|
||||||
|
codecs.BufferedIncrementalDecoder.__init__(self, errors)
|
||||||
|
self.decoder = None
|
||||||
|
|
||||||
|
def _buffer_decode(self, input, errors, final):
|
||||||
|
if self.decoder is None:
|
||||||
|
(output, consumed, byteorder) = \
|
||||||
|
codecs.utf_32_ex_decode(input, errors, 0, final)
|
||||||
|
if byteorder == -1:
|
||||||
|
self.decoder = codecs.utf_32_le_decode
|
||||||
|
elif byteorder == 1:
|
||||||
|
self.decoder = codecs.utf_32_be_decode
|
||||||
|
elif consumed >= 4:
|
||||||
|
raise UnicodeError("UTF-32 stream does not start with BOM")
|
||||||
|
return (output, consumed)
|
||||||
|
return self.decoder(input, self.errors, final)
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
codecs.BufferedIncrementalDecoder.reset(self)
|
||||||
|
self.decoder = None
|
||||||
|
|
||||||
|
def getstate(self):
|
||||||
|
# additional state info from the base class must be None here,
|
||||||
|
# as it isn't passed along to the caller
|
||||||
|
state = codecs.BufferedIncrementalDecoder.getstate(self)[0]
|
||||||
|
# additional state info we pass to the caller:
|
||||||
|
# 0: stream is in natural order for this platform
|
||||||
|
# 1: stream is in unnatural order
|
||||||
|
# 2: endianness hasn't been determined yet
|
||||||
|
if self.decoder is None:
|
||||||
|
return (state, 2)
|
||||||
|
addstate = int((sys.byteorder == "big") !=
|
||||||
|
(self.decoder is codecs.utf_32_be_decode))
|
||||||
|
return (state, addstate)
|
||||||
|
|
||||||
|
def setstate(self, state):
|
||||||
|
# state[1] will be ignored by BufferedIncrementalDecoder.setstate()
|
||||||
|
codecs.BufferedIncrementalDecoder.setstate(self, state)
|
||||||
|
state = state[1]
|
||||||
|
if state == 0:
|
||||||
|
self.decoder = (codecs.utf_32_be_decode
|
||||||
|
if sys.byteorder == "big"
|
||||||
|
else codecs.utf_32_le_decode)
|
||||||
|
elif state == 1:
|
||||||
|
self.decoder = (codecs.utf_32_le_decode
|
||||||
|
if sys.byteorder == "big"
|
||||||
|
else codecs.utf_32_be_decode)
|
||||||
|
else:
|
||||||
|
self.decoder = None
|
||||||
|
|
||||||
|
class StreamWriter(codecs.StreamWriter):
    """Stream writer for the BOM-prefixed, native-byte-order 'utf-32' codec.

    The first chunk written is encoded with ``codecs.utf_32_encode`` (which
    emits the BOM); every later chunk is encoded with the BOM-less encoder
    matching ``sys.byteorder`` so that the stream contains exactly one BOM.
    """

    def __init__(self, stream, errors='strict'):
        # Set our own state before the base-class initialiser runs.
        self.bom_written = False
        # BOM-less encoder used after the first write; None means the BOM
        # still has to be written.
        self.encoder = None
        codecs.StreamWriter.__init__(self, stream, errors)

    def reset(self):
        """Return the writer to its initial state.

        After a reset the next write starts with a BOM again.  Previously
        the writer could not be reset, because ``encode`` was permanently
        replaced on the instance by the first write.
        """
        codecs.StreamWriter.reset(self)
        self.bom_written = False
        self.encoder = None

    def encode(self, input, errors='strict'):
        """Encode *input* and return ``(encoded, length_consumed)``.

        Only the first call since construction or ``reset()`` produces a BOM.
        """
        if self.encoder is None:
            result = codecs.utf_32_encode(input, errors)
            self.bom_written = True
            # We never write in unnatural order, so pick the encoder that
            # matches the platform for all following chunks.
            if sys.byteorder == 'little':
                self.encoder = codecs.utf_32_le_encode
            else:
                self.encoder = codecs.utf_32_be_encode
            return result
        return self.encoder(input, errors)
|
||||||
|
|
||||||
|
class StreamReader(codecs.StreamReader):
    """Stream reader for the 'utf-32' codec.

    The byte order is detected from the BOM at the start of the stream.
    Once it is known, ``decode`` is replaced on the instance by the
    matching fixed-byte-order decoder.
    """

    def reset(self):
        """Reset the reader and forget any detected byte order."""
        codecs.StreamReader.reset(self)
        try:
            # Drop the instance-level override installed by decode() so the
            # BOM-detecting method below is used again.
            del self.decode
        except AttributeError:
            # No byte order had been detected yet; nothing to undo.
            pass

    def decode(self, input, errors='strict'):
        """Decode *input*, detecting the byte order from the BOM.

        Returns ``(decoded, consumed)``.  Raises UnicodeError when at least
        four bytes were consumed without a BOM having been found.
        """
        (output, consumed, byteorder) = \
            codecs.utf_32_ex_decode(input, errors, 0, False)
        if byteorder == -1:
            self.decode = codecs.utf_32_le_decode
        elif byteorder == 1:
            self.decode = codecs.utf_32_be_decode
        elif consumed >= 4:
            raise UnicodeError("UTF-32 stream does not start with BOM")
        return (output, consumed)
|
||||||
|
|
||||||
|
### encodings module API
|
||||||
|
|
||||||
|
def getregentry():
|
||||||
|
return codecs.CodecInfo(
|
||||||
|
name='utf-32',
|
||||||
|
encode=encode,
|
||||||
|
decode=decode,
|
||||||
|
incrementalencoder=IncrementalEncoder,
|
||||||
|
incrementaldecoder=IncrementalDecoder,
|
||||||
|
streamreader=StreamReader,
|
||||||
|
streamwriter=StreamWriter,
|
||||||
|
)
|
|
@ -0,0 +1,37 @@
|
||||||
|
"""
|
||||||
|
Python 'utf-32-be' Codec
|
||||||
|
"""
|
||||||
|
import codecs
|
||||||
|
|
||||||
|
### Codec APIs
|
||||||
|
|
||||||
|
encode = codecs.utf_32_be_encode
|
||||||
|
|
||||||
|
def decode(input, errors='strict'):
|
||||||
|
return codecs.utf_32_be_decode(input, errors, True)
|
||||||
|
|
||||||
|
class IncrementalEncoder(codecs.IncrementalEncoder):
|
||||||
|
def encode(self, input, final=False):
|
||||||
|
return codecs.utf_32_be_encode(input, self.errors)[0]
|
||||||
|
|
||||||
|
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
|
||||||
|
_buffer_decode = codecs.utf_32_be_decode
|
||||||
|
|
||||||
|
class StreamWriter(codecs.StreamWriter):
|
||||||
|
encode = codecs.utf_32_be_encode
|
||||||
|
|
||||||
|
class StreamReader(codecs.StreamReader):
|
||||||
|
decode = codecs.utf_32_be_decode
|
||||||
|
|
||||||
|
### encodings module API
|
||||||
|
|
||||||
|
def getregentry():
|
||||||
|
return codecs.CodecInfo(
|
||||||
|
name='utf-32-be',
|
||||||
|
encode=encode,
|
||||||
|
decode=decode,
|
||||||
|
incrementalencoder=IncrementalEncoder,
|
||||||
|
incrementaldecoder=IncrementalDecoder,
|
||||||
|
streamreader=StreamReader,
|
||||||
|
streamwriter=StreamWriter,
|
||||||
|
)
|
|
@ -0,0 +1,37 @@
|
||||||
|
"""
|
||||||
|
Python 'utf-32-le' Codec
|
||||||
|
"""
|
||||||
|
import codecs
|
||||||
|
|
||||||
|
### Codec APIs
|
||||||
|
|
||||||
|
encode = codecs.utf_32_le_encode
|
||||||
|
|
||||||
|
def decode(input, errors='strict'):
|
||||||
|
return codecs.utf_32_le_decode(input, errors, True)
|
||||||
|
|
||||||
|
class IncrementalEncoder(codecs.IncrementalEncoder):
|
||||||
|
def encode(self, input, final=False):
|
||||||
|
return codecs.utf_32_le_encode(input, self.errors)[0]
|
||||||
|
|
||||||
|
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
|
||||||
|
_buffer_decode = codecs.utf_32_le_decode
|
||||||
|
|
||||||
|
class StreamWriter(codecs.StreamWriter):
|
||||||
|
encode = codecs.utf_32_le_encode
|
||||||
|
|
||||||
|
class StreamReader(codecs.StreamReader):
|
||||||
|
decode = codecs.utf_32_le_decode
|
||||||
|
|
||||||
|
### encodings module API
|
||||||
|
|
||||||
|
def getregentry():
|
||||||
|
return codecs.CodecInfo(
|
||||||
|
name='utf-32-le',
|
||||||
|
encode=encode,
|
||||||
|
decode=decode,
|
||||||
|
incrementalencoder=IncrementalEncoder,
|
||||||
|
incrementaldecoder=IncrementalDecoder,
|
||||||
|
streamreader=StreamReader,
|
||||||
|
streamwriter=StreamWriter,
|
||||||
|
)
|
|
@ -285,7 +285,8 @@ def handler2(exc):
|
||||||
|
|
||||||
def test_longstrings(self):
|
def test_longstrings(self):
|
||||||
# test long strings to check for memory overflow problems
|
# test long strings to check for memory overflow problems
|
||||||
errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"]
|
errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
|
||||||
|
"backslashreplace"]
|
||||||
# register the handlers under different names,
|
# register the handlers under different names,
|
||||||
# to prevent the codec from recognizing the name
|
# to prevent the codec from recognizing the name
|
||||||
for err in errors:
|
for err in errors:
|
||||||
|
@ -293,7 +294,8 @@ def test_longstrings(self):
|
||||||
l = 1000
|
l = 1000
|
||||||
errors += [ "test." + err for err in errors ]
|
errors += [ "test." + err for err in errors ]
|
||||||
for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]:
|
for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]:
|
||||||
for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"):
|
for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
|
||||||
|
"utf-8", "utf-7", "utf-16", "utf-32"):
|
||||||
for err in errors:
|
for err in errors:
|
||||||
try:
|
try:
|
||||||
uni.encode(enc, err)
|
uni.encode(enc, err)
|
||||||
|
|
|
@ -244,6 +244,137 @@ def test_bug1098990_b(self):
|
||||||
self.assertEqual(reader.readline(), s5)
|
self.assertEqual(reader.readline(), s5)
|
||||||
self.assertEqual(reader.readline(), u"")
|
self.assertEqual(reader.readline(), u"")
|
||||||
|
|
||||||
|
class UTF32Test(ReadTest):
|
||||||
|
encoding = "utf-32"
|
||||||
|
|
||||||
|
spamle = ('\xff\xfe\x00\x00'
|
||||||
|
's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
|
||||||
|
's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
|
||||||
|
spambe = ('\x00\x00\xfe\xff'
|
||||||
|
'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
|
||||||
|
'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')
|
||||||
|
|
||||||
|
def test_only_one_bom(self):
|
||||||
|
_,_,reader,writer = codecs.lookup(self.encoding)
|
||||||
|
# encode some stream
|
||||||
|
s = StringIO.StringIO()
|
||||||
|
f = writer(s)
|
||||||
|
f.write(u"spam")
|
||||||
|
f.write(u"spam")
|
||||||
|
d = s.getvalue()
|
||||||
|
# check whether there is exactly one BOM in it
|
||||||
|
self.assert_(d == self.spamle or d == self.spambe)
|
||||||
|
# try to read it back
|
||||||
|
s = StringIO.StringIO(d)
|
||||||
|
f = reader(s)
|
||||||
|
self.assertEquals(f.read(), u"spamspam")
|
||||||
|
|
||||||
|
def test_badbom(self):
|
||||||
|
s = StringIO.StringIO(4*"\xff")
|
||||||
|
f = codecs.getreader(self.encoding)(s)
|
||||||
|
self.assertRaises(UnicodeError, f.read)
|
||||||
|
|
||||||
|
s = StringIO.StringIO(8*"\xff")
|
||||||
|
f = codecs.getreader(self.encoding)(s)
|
||||||
|
self.assertRaises(UnicodeError, f.read)
|
||||||
|
|
||||||
|
def test_partial(self):
|
||||||
|
self.check_partial(
|
||||||
|
u"\x00\xff\u0100\uffff",
|
||||||
|
[
|
||||||
|
u"", # first byte of BOM read
|
||||||
|
u"", # second byte of BOM read
|
||||||
|
u"", # third byte of BOM read
|
||||||
|
u"", # fourth byte of BOM read => byteorder known
|
||||||
|
u"",
|
||||||
|
u"",
|
||||||
|
u"",
|
||||||
|
u"\x00",
|
||||||
|
u"\x00",
|
||||||
|
u"\x00",
|
||||||
|
u"\x00",
|
||||||
|
u"\x00\xff",
|
||||||
|
u"\x00\xff",
|
||||||
|
u"\x00\xff",
|
||||||
|
u"\x00\xff",
|
||||||
|
u"\x00\xff\u0100",
|
||||||
|
u"\x00\xff\u0100",
|
||||||
|
u"\x00\xff\u0100",
|
||||||
|
u"\x00\xff\u0100",
|
||||||
|
u"\x00\xff\u0100\uffff",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_errors(self):
|
||||||
|
self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
|
||||||
|
"\xff", "strict", True)
|
||||||
|
|
||||||
|
class UTF32LETest(ReadTest):
|
||||||
|
encoding = "utf-32-le"
|
||||||
|
|
||||||
|
def test_partial(self):
|
||||||
|
self.check_partial(
|
||||||
|
u"\x00\xff\u0100\uffff",
|
||||||
|
[
|
||||||
|
u"",
|
||||||
|
u"",
|
||||||
|
u"",
|
||||||
|
u"\x00",
|
||||||
|
u"\x00",
|
||||||
|
u"\x00",
|
||||||
|
u"\x00",
|
||||||
|
u"\x00\xff",
|
||||||
|
u"\x00\xff",
|
||||||
|
u"\x00\xff",
|
||||||
|
u"\x00\xff",
|
||||||
|
u"\x00\xff\u0100",
|
||||||
|
u"\x00\xff\u0100",
|
||||||
|
u"\x00\xff\u0100",
|
||||||
|
u"\x00\xff\u0100",
|
||||||
|
u"\x00\xff\u0100\uffff",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_simple(self):
|
||||||
|
self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")
|
||||||
|
|
||||||
|
def test_errors(self):
|
||||||
|
self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
|
||||||
|
"\xff", "strict", True)
|
||||||
|
|
||||||
|
class UTF32BETest(ReadTest):
|
||||||
|
encoding = "utf-32-be"
|
||||||
|
|
||||||
|
def test_partial(self):
|
||||||
|
self.check_partial(
|
||||||
|
u"\x00\xff\u0100\uffff",
|
||||||
|
[
|
||||||
|
u"",
|
||||||
|
u"",
|
||||||
|
u"",
|
||||||
|
u"\x00",
|
||||||
|
u"\x00",
|
||||||
|
u"\x00",
|
||||||
|
u"\x00",
|
||||||
|
u"\x00\xff",
|
||||||
|
u"\x00\xff",
|
||||||
|
u"\x00\xff",
|
||||||
|
u"\x00\xff",
|
||||||
|
u"\x00\xff\u0100",
|
||||||
|
u"\x00\xff\u0100",
|
||||||
|
u"\x00\xff\u0100",
|
||||||
|
u"\x00\xff\u0100",
|
||||||
|
u"\x00\xff\u0100\uffff",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_simple(self):
|
||||||
|
self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")
|
||||||
|
|
||||||
|
def test_errors(self):
|
||||||
|
self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
|
||||||
|
"\xff", "strict", True)
|
||||||
|
|
||||||
class UTF16Test(ReadTest):
|
class UTF16Test(ReadTest):
|
||||||
encoding = "utf-16"
|
encoding = "utf-16"
|
||||||
|
|
||||||
|
@ -1278,6 +1409,9 @@ def test_streamreaderwriter(self):
|
||||||
|
|
||||||
def test_main():
|
def test_main():
|
||||||
test_support.run_unittest(
|
test_support.run_unittest(
|
||||||
|
UTF32Test,
|
||||||
|
UTF32LETest,
|
||||||
|
UTF32BETest,
|
||||||
UTF16Test,
|
UTF16Test,
|
||||||
UTF16LETest,
|
UTF16LETest,
|
||||||
UTF16BETest,
|
UTF16BETest,
|
||||||
|
|
|
@ -243,6 +243,8 @@ Library
|
||||||
- GB18030 codec now can encode additional two-byte characters that
|
- GB18030 codec now can encode additional two-byte characters that
|
||||||
are missing in GBK.
|
are missing in GBK.
|
||||||
|
|
||||||
|
- Add new codecs for UTF-32, UTF-32-LE and UTF-32-BE.
|
||||||
|
|
||||||
- Bug #1704793: Return UTF-16 pair if unicodedata.lookup cannot
|
- Bug #1704793: Return UTF-16 pair if unicodedata.lookup cannot
|
||||||
represent the result in a single character.
|
represent the result in a single character.
|
||||||
|
|
||||||
|
|
|
@ -391,6 +391,126 @@ utf_16_ex_decode(PyObject *self,
|
||||||
return tuple;
|
return tuple;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* utf_32_decode(data[, errors[, final]])

   Decodes a UTF-32 byte string starting in native byte order
   (byteorder == 0).  When final is false, the number of bytes actually
   decoded is reported back through `consumed`; when final is true, no
   `consumed` pointer is handed to the decoder and the full input size is
   reported.

   Returns the result of codec_tuple(decoded, consumed) (helper defined
   elsewhere in this file), or NULL with an exception set. */
static PyObject *
utf_32_decode(PyObject *self,
              PyObject *args)
{
    const char *data;
    Py_ssize_t size;
    const char *errors = NULL;
    int byteorder = 0;
    int final = 0;
    Py_ssize_t consumed;
    PyObject *decoded = NULL;

    if (!PyArg_ParseTuple(args, "t#|zi:utf_32_decode",
                          &data, &size, &errors, &final))
        return NULL;
    if (size < 0) {
        PyErr_SetString(PyExc_ValueError, "negative argument");
        return NULL;
    }
    consumed = size; /* This is overwritten unless final is true. */
    decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder,
                                            final ? NULL : &consumed);
    if (decoded == NULL)
        return NULL;
    return codec_tuple(decoded, consumed);
}
|
||||||
|
|
||||||
|
/* utf_32_le_decode(data[, errors[, final]])

   Decodes a UTF-32 byte string with the byte order fixed to little
   endian (byteorder == -1).  When final is false, the number of bytes
   actually decoded is reported back through `consumed`; when final is
   true, the full input size is reported.

   Returns the result of codec_tuple(decoded, consumed) (helper defined
   elsewhere in this file), or NULL with an exception set. */
static PyObject *
utf_32_le_decode(PyObject *self,
                 PyObject *args)
{
    const char *data;
    Py_ssize_t size;
    const char *errors = NULL;
    int byteorder = -1;
    int final = 0;
    Py_ssize_t consumed;
    PyObject *decoded = NULL;

    if (!PyArg_ParseTuple(args, "t#|zi:utf_32_le_decode",
                          &data, &size, &errors, &final))
        return NULL;
    if (size < 0) {
        PyErr_SetString(PyExc_ValueError, "negative argument");
        return NULL;
    }
    consumed = size; /* This is overwritten unless final is true. */
    decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors,
                                            &byteorder,
                                            final ? NULL : &consumed);
    if (decoded == NULL)
        return NULL;
    return codec_tuple(decoded, consumed);
}
|
||||||
|
|
||||||
|
/* Codec entry point for UTF-32 decoding with the byte order forced to
   big endian (1).

   Arguments (Python level): data[, errors[, final]].
   Returns whatever codec_tuple() builds from the decoded object and the
   number of bytes used, or NULL on failure. */
static PyObject *
utf_32_be_decode(PyObject *self,
                 PyObject *args)
{
    const char *buf;
    Py_ssize_t buflen;
    const char *errors = NULL;
    int order = 1;              /* big endian */
    int is_final = 0;
    Py_ssize_t used;
    Py_ssize_t *used_out;
    PyObject *result = NULL;

    if (!PyArg_ParseTuple(args, "t#|zi:utf_32_be_decode",
                          &buf, &buflen, &errors, &is_final))
        return NULL;

    if (buflen < 0) {
        PyErr_SetString(PyExc_ValueError, "negative argument");
        return NULL;
    }

    /* Only a non-final call lets the decoder overwrite the count. */
    used = buflen;
    used_out = is_final ? NULL : &used;

    result = PyUnicode_DecodeUTF32Stateful(buf, buflen, errors,
                                           &order, used_out);
    if (result == NULL)
        return NULL;

    return codec_tuple(result, used);
}
|
||||||
|
|
||||||
|
/* This non-standard version also provides access to the byteorder
|
||||||
|
parameter of the builtin UTF-32 codec.
|
||||||
|
|
||||||
|
It returns a tuple (unicode, bytesread, byteorder) with byteorder
|
||||||
|
being the value in effect at the end of data.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
utf_32_ex_decode(PyObject *self,
|
||||||
|
PyObject *args)
|
||||||
|
{
|
||||||
|
const char *data;
|
||||||
|
Py_ssize_t size;
|
||||||
|
const char *errors = NULL;
|
||||||
|
int byteorder = 0;
|
||||||
|
PyObject *unicode, *tuple;
|
||||||
|
int final = 0;
|
||||||
|
Py_ssize_t consumed;
|
||||||
|
|
||||||
|
if (!PyArg_ParseTuple(args, "t#|zii:utf_32_ex_decode",
|
||||||
|
&data, &size, &errors, &byteorder, &final))
|
||||||
|
return NULL;
|
||||||
|
if (size < 0) {
|
||||||
|
PyErr_SetString(PyExc_ValueError, "negative argument");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
consumed = size; /* This is overwritten unless final is true. */
|
||||||
|
unicode = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder,
|
||||||
|
final ? NULL : &consumed);
|
||||||
|
if (unicode == NULL)
|
||||||
|
return NULL;
|
||||||
|
tuple = Py_BuildValue("Oni", unicode, consumed, byteorder);
|
||||||
|
Py_DECREF(unicode);
|
||||||
|
return tuple;
|
||||||
|
}
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
unicode_escape_decode(PyObject *self,
|
unicode_escape_decode(PyObject *self,
|
||||||
PyObject *args)
|
PyObject *args)
|
||||||
|
@ -683,6 +803,83 @@ utf_16_be_encode(PyObject *self,
|
||||||
return v;
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* This version provides access to the byteorder parameter of the
|
||||||
|
builtin UTF-32 codecs as optional third argument. It defaults to 0
|
||||||
|
which means: use the native byte order and prepend the data with a
|
||||||
|
BOM mark.
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
utf_32_encode(PyObject *self,
|
||||||
|
PyObject *args)
|
||||||
|
{
|
||||||
|
PyObject *str, *v;
|
||||||
|
const char *errors = NULL;
|
||||||
|
int byteorder = 0;
|
||||||
|
|
||||||
|
if (!PyArg_ParseTuple(args, "O|zi:utf_32_encode",
|
||||||
|
&str, &errors, &byteorder))
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
str = PyUnicode_FromObject(str);
|
||||||
|
if (str == NULL)
|
||||||
|
return NULL;
|
||||||
|
v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
|
||||||
|
PyUnicode_GET_SIZE(str),
|
||||||
|
errors,
|
||||||
|
byteorder),
|
||||||
|
PyUnicode_GET_SIZE(str));
|
||||||
|
Py_DECREF(str);
|
||||||
|
return v;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* UTF-32 encoder entry point with the byte order fixed to little endian
   (-1 is passed to PyUnicode_EncodeUTF32).

   Arguments (Python level): obj[, errors]. */
static PyObject *
utf_32_le_encode(PyObject *self,
                 PyObject *args)
{
    PyObject *obj;
    PyObject *ustr;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;
    Py_ssize_t length;

    if (!PyArg_ParseTuple(args, "O|z:utf_32_le_encode",
                          &obj, &errors))
        return NULL;

    ustr = PyUnicode_FromObject(obj);
    if (ustr == NULL)
        return NULL;

    length = PyUnicode_GET_SIZE(ustr);
    encoded = PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(ustr),
                                    length, errors, -1);
    /* The encoder result goes to codec_tuple() unchecked, as before;
       presumably codec_tuple() passes a NULL through -- confirm against
       its definition. */
    result = codec_tuple(encoded, length);
    Py_DECREF(ustr);
    return result;
}
|
||||||
|
|
||||||
|
/* UTF-32 encoder entry point with the byte order fixed to big endian
   (+1 is passed to PyUnicode_EncodeUTF32).

   Arguments (Python level): obj[, errors]. */
static PyObject *
utf_32_be_encode(PyObject *self,
                 PyObject *args)
{
    PyObject *obj;
    PyObject *ustr;
    PyObject *encoded;
    PyObject *result;
    const char *errors = NULL;
    Py_ssize_t length;

    if (!PyArg_ParseTuple(args, "O|z:utf_32_be_encode",
                          &obj, &errors))
        return NULL;

    ustr = PyUnicode_FromObject(obj);
    if (ustr == NULL)
        return NULL;

    length = PyUnicode_GET_SIZE(ustr);
    encoded = PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(ustr),
                                    length, errors, +1);
    /* The encoder result goes to codec_tuple() unchecked, as before;
       presumably codec_tuple() passes a NULL through -- confirm against
       its definition. */
    result = codec_tuple(encoded, length);
    Py_DECREF(ustr);
    return result;
}
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
unicode_escape_encode(PyObject *self,
|
unicode_escape_encode(PyObject *self,
|
||||||
PyObject *args)
|
PyObject *args)
|
||||||
|
@ -901,6 +1098,13 @@ static PyMethodDef _codecs_functions[] = {
|
||||||
{"utf_16_le_decode", utf_16_le_decode, METH_VARARGS},
|
{"utf_16_le_decode", utf_16_le_decode, METH_VARARGS},
|
||||||
{"utf_16_be_decode", utf_16_be_decode, METH_VARARGS},
|
{"utf_16_be_decode", utf_16_be_decode, METH_VARARGS},
|
||||||
{"utf_16_ex_decode", utf_16_ex_decode, METH_VARARGS},
|
{"utf_16_ex_decode", utf_16_ex_decode, METH_VARARGS},
|
||||||
|
{"utf_32_encode", utf_32_encode, METH_VARARGS},
|
||||||
|
{"utf_32_le_encode", utf_32_le_encode, METH_VARARGS},
|
||||||
|
{"utf_32_be_encode", utf_32_be_encode, METH_VARARGS},
|
||||||
|
{"utf_32_decode", utf_32_decode, METH_VARARGS},
|
||||||
|
{"utf_32_le_decode", utf_32_le_decode, METH_VARARGS},
|
||||||
|
{"utf_32_be_decode", utf_32_be_decode, METH_VARARGS},
|
||||||
|
{"utf_32_ex_decode", utf_32_ex_decode, METH_VARARGS},
|
||||||
{"unicode_escape_encode", unicode_escape_encode, METH_VARARGS},
|
{"unicode_escape_encode", unicode_escape_encode, METH_VARARGS},
|
||||||
{"unicode_escape_decode", unicode_escape_decode, METH_VARARGS},
|
{"unicode_escape_decode", unicode_escape_decode, METH_VARARGS},
|
||||||
{"unicode_internal_encode", unicode_internal_encode, METH_VARARGS},
|
{"unicode_internal_encode", unicode_internal_encode, METH_VARARGS},
|
||||||
|
|
|
@ -1504,6 +1504,272 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
|
||||||
NULL);
|
NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* --- UTF-32 Codec ------------------------------------------------------- */
|
||||||
|
|
||||||
|
PyObject *
|
||||||
|
PyUnicode_DecodeUTF32(const char *s,
|
||||||
|
Py_ssize_t size,
|
||||||
|
const char *errors,
|
||||||
|
int *byteorder)
|
||||||
|
{
|
||||||
|
return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
PyObject *
|
||||||
|
PyUnicode_DecodeUTF32Stateful(const char *s,
|
||||||
|
Py_ssize_t size,
|
||||||
|
const char *errors,
|
||||||
|
int *byteorder,
|
||||||
|
Py_ssize_t *consumed)
|
||||||
|
{
|
||||||
|
const char *starts = s;
|
||||||
|
Py_ssize_t startinpos;
|
||||||
|
Py_ssize_t endinpos;
|
||||||
|
Py_ssize_t outpos;
|
||||||
|
PyUnicodeObject *unicode;
|
||||||
|
Py_UNICODE *p;
|
||||||
|
#ifndef Py_UNICODE_WIDE
|
||||||
|
int i, pairs;
|
||||||
|
#else
|
||||||
|
const int pairs = 0;
|
||||||
|
#endif
|
||||||
|
const unsigned char *q, *e;
|
||||||
|
int bo = 0; /* assume native ordering by default */
|
||||||
|
const char *errmsg = "";
|
||||||
|
/* On narrow builds we split characters outside the BMP into two
|
||||||
|
codepoints => count how much extra space we need. */
|
||||||
|
#ifndef Py_UNICODE_WIDE
|
||||||
|
for (i = pairs = 0; i < size/4; i++)
|
||||||
|
if (((Py_UCS4 *)s)[i] >= 0x10000)
|
||||||
|
pairs++;
|
||||||
|
#endif
|
||||||
|
/* Offsets from q for retrieving bytes in the right order. */
|
||||||
|
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
|
||||||
|
int iorder[] = {0, 1, 2, 3};
|
||||||
|
#else
|
||||||
|
int iorder[] = {3, 2, 1, 0};
|
||||||
|
#endif
|
||||||
|
PyObject *errorHandler = NULL;
|
||||||
|
PyObject *exc = NULL;
|
||||||
|
|
||||||
|
/* This might be one to much, because of a BOM */
|
||||||
|
unicode = _PyUnicode_New((size+3)/4+pairs);
|
||||||
|
if (!unicode)
|
||||||
|
return NULL;
|
||||||
|
if (size == 0)
|
||||||
|
return (PyObject *)unicode;
|
||||||
|
|
||||||
|
/* Unpack UTF-32 encoded data */
|
||||||
|
p = unicode->str;
|
||||||
|
q = (unsigned char *)s;
|
||||||
|
e = q + size;
|
||||||
|
|
||||||
|
if (byteorder)
|
||||||
|
bo = *byteorder;
|
||||||
|
|
||||||
|
/* Check for BOM marks (U+FEFF) in the input and adjust current
|
||||||
|
byte order setting accordingly. In native mode, the leading BOM
|
||||||
|
mark is skipped, in all other modes, it is copied to the output
|
||||||
|
stream as-is (giving a ZWNBSP character). */
|
||||||
|
if (bo == 0) {
|
||||||
|
if (size >= 4) {
|
||||||
|
const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
|
||||||
|
(q[iorder[1]] << 8) | q[iorder[0]];
|
||||||
|
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
|
||||||
|
if (bom == 0x0000FEFF) {
|
||||||
|
q += 4;
|
||||||
|
bo = -1;
|
||||||
|
}
|
||||||
|
else if (bom == 0xFFFE0000) {
|
||||||
|
q += 4;
|
||||||
|
bo = 1;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
if (bom == 0x0000FEFF) {
|
||||||
|
q += 4;
|
||||||
|
bo = 1;
|
||||||
|
}
|
||||||
|
else if (bom == 0xFFFE0000) {
|
||||||
|
q += 4;
|
||||||
|
bo = -1;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (bo == -1) {
|
||||||
|
/* force LE */
|
||||||
|
iorder[0] = 0;
|
||||||
|
iorder[1] = 1;
|
||||||
|
iorder[2] = 2;
|
||||||
|
iorder[3] = 3;
|
||||||
|
}
|
||||||
|
else if (bo == 1) {
|
||||||
|
/* force BE */
|
||||||
|
iorder[0] = 3;
|
||||||
|
iorder[1] = 2;
|
||||||
|
iorder[2] = 1;
|
||||||
|
iorder[3] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (q < e) {
|
||||||
|
Py_UCS4 ch;
|
||||||
|
/* remaining bytes at the end? (size should be divisible by 4) */
|
||||||
|
if (e-q<4) {
|
||||||
|
if (consumed)
|
||||||
|
break;
|
||||||
|
errmsg = "truncated data";
|
||||||
|
startinpos = ((const char *)q)-starts;
|
||||||
|
endinpos = ((const char *)e)-starts;
|
||||||
|
goto utf32Error;
|
||||||
|
/* The remaining input chars are ignored if the callback
|
||||||
|
chooses to skip the input */
|
||||||
|
}
|
||||||
|
ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
|
||||||
|
(q[iorder[1]] << 8) | q[iorder[0]];
|
||||||
|
|
||||||
|
if (ch >= 0x110000)
|
||||||
|
{
|
||||||
|
errmsg = "codepoint not in range(0x110000)";
|
||||||
|
startinpos = ((const char *)q)-starts;
|
||||||
|
endinpos = startinpos+4;
|
||||||
|
goto utf32Error;
|
||||||
|
}
|
||||||
|
#ifndef Py_UNICODE_WIDE
|
||||||
|
if (ch >= 0x10000)
|
||||||
|
{
|
||||||
|
*p++ = 0xD800 | ((ch-0x10000) >> 10);
|
||||||
|
*p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
*p++ = ch;
|
||||||
|
q += 4;
|
||||||
|
continue;
|
||||||
|
utf32Error:
|
||||||
|
outpos = p-PyUnicode_AS_UNICODE(unicode);
|
||||||
|
if (unicode_decode_call_errorhandler(
|
||||||
|
errors, &errorHandler,
|
||||||
|
"utf32", errmsg,
|
||||||
|
starts, size, &startinpos, &endinpos, &exc, &s,
|
||||||
|
(PyObject **)&unicode, &outpos, &p))
|
||||||
|
goto onError;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (byteorder)
|
||||||
|
*byteorder = bo;
|
||||||
|
|
||||||
|
if (consumed)
|
||||||
|
*consumed = (const char *)q-starts;
|
||||||
|
|
||||||
|
/* Adjust length */
|
||||||
|
if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
|
||||||
|
goto onError;
|
||||||
|
|
||||||
|
Py_XDECREF(errorHandler);
|
||||||
|
Py_XDECREF(exc);
|
||||||
|
return (PyObject *)unicode;
|
||||||
|
|
||||||
|
onError:
|
||||||
|
Py_DECREF(unicode);
|
||||||
|
Py_XDECREF(errorHandler);
|
||||||
|
Py_XDECREF(exc);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
PyObject *
|
||||||
|
PyUnicode_EncodeUTF32(const Py_UNICODE *s,
|
||||||
|
Py_ssize_t size,
|
||||||
|
const char *errors,
|
||||||
|
int byteorder)
|
||||||
|
{
|
||||||
|
PyObject *v;
|
||||||
|
unsigned char *p;
|
||||||
|
#ifndef Py_UNICODE_WIDE
|
||||||
|
int i, pairs;
|
||||||
|
#else
|
||||||
|
const int pairs = 0;
|
||||||
|
#endif
|
||||||
|
/* Offsets from p for storing byte pairs in the right order. */
|
||||||
|
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
|
||||||
|
int iorder[] = {0, 1, 2, 3};
|
||||||
|
#else
|
||||||
|
int iorder[] = {3, 2, 1, 0};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define STORECHAR(CH) \
|
||||||
|
do { \
|
||||||
|
p[iorder[3]] = ((CH) >> 24) & 0xff; \
|
||||||
|
p[iorder[2]] = ((CH) >> 16) & 0xff; \
|
||||||
|
p[iorder[1]] = ((CH) >> 8) & 0xff; \
|
||||||
|
p[iorder[0]] = (CH) & 0xff; \
|
||||||
|
p += 4; \
|
||||||
|
} while(0)
|
||||||
|
|
||||||
|
/* In narrow builds we can output surrogate pairs as one codepoint,
|
||||||
|
so we need less space. */
|
||||||
|
#ifndef Py_UNICODE_WIDE
|
||||||
|
for (i = pairs = 0; i < size-1; i++)
|
||||||
|
if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
|
||||||
|
0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
|
||||||
|
pairs++;
|
||||||
|
#endif
|
||||||
|
v = PyString_FromStringAndSize(NULL,
|
||||||
|
4 * (size - pairs + (byteorder == 0)));
|
||||||
|
if (v == NULL)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
p = (unsigned char *)PyString_AS_STRING(v);
|
||||||
|
if (byteorder == 0)
|
||||||
|
STORECHAR(0xFEFF);
|
||||||
|
if (size == 0)
|
||||||
|
return v;
|
||||||
|
|
||||||
|
if (byteorder == -1) {
|
||||||
|
/* force LE */
|
||||||
|
iorder[0] = 0;
|
||||||
|
iorder[1] = 1;
|
||||||
|
iorder[2] = 2;
|
||||||
|
iorder[3] = 3;
|
||||||
|
}
|
||||||
|
else if (byteorder == 1) {
|
||||||
|
/* force BE */
|
||||||
|
iorder[0] = 3;
|
||||||
|
iorder[1] = 2;
|
||||||
|
iorder[2] = 1;
|
||||||
|
iorder[3] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (size-- > 0) {
|
||||||
|
Py_UCS4 ch = *s++;
|
||||||
|
#ifndef Py_UNICODE_WIDE
|
||||||
|
if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
|
||||||
|
Py_UCS4 ch2 = *s;
|
||||||
|
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
|
||||||
|
ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
|
||||||
|
s++;
|
||||||
|
size--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
STORECHAR(ch);
|
||||||
|
}
|
||||||
|
return v;
|
||||||
|
#undef STORECHAR
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Encode a Unicode object as UTF-32 in native byte order (byteorder 0,
   so the output begins with a BOM), passing NULL as the errors
   argument.  An argument that fails PyUnicode_Check() is reported via
   PyErr_BadArgument() and yields NULL. */
PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        const Py_UNICODE *units = PyUnicode_AS_UNICODE(unicode);
        Py_ssize_t length = PyUnicode_GET_SIZE(unicode);

        return PyUnicode_EncodeUTF32(units, length, NULL, 0);
    }

    PyErr_BadArgument();
    return NULL;
}
|
||||||
|
|
||||||
/* --- UTF-16 Codec ------------------------------------------------------- */
|
/* --- UTF-16 Codec ------------------------------------------------------- */
|
||||||
|
|
||||||
PyObject *
|
PyObject *
|
||||||
|
|
Loading…
Reference in New Issue