mirror of https://github.com/python/cpython.git
This patch changes the behaviour of the UTF-16 codec family. Only the
UTF-16 codec will now interpret and remove a *leading* BOM mark. Subsequent BOM characters are no longer interpreted and removed. UTF-16-LE and UTF-16-BE pass through all BOM mark characters. These changes should get the UTF-16 codec more in line with what the Unicode FAQ recommends with respect to BOM marks.
This commit is contained in:
parent
f52d27e52d
commit
489b56e044
|
@ -459,10 +459,11 @@ extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
|
||||||
*byteorder == 0: native order
|
*byteorder == 0: native order
|
||||||
*byteorder == 1: big endian
|
*byteorder == 1: big endian
|
||||||
|
|
||||||
and then switches according to all BOM marks it finds in the input
|
In native mode, the first two bytes of the stream are checked for a
|
||||||
data. BOM marks are not copied into the resulting Unicode string.
|
BOM mark. If found, the BOM mark is analysed, the byte order
|
||||||
After completion, *byteorder is set to the current byte order at
|
adjusted and the BOM skipped. In the other modes, no BOM mark
|
||||||
the end of input data.
|
interpretation is done. After completion, *byteorder is set to the
|
||||||
|
current byte order at the end of input data.
|
||||||
|
|
||||||
If byteorder is NULL, the codec starts in native order mode.
|
If byteorder is NULL, the codec starts in native order mode.
|
||||||
|
|
||||||
|
|
|
@ -1001,31 +1001,39 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
|
||||||
if (byteorder)
|
if (byteorder)
|
||||||
bo = *byteorder;
|
bo = *byteorder;
|
||||||
|
|
||||||
|
/* Check for BOM marks (U+FEFF) in the input and adjust current
|
||||||
|
byte order setting accordingly. In native mode, the leading BOM
|
||||||
|
mark is skipped, in all other modes, it is copied to the output
|
||||||
|
stream as-is (giving a ZWNBSP character). */
|
||||||
|
if (bo == 0) {
|
||||||
|
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
|
||||||
|
if (*q == 0xFEFF) {
|
||||||
|
q++;
|
||||||
|
bo = -1;
|
||||||
|
} else if (*q == 0xFFFE) {
|
||||||
|
q++;
|
||||||
|
bo = 1;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
if (*q == 0xFEFF) {
|
||||||
|
q++;
|
||||||
|
bo = 1;
|
||||||
|
} else if (*q == 0xFFFE) {
|
||||||
|
q++;
|
||||||
|
bo = -1;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
while (q < e) {
|
while (q < e) {
|
||||||
register Py_UNICODE ch = *q++;
|
register Py_UNICODE ch = *q++;
|
||||||
|
|
||||||
/* Check for BOM marks (U+FEFF) in the input and adjust
|
/* Swap input bytes if needed. (This assumes
|
||||||
current byte order setting accordingly. Swap input
|
sizeof(Py_UNICODE) == 2 !) */
|
||||||
bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
|
|
||||||
!) */
|
|
||||||
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
|
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
|
||||||
if (ch == 0xFEFF) {
|
|
||||||
bo = -1;
|
|
||||||
continue;
|
|
||||||
} else if (ch == 0xFFFE) {
|
|
||||||
bo = 1;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (bo == 1)
|
if (bo == 1)
|
||||||
ch = (ch >> 8) | (ch << 8);
|
ch = (ch >> 8) | (ch << 8);
|
||||||
#else
|
#else
|
||||||
if (ch == 0xFEFF) {
|
|
||||||
bo = 1;
|
|
||||||
continue;
|
|
||||||
} else if (ch == 0xFFFE) {
|
|
||||||
bo = -1;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (bo == -1)
|
if (bo == -1)
|
||||||
ch = (ch >> 8) | (ch << 8);
|
ch = (ch >> 8) | (ch << 8);
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Reference in New Issue