diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 988ea1b39ec8..f91a5a0c8c7f 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -459,10 +459,11 @@ extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8( *byteorder == 0: native order *byteorder == 1: big endian - and then switches according to all BOM marks it finds in the input - data. BOM marks are not copied into the resulting Unicode string. - After completion, *byteorder is set to the current byte order at - the end of input data. + In native mode, the first two bytes of the stream are checked for a + BOM mark. If found, the BOM mark is analysed, the byte order + adjusted and the BOM skipped. In the other modes, no BOM mark + interpretation is done. After completion, *byteorder is set to the + current byte order at the end of input data. If byteorder is NULL, the codec starts in native order mode. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 475215c25f20..d55e2a72e206 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1001,31 +1001,39 @@ PyObject *PyUnicode_DecodeUTF16(const char *s, if (byteorder) bo = *byteorder; + /* Check for BOM marks (U+FEFF) in the input and adjust current + byte order setting accordingly. In native mode, the leading BOM + mark is skipped, in all other modes, it is copied to the output + stream as-is (giving a ZWNBSP character). */ + if (bo == 0) { +#ifdef BYTEORDER_IS_LITTLE_ENDIAN + if (*q == 0xFEFF) { + q++; + bo = -1; + } else if (*q == 0xFFFE) { + q++; + bo = 1; + } +#else + if (*q == 0xFEFF) { + q++; + bo = 1; + } else if (*q == 0xFFFE) { + q++; + bo = -1; + } +#endif + } + while (q < e) { register Py_UNICODE ch = *q++; - /* Check for BOM marks (U+FEFF) in the input and adjust - current byte order setting accordingly. Swap input - bytes if needed. (This assumes sizeof(Py_UNICODE) == 2 - !) */ + /* Swap input bytes if needed. (This assumes + sizeof(Py_UNICODE) == 2 !) */ #ifdef BYTEORDER_IS_LITTLE_ENDIAN - if (ch == 0xFEFF) { - bo = -1; - continue; - } else if (ch == 0xFFFE) { - bo = 1; - continue; - } if (bo == 1) ch = (ch >> 8) | (ch << 8); #else - if (ch == 0xFEFF) { - bo = 1; - continue; - } else if (ch == 0xFFFE) { - bo = -1; - continue; - } if (bo == -1) ch = (ch >> 8) | (ch << 8); #endif