From 489b56e04480b8ca3f2d1676265e67c65bae788d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lemburg?= Date: Mon, 21 May 2001 20:30:15 +0000 Subject: [PATCH] This patch changes the behaviour of the UTF-16 codec family. Only the UTF-16 codec will now interpret and remove a *leading* BOM mark. Subsequent BOM characters are no longer interpreted and removed. UTF-16-LE and -BE pass through all BOM mark characters. These changes should get the UTF-16 codec more in line with what the Unicode FAQ recommends with respect to BOM marks. --- Include/unicodeobject.h | 9 +++++---- Objects/unicodeobject.c | 44 ++++++++++++++++++++++++----------------- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 988ea1b39ec8..f91a5a0c8c7f 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -459,10 +459,11 @@ extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8( *byteorder == 0: native order *byteorder == 1: big endian - and then switches according to all BOM marks it finds in the input - data. BOM marks are not copied into the resulting Unicode string. - After completion, *byteorder is set to the current byte order at - the end of input data. + In native mode, the first two bytes of the stream are checked for a + BOM mark. If found, the BOM mark is analysed, the byte order + adjusted and the BOM skipped. In the other modes, no BOM mark + interpretation is done. After completion, *byteorder is set to the + current byte order at the end of input data. If byteorder is NULL, the codec starts in native order mode. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 475215c25f20..d55e2a72e206 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1001,31 +1001,39 @@ PyObject *PyUnicode_DecodeUTF16(const char *s, if (byteorder) bo = *byteorder; + /* Check for BOM marks (U+FEFF) in the input and adjust current + byte order setting accordingly. 
In native mode, the leading BOM + mark is skipped, in all other modes, it is copied to the output + stream as-is (giving a ZWNBSP character). */ + if (bo == 0) { +#ifdef BYTEORDER_IS_LITTLE_ENDIAN + if (*q == 0xFEFF) { + q++; + bo = -1; + } else if (*q == 0xFFFE) { + q++; + bo = 1; + } +#else + if (*q == 0xFEFF) { + q++; + bo = 1; + } else if (*q == 0xFFFE) { + q++; + bo = -1; + } +#endif + } + while (q < e) { register Py_UNICODE ch = *q++; - /* Check for BOM marks (U+FEFF) in the input and adjust - current byte order setting accordingly. Swap input - bytes if needed. (This assumes sizeof(Py_UNICODE) == 2 - !) */ + /* Swap input bytes if needed. (This assumes + sizeof(Py_UNICODE) == 2 !) */ #ifdef BYTEORDER_IS_LITTLE_ENDIAN - if (ch == 0xFEFF) { - bo = -1; - continue; - } else if (ch == 0xFFFE) { - bo = 1; - continue; - } if (bo == 1) ch = (ch >> 8) | (ch << 8); #else - if (ch == 0xFEFF) { - bo = 1; - continue; - } else if (ch == 0xFFFE) { - bo = -1; - continue; - } if (bo == -1) ch = (ch >> 8) | (ch << 8); #endif