mirror of https://github.com/python/cpython.git
gh-110913: Fix WindowsConsoleIO chunking of UTF-8 text (GH-111007)
This commit is contained in:
parent
b60f058708
commit
11312eae6e
|
@ -0,0 +1 @@
|
|||
WindowsConsoleIO now correctly chunks large buffers without splitting up UTF-8 sequences.
|
|
@ -134,6 +134,23 @@ char _PyIO_get_console_type(PyObject *path_or_fd) {
|
|||
return m;
|
||||
}
|
||||
|
||||
static DWORD
|
||||
_find_last_utf8_boundary(const char *buf, DWORD len)
|
||||
{
|
||||
/* This function never returns 0, returns the original len instead */
|
||||
DWORD count = 1;
|
||||
if (len == 0 || (buf[len - 1] & 0x80) == 0) {
|
||||
return len;
|
||||
}
|
||||
for (;; count++) {
|
||||
if (count > 3 || count >= len) {
|
||||
return len;
|
||||
}
|
||||
if ((buf[len - count] & 0xc0) != 0x80) {
|
||||
return len - count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*[clinic input]
|
||||
module _io
|
||||
|
@ -975,7 +992,7 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
|
|||
{
|
||||
BOOL res = TRUE;
|
||||
wchar_t *wbuf;
|
||||
DWORD len, wlen, orig_len, n = 0;
|
||||
DWORD len, wlen, n = 0;
|
||||
HANDLE handle;
|
||||
|
||||
if (self->fd == -1)
|
||||
|
@ -1007,21 +1024,8 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
|
|||
have to reduce and recalculate. */
|
||||
while (wlen > 32766 / sizeof(wchar_t)) {
|
||||
len /= 2;
|
||||
orig_len = len;
|
||||
/* Reduce the length until we hit the final byte of a UTF-8 sequence
|
||||
* (top bit is unset). Fix for github issue 82052.
|
||||
*/
|
||||
while (len > 0 && (((char *)b->buf)[len-1] & 0x80) != 0)
|
||||
--len;
|
||||
/* If we hit a length of 0, something has gone wrong. This shouldn't
|
||||
* be possible, as valid UTF-8 can have at most 3 non-final bytes
|
||||
* before a final one, and our buffer is way longer than that.
|
||||
* But to be on the safe side, if we hit this issue we just restore
|
||||
* the original length and let the console API sort it out.
|
||||
*/
|
||||
if (len == 0) {
|
||||
len = orig_len;
|
||||
}
|
||||
/* Fix for github issues gh-110913 and gh-82052. */
|
||||
len = _find_last_utf8_boundary(b->buf, len);
|
||||
wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len, NULL, 0);
|
||||
}
|
||||
Py_END_ALLOW_THREADS
|
||||
|
|
Loading…
Reference in New Issue