382 lines
9.8 KiB
Plaintext
382 lines
9.8 KiB
Plaintext
|
#include "EXTERN.h"
|
||
|
#include "perl.h"
|
||
|
#include "XSUB.h"
|
||
|
#include "perliol.h"
|
||
|
#include "ppport.h"
|
||
|
|
||
|
#define UTF8_MAX_BYTES 4
|
||
|
|
||
|
static const U8 xs_utf8_sequence_len[0x100] = {
|
||
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00-0x0F */
|
||
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10-0x1F */
|
||
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20-0x2F */
|
||
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x30-0x3F */
|
||
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40-0x4F */
|
||
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x50-0x5F */
|
||
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60-0x6F */
|
||
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x70-0x7F */
|
||
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8F */
|
||
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9F */
|
||
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xAF */
|
||
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xB0-0xBF */
|
||
|
0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xC0-0xCF */
|
||
|
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xD0-0xDF */
|
||
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* 0xE0-0xEF */
|
||
|
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, /* 0xF0-0xFF */
|
||
|
};
|
||
|
|
||
|
|
||
|
typedef enum { STRICT_UTF8=0, ALLOW_SURROGATES=1, ALLOW_NONCHARACTERS=2, ALLOW_NONSHORTEST=4 } utf8_flags;
|
||
|
|
||
|
|
||
|
static STRLEN skip_sequence(const U8 *cur, const STRLEN len) {
|
||
|
STRLEN i, n = xs_utf8_sequence_len[*cur];
|
||
|
|
||
|
if (n < 1 || len < 2)
|
||
|
return 1;
|
||
|
|
||
|
switch (cur[0]) {
|
||
|
case 0xE0: if ((cur[1] & 0xE0) != 0xA0) return 1; break;
|
||
|
case 0xED: if ((cur[1] & 0xE0) != 0x80) return 1; break;
|
||
|
case 0xF4: if ((cur[1] & 0xF0) != 0x80) return 1; break;
|
||
|
case 0xF0: if ((cur[1] & 0xF0) == 0x80) return 1; /* FALLTROUGH */
|
||
|
default: if ((cur[1] & 0xC0) != 0x80) return 1; break;
|
||
|
}
|
||
|
|
||
|
if (n > len)
|
||
|
n = len;
|
||
|
for (i = 2; i < n; i++)
|
||
|
if ((cur[i] & 0xC0) != 0x80)
|
||
|
break;
|
||
|
return i;
|
||
|
}
|
||
|
|
||
|
#if defined(PERL_STATIC_NO_RET) && defined(__attribute__noreturn__)
|
||
|
PERL_STATIC_NO_RET void report_illformed(pTHX_ const U8 *cur, STRLEN len, bool eof) __attribute__noreturn__;
|
||
|
#elif defined(__attribute__noreturn__)
|
||
|
static void report_illformed(pTHX_ const U8 *cur, STRLEN len, bool eof) __attribute__noreturn__;
|
||
|
#endif
|
||
|
|
||
|
static void report_illformed(pTHX_ const U8 *cur, STRLEN len, bool eof) {
|
||
|
static const char *hex = "0123456789ABCDEF";
|
||
|
const char *fmt;
|
||
|
char seq[UTF8_MAX_BYTES * 3];
|
||
|
char *d = seq;
|
||
|
|
||
|
if (eof)
|
||
|
fmt = "Can't decode ill-formed UTF-8 octet sequence <%s> at end of file";
|
||
|
else
|
||
|
fmt = "Can't decode ill-formed UTF-8 octet sequence <%s>";
|
||
|
|
||
|
while (len-- > 0) {
|
||
|
const U8 c = *cur++;
|
||
|
*d++ = hex[c >> 4];
|
||
|
*d++ = hex[c & 15];
|
||
|
if (len)
|
||
|
*d++ = ' ';
|
||
|
}
|
||
|
*d = 0;
|
||
|
Perl_croak(aTHX_ fmt, seq);
|
||
|
}
|
||
|
|
||
|
#if defined(PERL_STATIC_NO_RET) && defined(__attribute__noreturn__)
|
||
|
PERL_STATIC_NO_RET void report_noncharacter(pTHX_ UV usv) __attribute__noreturn__;
|
||
|
#elif defined(__attribute__noreturn__)
|
||
|
static void report_noncharacter(pTHX_ UV usv) __attribute__noreturn__;
|
||
|
#endif
|
||
|
|
||
|
static void report_noncharacter(pTHX_ UV usv) {
|
||
|
static const char *fmt = "Can't interchange noncharacter code point U+%"UVXf;
|
||
|
Perl_croak(aTHX_ fmt, usv);
|
||
|
}
|
||
|
|
||
|
static STRLEN validate(pTHX_ const U8 *buf, const U8 *end, const int flags, PerlIO* handle) {
|
||
|
const bool eof = PerlIO_eof(handle);
|
||
|
const U8 *cur = buf;
|
||
|
const U8 *end4 = end - UTF8_MAX_BYTES;
|
||
|
STRLEN skip = 0;
|
||
|
U32 v;
|
||
|
|
||
|
while (cur < end4) {
|
||
|
while (cur < end4 && *cur < 0x80)
|
||
|
cur++;
|
||
|
|
||
|
check:
|
||
|
switch (xs_utf8_sequence_len[*cur]) {
|
||
|
case 0:
|
||
|
goto illformed;
|
||
|
case 1:
|
||
|
cur += 1;
|
||
|
break;
|
||
|
case 2:
|
||
|
/* 110xxxxx 10xxxxxx */
|
||
|
if ((cur[1] & 0xC0) != 0x80)
|
||
|
goto illformed;
|
||
|
cur += 2;
|
||
|
break;
|
||
|
case 3:
|
||
|
v = ((U32)cur[0] << 16)
|
||
|
| ((U32)cur[1] << 8)
|
||
|
| ((U32)cur[2]);
|
||
|
/* 1110xxxx 10xxxxxx 10xxxxxx */
|
||
|
if ((v & 0x00F0C0C0) != 0x00E08080 ||
|
||
|
/* Non-shortest form */
|
||
|
v < 0x00E0A080)
|
||
|
goto illformed;
|
||
|
/* Surrogates U+D800..U+DFFF */
|
||
|
if (!(flags & ALLOW_SURROGATES) && (v & 0x00EFA080) == 0x00EDA080)
|
||
|
goto illformed;
|
||
|
/* Non-characters U+FDD0..U+FDEF, U+FFFE..U+FFFF */
|
||
|
if (!(flags & ALLOW_NONCHARACTERS) && v >= 0x00EFB790 && (v <= 0x00EFB7AF || v >= 0x00EFBFBE))
|
||
|
goto noncharacter;
|
||
|
cur += 3;
|
||
|
break;
|
||
|
case 4:
|
||
|
v = ((U32)cur[0] << 24)
|
||
|
| ((U32)cur[1] << 16)
|
||
|
| ((U32)cur[2] << 8)
|
||
|
| ((U32)cur[3]);
|
||
|
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
||
|
if ((v & 0xF8C0C0C0) != 0xF0808080 ||
|
||
|
/* Non-shortest form */
|
||
|
v < 0xF0908080 ||
|
||
|
/* Greater than U+10FFFF */
|
||
|
v > 0xF48FBFBF)
|
||
|
goto illformed;
|
||
|
/* Non-characters U+nFFFE..U+nFFFF on plane 1-16 */
|
||
|
if (!(flags & ALLOW_NONCHARACTERS) && (v & 0x000FBFBE) == 0x000FBFBE)
|
||
|
goto noncharacter;
|
||
|
cur += 4;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (cur < end) {
|
||
|
if (cur + xs_utf8_sequence_len[*cur] <= end)
|
||
|
goto check;
|
||
|
skip = skip_sequence(cur, end - cur);
|
||
|
if (eof || cur + skip < end)
|
||
|
goto illformed;
|
||
|
}
|
||
|
return cur - buf;
|
||
|
|
||
|
illformed:
|
||
|
if (!skip)
|
||
|
skip = skip_sequence(cur, end - cur);
|
||
|
PerlIOBase(handle)->flags |= PERLIO_F_ERROR;
|
||
|
report_illformed(aTHX_ cur, skip, eof);
|
||
|
|
||
|
noncharacter:
|
||
|
if (v < 0xF0808080)
|
||
|
v = (v & 0x3F) | (v & 0x3F00) >> 2 | (v & 0x0F0000) >> 4;
|
||
|
else
|
||
|
v = (v & 0x3F) | (v & 0x3F00) >> 2 | (v & 0x3F0000) >> 4 | (v & 0x07000000) >> 6;
|
||
|
PerlIOBase(handle)->flags |= PERLIO_F_ERROR;
|
||
|
report_noncharacter(aTHX_ v);
|
||
|
}
|
||
|
|
||
|
typedef struct {
|
||
|
PerlIOBuf buf;
|
||
|
STDCHAR leftovers[UTF8_MAX_BYTES];
|
||
|
size_t leftover_length;
|
||
|
utf8_flags flags;
|
||
|
} PerlIOUnicode;
|
||
|
|
||
|
static struct {
|
||
|
const char* name;
|
||
|
size_t length;
|
||
|
utf8_flags value;
|
||
|
} map[] = {
|
||
|
{ STR_WITH_LEN("allow_surrogates"), ALLOW_SURROGATES },
|
||
|
{ STR_WITH_LEN("allow_noncharacters"), ALLOW_NONCHARACTERS },
|
||
|
{ STR_WITH_LEN("allow_nonshortest"), ALLOW_NONSHORTEST },
|
||
|
{ STR_WITH_LEN("strict"), 0 },
|
||
|
{ STR_WITH_LEN("loose"), ALLOW_SURROGATES | ALLOW_NONCHARACTERS | ALLOW_NONSHORTEST },
|
||
|
};
|
||
|
|
||
|
static utf8_flags lookup_parameter(pTHX_ const char* ptr, size_t len) {
|
||
|
unsigned i;
|
||
|
for (i = 0; i < sizeof map / sizeof *map; ++i) {
|
||
|
if (map[i].length == len && memcmp(ptr, map[i].name, len) == 0)
|
||
|
return map[i].value;
|
||
|
}
|
||
|
Perl_croak(aTHX_ "Unknown argument to :utf8_strict: %*s", (int)len, ptr);
|
||
|
}
|
||
|
static utf8_flags parse_parameters(pTHX_ SV* param) {
|
||
|
STRLEN len;
|
||
|
const char *begin, *delim;
|
||
|
if (!param || !SvOK(param))
|
||
|
return 0;
|
||
|
|
||
|
begin = SvPV(param, len);
|
||
|
delim = strchr(begin, ',');
|
||
|
if(delim) {
|
||
|
utf8_flags ret = 0;
|
||
|
const char* end = begin + len;
|
||
|
do {
|
||
|
ret |= lookup_parameter(aTHX_ begin, delim - begin);
|
||
|
begin = delim + 1;
|
||
|
delim = strchr(begin, ',');
|
||
|
} while (delim);
|
||
|
if (begin < end)
|
||
|
ret |= lookup_parameter(aTHX_ begin, end - begin);
|
||
|
return ret;
|
||
|
}
|
||
|
else {
|
||
|
return lookup_parameter(aTHX_ begin, len);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#define line_buffered(flags) ((flags & (PERLIO_F_LINEBUF | PERLIO_F_CANWRITE)) == (PERLIO_F_LINEBUF | PERLIO_F_CANWRITE))
|
||
|
|
||
|
void PerlIOBase_flush_linebuf(pTHX) {
|
||
|
#ifdef dVAR
|
||
|
dVAR;
|
||
|
#endif
|
||
|
PerlIOl **table = &PL_perlio;
|
||
|
PerlIOl *f;
|
||
|
while ((f = *table)) {
|
||
|
int i;
|
||
|
table = (PerlIOl **) (f++);
|
||
|
for (i = 1; i < 64; i++) {
|
||
|
if (f->next && line_buffered(PerlIOBase(&(f->next))->flags))
|
||
|
PerlIO_flush(&(f->next));
|
||
|
f++;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static IV PerlIOUnicode_pushed(pTHX_ PerlIO* f, const char* mode, SV* arg, PerlIO_funcs* tab) {
|
||
|
utf8_flags flags = parse_parameters(aTHX_ arg);
|
||
|
if (PerlIOBuf_pushed(aTHX_ f, mode, arg, tab) == 0) {
|
||
|
PerlIOBase(f)->flags |= PERLIO_F_UTF8;
|
||
|
PerlIOSelf(f, PerlIOUnicode)->flags = flags;
|
||
|
return 0;
|
||
|
}
|
||
|
return -1;
|
||
|
}
|
||
|
|
||
|
static IV PerlIOUnicode_fill(pTHX_ PerlIO* f) {
|
||
|
PerlIOUnicode * const u = PerlIOSelf(f, PerlIOUnicode);
|
||
|
PerlIOBuf * const b = &u->buf;
|
||
|
PerlIO *n = PerlIONext(f);
|
||
|
SSize_t avail;
|
||
|
Size_t read_bytes = 0;
|
||
|
STDCHAR *end;
|
||
|
SSize_t fit;
|
||
|
|
||
|
if (PerlIO_flush(f) != 0)
|
||
|
return -1;
|
||
|
if (PerlIOBase(f)->flags & PERLIO_F_TTY)
|
||
|
PerlIOBase_flush_linebuf(aTHX);
|
||
|
|
||
|
if (!b->buf)
|
||
|
PerlIO_get_base(f);
|
||
|
|
||
|
assert(b->buf);
|
||
|
|
||
|
if (u->leftover_length) {
|
||
|
Copy(u->leftovers, b->buf, u->leftover_length, STDCHAR);
|
||
|
b->end = b->buf + u->leftover_length;
|
||
|
read_bytes = u->leftover_length;
|
||
|
u->leftover_length = 0;
|
||
|
}
|
||
|
else {
|
||
|
b->ptr = b->end = b->buf;
|
||
|
}
|
||
|
fit = (SSize_t)b->bufsiz - (b->end - b->buf);
|
||
|
|
||
|
if (!PerlIOValid(n)) {
|
||
|
PerlIOBase(f)->flags |= PERLIO_F_EOF;
|
||
|
return -1;
|
||
|
}
|
||
|
|
||
|
if (PerlIO_fast_gets(n)) {
|
||
|
/*
|
||
|
* Layer below is also buffered. We do _NOT_ want to call its
|
||
|
* ->Read() because that will loop till it gets what we asked for
|
||
|
* which may hang on a pipe etc. Instead take anything it has to
|
||
|
* hand, or ask it to fill _once_.
|
||
|
*/
|
||
|
avail = PerlIO_get_cnt(n);
|
||
|
if (avail <= 0) {
|
||
|
avail = PerlIO_fill(n);
|
||
|
if (avail == 0)
|
||
|
avail = PerlIO_get_cnt(n);
|
||
|
else {
|
||
|
if (!PerlIO_error(n) && PerlIO_eof(n))
|
||
|
avail = 0;
|
||
|
}
|
||
|
}
|
||
|
if (avail > 0) {
|
||
|
STDCHAR *ptr = PerlIO_get_ptr(n);
|
||
|
const SSize_t cnt = avail;
|
||
|
if (avail > fit)
|
||
|
avail = fit;
|
||
|
Copy(ptr, b->end, avail, STDCHAR);
|
||
|
PerlIO_set_ptrcnt(n, ptr + avail, cnt - avail);
|
||
|
read_bytes += avail;
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
avail = PerlIO_read(n, b->end, fit);
|
||
|
if (avail > 0)
|
||
|
read_bytes += avail;
|
||
|
}
|
||
|
if (avail <= 0) {
|
||
|
if (avail < 0 || (read_bytes == 0 && PerlIO_eof(n))) {
|
||
|
PerlIOBase(f)->flags |= (avail == 0) ? PERLIO_F_EOF : PERLIO_F_ERROR;
|
||
|
return -1;
|
||
|
}
|
||
|
}
|
||
|
end = b->buf + read_bytes;
|
||
|
b->end = b->buf + validate(aTHX_ (const U8 *)b->buf, (const U8 *)end, u->flags, n);
|
||
|
if (b->end < end) {
|
||
|
size_t len = b->buf + read_bytes - b->end;
|
||
|
Copy(b->end, u->leftovers, len, char);
|
||
|
u->leftover_length = len;
|
||
|
}
|
||
|
PerlIOBase(f)->flags |= PERLIO_F_RDBUF;
|
||
|
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
PERLIO_FUNCS_DECL(PerlIO_utf8_strict) = {
|
||
|
sizeof(PerlIO_funcs),
|
||
|
"utf8_strict",
|
||
|
sizeof(PerlIOUnicode),
|
||
|
PERLIO_K_BUFFERED|PERLIO_K_UTF8,
|
||
|
PerlIOUnicode_pushed,
|
||
|
PerlIOBuf_popped,
|
||
|
PerlIOBuf_open,
|
||
|
PerlIOBase_binmode,
|
||
|
NULL,
|
||
|
PerlIOBase_fileno,
|
||
|
PerlIOBuf_dup,
|
||
|
PerlIOBuf_read,
|
||
|
PerlIOBase_unread,
|
||
|
PerlIOBuf_write,
|
||
|
PerlIOBuf_seek,
|
||
|
PerlIOBuf_tell,
|
||
|
PerlIOBuf_close,
|
||
|
PerlIOBuf_flush,
|
||
|
PerlIOUnicode_fill,
|
||
|
PerlIOBase_eof,
|
||
|
PerlIOBase_error,
|
||
|
PerlIOBase_clearerr,
|
||
|
PerlIOBase_setlinebuf,
|
||
|
PerlIOBuf_get_base,
|
||
|
PerlIOBuf_bufsiz,
|
||
|
PerlIOBuf_get_ptr,
|
||
|
PerlIOBuf_get_cnt,
|
||
|
PerlIOBuf_set_ptrcnt,
|
||
|
};
|
||
|
|
||
|
MODULE = PerlIO::utf8_strict
|
||
|
|
||
|
PROTOTYPES: DISABLE
|
||
|
|
||
|
BOOT:
|
||
|
PerlIO_define_layer(aTHX_ (PerlIO_funcs*)&PerlIO_utf8_strict);
|
||
|
|