Add UTF-8 identifier check in nasal_lexer, fix printf formats on the Windows platform, and add test file utf8chk.nas

This commit is contained in:
ValKmjolnir 2022-05-07 16:50:13 +08:00
parent de262980cc
commit 51a1279110
6 changed files with 97 additions and 24 deletions

29
nasal.h
View File

@ -32,6 +32,16 @@
#include <sys/wait.h>
#endif
// printf conversion specifiers for 64-bit integer arguments.
//   PRTHEX64   : hexadecimal with a "0x" prefix
//   PRTHEX64_8 : hexadecimal with a "0x" prefix, at least 8 digits
//   PRTINT64   : signed decimal
// `long` is only 32 bits wide on Windows, so the `ll` length modifier is used there.
// NOTE(review): the non-Windows branch assumes `long` is 64 bits wide - confirm for every supported target.
#ifndef _WIN32
#define PRTHEX64 "0x%lx"
#define PRTHEX64_8 "0x%.8lx"
#define PRTINT64 "%ld"
#else
#define PRTHEX64 "0x%llx"
#define PRTHEX64_8 "0x%.8llx"
#define PRTINT64 "%lld"
#endif
inline double hex_to_double(const char* str)
{
double ret=0;
@ -112,6 +122,25 @@ double str2num(const char* str)
return is_negative?-ret_num:ret_num;
}
// Classify a byte as a utf-8 lead byte.
//
// Returns the number of continuation bytes (10xx xxxx) that must follow `head`:
//   110x xxxx (0xc2..0xdf) -> 1
//   1110 xxxx (0xe0..0xef) -> 2
//   1111 0xxx (0xf0..0xf4) -> 3
// Returns 0 when `head` cannot start a multi-byte sequence: ascii bytes,
// continuation bytes, the overlong leads 0xc0/0xc1, and 0xf5..0xff.
// RFC 3629 restricts utf-8 to at most 4 bytes per character, so the obsolete
// 5- and 6-byte lead forms are rejected instead of being reported as valid.
int utf8_hdchk(char head)
{
    // work on the unsigned value so the comparisons do not depend on char signedness
    const uint8_t c=(uint8_t)head;
    if(c<0xc2) // ascii, stray continuation byte, or overlong lead
        return 0;
    if(c<0xe0)
        return 1;
    if(c<0xf0)
        return 2;
    if(c<0xf5)
        return 3;
    return 0; // would encode a value above U+10FFFF
}
std::string rawstr(const std::string& str)
{
std::string ret("");

View File

@ -1287,7 +1287,7 @@ void nasal_codegen::print_op(uint32_t index)
case op_calll: case op_mcalll: case op_loadl:
printf("0x%x\n",c.num);break;
case op_callb:
printf("0x%x <%s@0x%lx>\n",c.num,builtin[c.num].name,(uint64_t)builtin[c.num].func);break;
printf("0x%x <%s@" PRTHEX64 ">\n",c.num,builtin[c.num].name,(uint64_t)builtin[c.num].func);break;
case op_upval:case op_mupval: case op_loadu:
printf("0x%x[0x%x]\n",(c.num>>16)&0xffff,c.num&0xffff);break;
case op_happ: case op_pstr:

View File

@ -1,7 +1,7 @@
#ifndef __NASAL_LEXER_H__
#define __NASAL_LEXER_H__
#define ID(c) ((c=='_')||('a'<=c && c<='z')||('A'<=c&&c<='Z'))
#define ID(c) ((c=='_')||('a'<=c && c<='z')||('A'<=c&&c<='Z')||(c<0))
#define HEX(c) (('0'<=c&&c<='9')||('a'<=c&&c<='f')||('A'<=c && c<='F'))
#define OCT(c) ('0'<=c&&c<='7')
#define DIGIT(c) ('0'<=c&&c<='9')
@ -114,6 +114,7 @@ private:
uint32_t get_type(const std::string&);
void die(std::string info){nerr.err("lexer",line,column,info);};
void open(const std::string&);
std::string utf8_gen();
std::string id_gen();
std::string num_gen();
std::string str_gen();
@ -151,12 +152,43 @@ uint32_t nasal_lexer::get_type(const std::string& tk_str)
return tok_null;
}
// Consume a run of non-ascii bytes starting at res[ptr] and return the utf-8
// characters found in it.
//
// Every complete character advances `column` by one, regardless of how many
// bytes it occupies. A byte that utf8_hdchk does not accept as a lead byte is
// skipped without being added to the result. When a lead byte is not followed
// by the number of continuation bytes it announces, die() is called and the
// partial sequence is still appended.
std::string nasal_lexer::utf8_gen()
{
    std::string str="";
    // test the high bit directly: `res[ptr]<0` only works where plain char is signed
    while(ptr<res.size() && (res[ptr]&0x80))
    {
        uint32_t nbytes=utf8_hdchk(res[ptr]);
        if(!nbytes)
        {
            ++ptr;
            continue;
        }
        std::string tmp="";
        tmp+=res[ptr++];
        // stop at the first byte that is not a continuation byte and leave it
        // unconsumed, so a broken sequence cannot swallow the bytes that follow
        // it or move ptr past the end of the buffer
        for(uint32_t i=0;i<nbytes && ptr<res.size() && (res[ptr]&0xc0)==0x80;++i)
            tmp+=res[ptr++];
        if(tmp.length()!=1+nbytes)
            die("invalid utf-8 character here");
        str+=tmp;
        ++column;
    }
    return str;
}
// Read one identifier starting at res[ptr] and return its text.
//
// An identifier is a run of ID()/DIGIT() characters and non-ascii bytes.
// Ascii characters are taken one at a time and advance `column` by one each;
// non-ascii runs are handed to utf8_gen(), which does its own column counting.
std::string nasal_lexer::id_gen()
{
    std::string str="";
    // the high-bit test is written out so the loop does not rely on plain char being signed
    while(ptr<res.size() && ((res[ptr]&0x80)||ID(res[ptr])||DIGIT(res[ptr])))
    {
        if(res[ptr]&0x80) // utf-8
            str+=utf8_gen();
        else // ascii
        {
            str+=res[ptr++];
            ++column;
        }
    }
    return str;
}
@ -283,7 +315,7 @@ void nasal_lexer::scan(const std::string& file)
std::string str;
while(ptr<res.size())
{
while(ptr<res.size() && (res[ptr]==' ' || res[ptr]=='\n' || res[ptr]=='\t' || res[ptr]=='\r' || res[ptr]<0))
while(ptr<res.size() && (res[ptr]==' ' || res[ptr]=='\n' || res[ptr]=='\t' || res[ptr]=='\r'))// || res[ptr]<0))
{
// these characters will be ignored, and '\n' will cause ++line
++column;

View File

@ -148,20 +148,20 @@ void nasal_vm::valinfo(nasal_ref& val)
{
case vm_none: printf("| null |\n");break;
case vm_ret: printf("| pc | 0x%x\n",val.ret());break;
case vm_addr: printf("| addr | 0x%lx\n",(uint64_t)val.addr());break;
case vm_cnt: printf("| cnt | %ld\n",val.cnt());break;
case vm_addr: printf("| addr | " PRTHEX64 "\n",(uint64_t)val.addr());break;
case vm_cnt: printf("| cnt | " PRTINT64 "\n",val.cnt());break;
case vm_nil: printf("| nil |\n");break;
case vm_num: printf("| num | ");std::cout<<val.num()<<'\n';break;
case vm_str:
{
std::string tmp=rawstr(val.str());
printf("| str | <0x%lx> %.16s%s\n",(uint64_t)p,tmp.c_str(),tmp.length()>16?"...":"");
printf("| str | <" PRTHEX64 "> %.16s%s\n",(uint64_t)p,tmp.c_str(),tmp.length()>16?"...":"");
}break;
case vm_func: printf("| func | <0x%lx> entry:0x%x\n",(uint64_t)p,val.func().entry);break;
case vm_vec: printf("| vec | <0x%lx> [%zu val]\n",(uint64_t)p,val.vec().size());break;
case vm_hash: printf("| hash | <0x%lx> {%zu val}\n",(uint64_t)p,val.hash().size());break;
case vm_obj: printf("| obj | <0x%lx> obj:0x%lx\n",(uint64_t)p,(uint64_t)val.obj().ptr);break;
default: printf("| err | <0x%lx> unknown object\n",(uint64_t)p);break;
case vm_func: printf("| func | <" PRTHEX64 "> entry:0x%x\n",(uint64_t)p,val.func().entry);break;
case vm_vec: printf("| vec | <" PRTHEX64 "> [%zu val]\n",(uint64_t)p,val.vec().size());break;
case vm_hash: printf("| hash | <" PRTHEX64 "> {%zu val}\n",(uint64_t)p,val.hash().size());break;
case vm_obj: printf("| obj | <" PRTHEX64 "> obj:" PRTHEX64 "\n",(uint64_t)p,(uint64_t)val.obj().ptr);break;
default: printf("| err | <" PRTHEX64 "> unknown object\n",(uint64_t)p);break;
}
}
void nasal_vm::bytecodeinfo(const char* header,const uint32_t p)
@ -202,7 +202,7 @@ void nasal_vm::bytecodeinfo(const char* header,const uint32_t p)
case op_calll: case op_mcalll: case op_loadl:
printf("0x%x",c.num);break;
case op_callb:
printf("0x%x <%s@0x%lx>",c.num,builtin[c.num].name,(uint64_t)builtin[c.num].func);break;
printf("0x%x <%s@" PRTHEX64 ">",c.num,builtin[c.num].name,(uint64_t)builtin[c.num].func);break;
case op_upval: case op_mupval: case op_loadu:
printf(" (0x%x[0x%x])",(c.num>>16)&0xffff,c.num&0xffff);break;
case op_happ: case op_pstr:
@ -251,16 +251,16 @@ void nasal_vm::stackinfo(const uint32_t limit=10)
uint32_t gsize=bytecode[0].num;
nasal_ref* top=gc.top;
nasal_ref* bottom=gc.stack+gsize;
printf("vm stack(0x%lx<sp+%u>, limit %d, total ",(uint64_t)bottom,gsize,limit);
printf("vm stack(" PRTHEX64 "<sp+%u>, limit %d, total ",(uint64_t)bottom,gsize,limit);
if(top<bottom)
{
printf("0)\n");
return;
}
printf("%ld):\n",top-bottom+1);
printf("" PRTINT64 "):\n",top-bottom+1);
for(uint32_t i=0;i<limit && top>=bottom;++i,--top)
{
printf(" 0x%.8lx",top-gc.stack);
printf(" " PRTHEX64_8 "",top-gc.stack);
valinfo(top[0]);
}
}
@ -268,7 +268,7 @@ void nasal_vm::global_state()
{
if(!bytecode[0].num || gc.stack[0].type==vm_none) // bytecode[0].op is op_intg
return;
printf("global(0x%lx<sp+0>):\n",(uint64_t)gc.stack);
printf("global(" PRTHEX64 "<sp+0>):\n",(uint64_t)gc.stack);
for(uint32_t i=0;i<bytecode[0].num;++i)
{
printf(" 0x%.8x",i);
@ -280,7 +280,7 @@ void nasal_vm::local_state()
if(!localr || !gc.funcr.func().lsize)
return;
uint32_t lsize=gc.funcr.func().lsize;
printf("local(0x%lx<sp+%ld>):\n",(uint64_t)localr,localr-gc.stack);
printf("local(" PRTHEX64 "<sp+" PRTINT64 ">):\n",(uint64_t)localr,localr-gc.stack);
for(uint32_t i=0;i<lsize;++i)
{
printf(" 0x%.8x",i);
@ -306,12 +306,12 @@ void nasal_vm::upval_state()
}
void nasal_vm::detail()
{
printf("maddr:\n (0x%lx)\n",(uint64_t)mem_addr);
printf("localr:\n (0x%lx)\n",(uint64_t)localr);
printf("maddr:\n (" PRTHEX64 ")\n",(uint64_t)mem_addr);
printf("localr:\n (" PRTHEX64 ")\n",(uint64_t)localr);
if(gc.funcr.type==vm_nil)
printf("funcr:\n (nil)\n");
else
printf("funcr:\n (<0x%lx> entry:0x%x)\n",
printf("funcr:\n (<" PRTHEX64 "> entry:0x%x)\n",
(uint64_t)gc.funcr.value.gcobj,
gc.funcr.func().entry);
global_state();

View File

@ -65,8 +65,9 @@ var testfile=[
"test/tetris.nas ",
"test/trait.nas ",
"test/turingmachine.nas",
"test/ycombinator.nas ",
"test/wavecollapse.nas "
"test/utf8chk.nas ",
"test/wavecollapse.nas ",
"test/ycombinator.nas "
];
var module=[

11
test/utf8chk.nas Normal file
View File

@ -0,0 +1,11 @@
# utf-8 identifier test.
# Every variable and function name below is written with non-ascii characters.
# The script binds print to a utf-8 name, then defines and calls a function
# that prints three messages, one per line:
#   - the utf-8 support test succeeded
#   - only utf-8 and ascii source files are currently supported
#   - on windows, code page 65001 (chcp 65001) should be enabled
var 输出=print;
var 这是unicode测试=func(){
var 测试成功=[
"unicode: utf-8支持测试成功",
"目前仅支持utf-8以及ascii格式文件",
"注意: windows系统请开启chcp 65001代码页"
];
# print each message followed by a newline
foreach(var 内容;测试成功)
输出(内容~"\n");
}
这是unicode测试();