// Nasal-Interpreter/nasal_lexer.h
// 464 lines | 12 KiB | C
// (web viewer chrome: Raw / Normal View / History)

2020-10-23 14:53:04 +08:00
#ifndef __NASAL_LEXER_H__
#define __NASAL_LEXER_H__
2020-11-02 12:24:12 +08:00
// Character classification helpers used by the lexer.
// Each macro evaluates its argument more than once, so only pass expressions
// without side effects. Arguments are parenthesized so that expressions such
// as `c|0` or `cond?a:b` classify correctly.

// '_' or an ASCII letter: the characters that may start an identifier
#define IS_IDENTIFIER(c) (((c)=='_')||('a'<=(c)&&(c)<='z')||('A'<=(c)&&(c)<='Z'))
// hexadecimal digit, either letter case
#define IS_HEX_NUMBER(c) (('0'<=(c)&&(c)<='9')||('a'<=(c)&&(c)<='f')||('A'<=(c)&&(c)<='F'))
// octal digit
#define IS_OCT_NUMBER(c) ('0'<=(c)&&(c)<='7')
// historical misspelling, kept so existing call sites keep compiling
#define IS_OCT_NUMEBR(c) IS_OCT_NUMBER(c)
// decimal digit
#define IS_DIGIT(c) ('0'<=(c)&&(c)<='9')
// characters that open (and close) a string literal
#define IS_STRING(c) ((c)=='\''||(c)=='\"'||(c)=='`')
// single operators have only one character
#define IS_SINGLE_OPERATOR(c) ((c)=='('||(c)==')'||(c)=='['||(c)==']'||(c)=='{'||(c)=='}'||(c)==','||(c)==';'||(c)=='|'||(c)==':'||\
                               (c)=='?'||(c)=='`'||(c)=='&'||(c)=='@'||(c)=='%'||(c)=='$'||(c)=='^'||(c)=='\\')
// historical misspelling, kept so existing call sites keep compiling
#define IS_SINGLE_OPRATOR(c) IS_SINGLE_OPERATOR(c)
// calculation operators may have two chars, for example: += -= *= /= ~= != == >= <=
#define IS_CALC_OPERATOR(c) ((c)=='='||(c)=='+'||(c)=='-'||(c)=='*'||(c)=='!'||(c)=='/'||(c)=='<'||(c)=='>'||(c)=='~')
// '#' starts a comment ("note") that runs to the end of the line
#define IS_NOTE(c) ((c)=='#')
2020-10-23 14:53:04 +08:00
2020-11-30 23:54:32 +08:00
// Kinds of token the lexer can produce. Values are consecutive starting at 0;
// tok_null doubles as the "no type" value.
enum token_type
{
    tok_null=0,
    // literals and names
    tok_number,tok_string,tok_identifier,
    // keywords
    tok_for,tok_forindex,tok_foreach,tok_while,
    tok_var,tok_func,tok_break,tok_continue,
    tok_return,tok_if,tok_elsif,tok_else,tok_nil,
    // ( )
    tok_left_curve,tok_right_curve,
    // [ ]
    tok_left_bracket,tok_right_bracket,
    // { }
    tok_left_brace,tok_right_brace,
    // ; and or , . ... ?
    tok_semi,tok_and,tok_or,tok_comma,tok_dot,tok_ellipsis,tok_quesmark,
    // : + - * / ~ !
    tok_colon,tok_add,tok_sub,tok_mult,tok_div,tok_link,tok_not,
    // =
    tok_equal,
    // += -= *= /= ~=
    tok_add_equal,tok_sub_equal,tok_mult_equal,tok_div_equal,tok_link_equal,
    // == != < <= > >=
    tok_cmp_equal,tok_cmp_not_equal,tok_less_than,tok_less_equal,tok_greater_than,tok_greater_equal
};
2020-11-02 12:24:12 +08:00
struct
2020-10-23 14:53:04 +08:00
{
2020-11-02 12:24:12 +08:00
const char* str;
2020-10-23 14:53:04 +08:00
int tok_type;
2020-11-02 12:24:12 +08:00
}token_table[]=
2020-10-23 14:53:04 +08:00
{
{"for" ,tok_for },
{"forindex",tok_forindex },
{"foreach" ,tok_foreach },
{"while" ,tok_while },
{"var" ,tok_var },
{"func" ,tok_func },
{"break" ,tok_break },
{"continue",tok_continue },
{"return" ,tok_return },
{"if" ,tok_if },
{"elsif" ,tok_elsif },
{"else" ,tok_else },
{"nil" ,tok_nil },
{"(" ,tok_left_curve },
{")" ,tok_right_curve },
{"[" ,tok_left_bracket },
{"]" ,tok_right_bracket},
{"{" ,tok_left_brace },
{"}" ,tok_right_brace },
{";" ,tok_semi },
{"and" ,tok_and },
{"or" ,tok_or },
{"," ,tok_comma },
{"." ,tok_dot },
{"..." ,tok_ellipsis },
{"?" ,tok_quesmark },
{":" ,tok_colon },
{"+" ,tok_add },
{"-" ,tok_sub },
{"*" ,tok_mult },
{"/" ,tok_div },
{"~" ,tok_link },
{"!" ,tok_not },
{"=" ,tok_equal },
{"+=" ,tok_add_equal },
{"-=" ,tok_sub_equal },
{"*=" ,tok_mult_equal },
{"/=" ,tok_div_equal },
{"~=" ,tok_link_equal },
{"==" ,tok_cmp_equal },
{"!=" ,tok_cmp_not_equal},
{"<" ,tok_less_than },
2020-12-14 00:10:31 +08:00
{"<=" ,tok_less_equal },
2020-10-23 14:53:04 +08:00
{">" ,tok_greater_than },
{">=" ,tok_greater_equal},
2020-11-02 12:24:12 +08:00
{NULL ,-1 }
2020-10-23 14:53:04 +08:00
};
// One lexical token as stored in the lexer's token list.
struct token
{
    int         line; // line number recorded when the token was emitted
    int         type; // token kind, a token_type value
    std::string str;  // token text
};
// Lexer: loads a source file into memory (openfile) and splits it into
// tokens (scanner). Errors are counted and printed to standard output.
class nasal_lexer
{
private:
    // Scalars get default values so a freshly constructed lexer is in the
    // same state as after clear(); without them get_error()/scanner() would
    // read indeterminate ints.
    int error=0;                   // number of errors reported so far
    int res_size=0;                // number of characters held in res
    int line=0;                    // current line number while scanning
    int ptr=0;                     // index into res of the next character to read
    std::string line_code;         // text of the current line consumed so far, used in diagnostics
    std::vector<char> res;         // raw content of the opened file
    std::vector<token> token_list; // tokens produced by scanner()
    // Sub-scanners: each starts at res[ptr] and advances ptr past the lexeme.
    std::string identifier_gen();
    std::string number_gen();
    std::string string_gen();
public:
    // reset every member to its empty state
    void clear();
    // load the named file into res
    void openfile(std::string);
    // count and print an error: message, line, column
    void die(std::string,int,int);
    // tokenize res into token_list
    void scanner();
    // print every token as "(line | text)"
    void print_token();
    // number of errors counted so far
    int get_error();
    // access to the generated token list
    std::vector<token>& get_token_list();
};
void nasal_lexer::clear()
{
2020-10-23 17:10:02 +08:00
error=0;
res_size=0;
line=0;
ptr=0;
2020-11-02 12:24:12 +08:00
line_code="";
2020-10-23 17:10:02 +08:00
res.clear();
2020-10-23 14:53:04 +08:00
token_list.clear();
return;
}
2020-10-23 17:10:02 +08:00
// Load the whole file `filename` (binary mode) into `res`.
// Resets the error counter first. If the file cannot be opened, one error is
// counted, a message is printed and the buffer is left empty.
void nasal_lexer::openfile(std::string filename)
{
    error=0;
    res.clear();
    // keep res_size in step with res even if the open below fails, otherwise
    // a later scanner() would index the now-empty buffer with a stale size
    res_size=0;
    std::ifstream fin(filename,std::ios::binary);
    if(fin.fail())
    {
        ++error;
        std::cout<<">> [lexer] cannot open file \""<<filename<<"\".\n";
        return;
    }
    // get(c) fails on end-of-file and on read errors alike, so the loop
    // always terminates (testing eof() alone would spin on a bad stream)
    char c;
    while(fin.get(c))
        res.push_back(c);
    res_size=static_cast<int>(res.size());
    // fin is closed by its destructor
}
2020-11-02 12:24:12 +08:00
void nasal_lexer::die(std::string error_info,int line=-1,int column=-1)
{
++error;
std::cout<<">> [lexer] line "<<line<<" column "<<column<<": "<<error_info<<"\n";
return;
}
2020-10-23 17:10:02 +08:00
std::string nasal_lexer::identifier_gen()
2020-10-23 14:53:04 +08:00
{
std::string token_str="";
2020-11-02 12:24:12 +08:00
while(ptr<res_size && (IS_IDENTIFIER(res[ptr])||IS_DIGIT(res[ptr])))
2020-10-23 14:53:04 +08:00
token_str+=res[ptr++];
2020-11-02 12:24:12 +08:00
line_code+=token_str;
2020-10-23 14:53:04 +08:00
return token_str;
// after running this process, ptr will point to the next token's beginning character
}
2020-10-23 17:10:02 +08:00
std::string nasal_lexer::number_gen()
2020-10-23 14:53:04 +08:00
{
bool scientific_notation=false;// numbers like 1e8 are scientific_notation
std::string token_str="";
// generate hex number
if(res[ptr]=='0' && ptr+1<res_size && res[ptr+1]=='x')
{
token_str="0x";
ptr+=2;
while(ptr<res_size && IS_HEX_NUMBER(res[ptr]))
token_str+=res[ptr++];
2020-11-02 12:24:12 +08:00
line_code+=token_str;
2020-10-23 14:53:04 +08:00
if(token_str=="0x")
{
2020-11-02 12:24:12 +08:00
die("["+line_code+"_] incorrect number.",line,line_code.length());
2020-10-23 14:53:04 +08:00
return "0";
}
return token_str;
}
// generate oct number
else if(res[ptr]=='0' && ptr+1<res_size && res[ptr+1]=='o')
{
token_str="0o";
ptr+=2;
while(ptr<res_size && IS_OCT_NUMEBR(res[ptr]))
token_str+=res[ptr++];
2020-11-02 12:24:12 +08:00
line_code+=token_str;
2020-10-23 14:53:04 +08:00
if(token_str=="0o")
{
2020-11-02 12:24:12 +08:00
die("["+line_code+"_] incorrect number.",line,line_code.length());
2020-10-23 14:53:04 +08:00
return "0";
}
return token_str;
}
// generate dec number
2020-11-02 12:24:12 +08:00
// dec number -> [0~9][0~9]*(.[0~9]*)(e|E(+|-)0|[1~9][0~9]*)
2020-10-23 14:53:04 +08:00
while(ptr<res_size && IS_DIGIT(res[ptr]))
token_str+=res[ptr++];
if(ptr<res_size && res[ptr]=='.')
{
token_str+=res[ptr++];
// "xxxx." is not a correct number
if(ptr>=res_size)
{
2020-11-02 12:24:12 +08:00
line_code+=token_str;
die("["+line_code+"_] incorrect number.",line,line_code.length());
2020-10-23 14:53:04 +08:00
return "0";
}
while(ptr<res_size && IS_DIGIT(res[ptr]))
token_str+=res[ptr++];
// "xxxx." is not a correct number
if(token_str.back()=='.')
{
2020-11-02 12:24:12 +08:00
line_code+=token_str;
die("["+line_code+"_] incorrect number.",line,line_code.length());
2020-10-23 14:53:04 +08:00
return "0";
}
}
if(ptr<res_size && (res[ptr]=='e' || res[ptr]=='E'))
{
token_str+=res[ptr++];
// "xxxe" is not a correct number
if(ptr>=res_size)
{
2020-11-02 12:24:12 +08:00
line_code+=token_str;
die("["+line_code+"_] incorrect number.",line,line_code.length());
2020-10-23 14:53:04 +08:00
return "0";
}
if(ptr<res_size && (res[ptr]=='-' || res[ptr]=='+'))
token_str+=res[ptr++];
if(ptr>=res_size)
{
2020-11-02 12:24:12 +08:00
line_code+=token_str;
die("["+line_code+"_] incorrect number.",line,line_code.length());
2020-10-23 14:53:04 +08:00
return "0";
}
if(ptr<res_size && res[ptr]=='0')
token_str+=res[ptr++];
while(ptr<res_size && IS_DIGIT(res[ptr]))
token_str+=res[ptr++];
// "xxxe(-|+)" is not a correct number
if(token_str.back()=='e' || token_str.back()=='E' || token_str.back()=='-' || token_str.back()=='+')
{
2020-11-02 12:24:12 +08:00
line_code+=token_str;
die("["+line_code+"_] incorrect number.",line,line_code.length());
2020-10-23 14:53:04 +08:00
return "0";
}
}
2020-11-02 12:24:12 +08:00
line_code+=token_str;
2020-10-23 14:53:04 +08:00
return token_str;
}
2020-10-23 17:10:02 +08:00
std::string nasal_lexer::string_gen()
2020-10-23 14:53:04 +08:00
{
std::string token_str="";
2020-11-02 12:24:12 +08:00
line_code+=res[ptr];
2020-10-23 14:53:04 +08:00
char str_begin=res[ptr++];
while(ptr<res_size && res[ptr]!=str_begin)
{
2020-11-02 12:24:12 +08:00
line_code+=res[ptr];
if(res[ptr]=='\n')
{
line_code="";
++line;
}
if(res[ptr]=='\\' && ptr+1<res_size)
2020-10-23 14:53:04 +08:00
{
++ptr;
2020-11-02 12:24:12 +08:00
line_code+=res[ptr];
2020-10-23 14:53:04 +08:00
switch(res[ptr])
{
case 'a':token_str.push_back('\a');break;
case 'b':token_str.push_back('\b');break;
case 'f':token_str.push_back('\f');break;
case 'n':token_str.push_back('\n');break;
case 'r':token_str.push_back('\r');break;
case 't':token_str.push_back('\t');break;
case 'v':token_str.push_back('\v');break;
case '?':token_str.push_back('\?');break;
case '0':token_str.push_back('\0');break;
case '\\':token_str.push_back('\\');break;
case '\'':token_str.push_back('\'');break;
case '\"':token_str.push_back('\"');break;
default: token_str.push_back(res[ptr]);break;
}
}
else
token_str+=res[ptr];
++ptr;
}
// check if this string ends with a " or '
if(ptr>=res_size)
2020-11-02 12:24:12 +08:00
die("["+line_code+"_] get EOF when generating string.",line,line_code.length());
2020-10-23 14:53:04 +08:00
++ptr;
return token_str;
}
2020-10-23 17:10:02 +08:00
void nasal_lexer::scanner()
2020-10-23 14:53:04 +08:00
{
token_list.clear();
2020-10-23 17:10:02 +08:00
line=1;
ptr=0;
2020-11-02 12:24:12 +08:00
line_code="";
2020-10-23 17:10:02 +08:00
2020-10-23 14:53:04 +08:00
std::string token_str;
while(ptr<res_size)
{
while(ptr<res_size && (res[ptr]==' ' || res[ptr]=='\n' || res[ptr]=='\t' || res[ptr]=='\r' || res[ptr]<0))
{
// these characters will be ignored, and '\n' will cause ++line
2020-11-02 12:24:12 +08:00
line_code+=res[ptr];
if(res[ptr]=='\n')
{
++line;
line_code="";
}
2020-10-23 14:53:04 +08:00
++ptr;
}
if(ptr>=res_size) break;
2020-11-02 12:24:12 +08:00
if(IS_IDENTIFIER(res[ptr]))
2020-10-23 14:53:04 +08:00
{
2020-10-23 17:10:02 +08:00
token_str=identifier_gen();
2020-10-23 14:53:04 +08:00
token new_token;
new_token.line=line;
new_token.str=token_str;
new_token.type=0;
2020-11-02 12:24:12 +08:00
for(int i=0;token_table[i].str;++i)
if(token_str==token_table[i].str)
2020-10-23 14:53:04 +08:00
{
2020-11-02 12:24:12 +08:00
new_token.type=token_table[i].tok_type;
2020-10-23 14:53:04 +08:00
break;
}
if(!new_token.type)
new_token.type=tok_identifier;
token_list.push_back(new_token);
}
else if(IS_DIGIT(res[ptr]))
{
2020-10-23 17:10:02 +08:00
token_str=number_gen();
2020-10-23 14:53:04 +08:00
token new_token;
new_token.line=line;
new_token.str=token_str;
new_token.type=tok_number;
token_list.push_back(new_token);
}
2020-11-02 12:24:12 +08:00
else if(IS_STRING(res[ptr]))
2020-10-23 14:53:04 +08:00
{
2020-10-23 17:10:02 +08:00
token_str=string_gen();
2020-10-23 14:53:04 +08:00
token new_token;
new_token.line=line;
new_token.type=tok_string;
new_token.str=token_str;
token_list.push_back(new_token);
}
else if(IS_SINGLE_OPRATOR(res[ptr]))
{
token_str="";
token_str+=res[ptr];
2020-11-02 12:24:12 +08:00
line_code+=res[ptr];
2020-10-23 14:53:04 +08:00
token new_token;
new_token.line=line;
new_token.str=token_str;
2020-11-02 12:24:12 +08:00
new_token.type=-1;
for(int i=0;token_table[i].str;++i)
if(token_str==token_table[i].str)
2020-10-23 14:53:04 +08:00
{
2020-11-02 12:24:12 +08:00
new_token.type=token_table[i].tok_type;
2020-10-23 14:53:04 +08:00
break;
}
2020-11-02 12:24:12 +08:00
if(new_token.type<0)
die("["+line_code+"_] incorrect operator.",line,line_code.length());
2020-10-23 14:53:04 +08:00
token_list.push_back(new_token);
++ptr;
}
else if(res[ptr]=='.')
{
if(ptr+2<res_size && res[ptr+1]=='.' && res[ptr+2]=='.')
{
token_str="...";
ptr+=3;
}
else
{
token_str=".";
++ptr;
}
2020-11-02 12:24:12 +08:00
line_code+=token_str;
2020-10-23 14:53:04 +08:00
token new_token;
new_token.line=line;
new_token.str=token_str;
2020-11-02 12:24:12 +08:00
for(int i=0;token_table[i].str;++i)
if(token_str==token_table[i].str)
2020-10-23 14:53:04 +08:00
{
2020-11-02 12:24:12 +08:00
new_token.type=token_table[i].tok_type;
2020-10-23 14:53:04 +08:00
break;
}
token_list.push_back(new_token);
}
else if(IS_CALC_OPERATOR(res[ptr]))
{
// get calculation operator
2020-11-02 12:24:12 +08:00
token_str=res[ptr];
2020-10-23 14:53:04 +08:00
++ptr;
if(ptr<res.size() && res[ptr]=='=')
{
token_str+=res[ptr];
++ptr;
}
2020-11-02 12:24:12 +08:00
line_code+=token_str;
2020-10-23 14:53:04 +08:00
token new_token;
new_token.line=line;
new_token.str=token_str;
2020-11-02 12:24:12 +08:00
for(int i=0;token_table[i].str;++i)
if(token_str==token_table[i].str)
2020-10-23 14:53:04 +08:00
{
2020-11-02 12:24:12 +08:00
new_token.type=token_table[i].tok_type;
2020-10-23 14:53:04 +08:00
break;
}
token_list.push_back(new_token);
}
2020-11-02 12:24:12 +08:00
else if(IS_NOTE(res[ptr]))
2020-10-23 14:53:04 +08:00
{
// avoid note
while(ptr<res_size && res[ptr]!='\n') ++ptr;
// after this process ptr will point to a '\n'
// don't ++ptr then the counter for line can work correctly
}
else
{
2020-11-02 12:24:12 +08:00
line_code+=res[ptr];
die("["+line_code+"_] unknown character.",line,line_code.length());
2020-10-23 14:53:04 +08:00
++ptr;
}
}
return;
}
void nasal_lexer::print_token()
{
int size=token_list.size();
for(int i=0;i<size;++i)
std::cout<<"("<<token_list[i].line<<" | "<<token_list[i].str<<")\n";
return;
}
int nasal_lexer::get_error()
{
return error;
}
std::vector<token>& nasal_lexer::get_token_list()
{
return token_list;
}
2020-06-01 01:19:26 +08:00
#endif