doxygen/libxml/xml.l

498 lines
14 KiB
Plaintext

/******************************************************************************
*
* Copyright (C) 1997-2020 by Dimitri van Heesch.
*
* Permission to use, copy, modify, and distribute this software and its
* documentation under the terms of the GNU General Public License is hereby
* granted. No representations are made about the suitability of this software
* for any purpose. It is provided "as is" without express or implied warranty.
* See the GNU General Public License for more details.
*
* Documents produced by Doxygen are derivative works derived from the
* input used in their production; they are not affected by this license.
*
*/
/******************************************************************************
* Minimal flex based parser for XML
******************************************************************************/
%option never-interactive
%option prefix="xmlYY"
%option reentrant
%option extra-type="struct xmlYY_state *"
%option 8bit noyywrap
%top{
#include <stdint.h>
}
%{
#include <ctype.h>
#include <vector>
#include <stdio.h>
#include "xml.h"
//#include "message.h"
#define YY_NEVER_INTERACTIVE 1
#define YY_NO_INPUT 1
#define YY_NO_UNISTD_H 1
/** Per-scanner state, attached to the reentrant flex scanner through %option extra-type
 *  and reached inside the scanner via yyextra.
 */
struct xmlYY_state
{
  std::string fileName;                 //!< name of the input as used in error reports
  int lineNr = 1;                       //!< current line number in the input
  const char * inputString = nullptr;   //!< the code fragment as text
  yy_size_t inputPosition = 0;          //!< read offset during parsing
  std::string name;                     //!< name of the tag that is being scanned
  bool isEnd = false;                   //!< true => </tag>
  bool selfClose = false;               //!< true => <tag/>
  std::string data;                     //!< character data collected since the last tag
  std::string attrValue;                //!< value of the attribute that is being scanned
  std::string attrName;                 //!< name of the attribute that is being scanned
  XMLHandlers::Attributes attrs;        //!< attributes collected for the current tag
  XMLHandlers handlers;                 //!< callbacks supplied by the user of the parser
  int cdataContext = 0;                 //!< start condition to return to after a CDATA section
  int commentContext = 0;               //!< start condition to return to after a comment
  char stringChar = 0;                  //!< quote character that opened the current attribute value
  std::vector<std::string> xpath;       //!< names of the elements that are currently open
};
#if USE_STATE2STRING
static const char *stateToString(int state);
#endif
static yy_size_t yyread(yyscan_t yyscanner,char *buf,yy_size_t max_size);
static void initElement(yyscan_t yyscanner);
static void addCharacters(yyscan_t yyscanner);
static void addElement(yyscan_t yyscanner);
static void addAttribute(yyscan_t yyscanner);
static void countLines(yyscan_t yyscanner, const char *txt,yy_size_t len);
static void reportError(yyscan_t yyscanner, const std::string &msg);
static std::string processData(yyscan_t yyscanner,const char *txt,yy_size_t len);
#undef YY_INPUT
#define YY_INPUT(buf,result,max_size) result=yyread(yyscanner,buf,max_size);
%}
/* line endings and runs of whitespace */
NL (\r\n|\r|\n)
SP [ \t\r\n]+
/* tag delimiters: leading whitespace and one trailing line ending are part of the match */
OPEN {SP}?"<"
OPENSPECIAL {SP}?"<?"
CLOSE ">"{NL}?
CLOSESPECIAL "?>"{NL}?
/* element and attribute names; bytes in the range 0200-0377 are accepted as name characters */
NAMESTART [:A-Za-z\200-\377_]
NAMECHAR [:A-Za-z\200-\377_0-9.-]
NAME {NAMESTART}{NAMECHAR}*
/* numeric character reference in decimal or hexadecimal notation */
ESC "&#"[0-9]+";"|"&#x"[0-9a-fA-F]+";"
COLON ":"
/* character data: everything up to the next opening angle bracket */
PCDATA [^<]+
/* comment delimiters */
COMMENT {OPEN}"!--"
COMMENTEND "--"{CLOSE}
/* quoted string without ampersands other than numeric character references */
STRING \"([^"&]|{ESC})*\"|\'([^'&]|{ESC})*\'
/* start of a document type declaration */
DOCTYPE {SP}?"<!DOCTYPE"{SP}
/* CDATA section delimiters */
CDATA {SP}?"<![CDATA["
ENDCDATA "]]>"
%option noyywrap
%s Initial
%s Content
%s CDataSection
%s Element
%s Attributes
%s AttributeValue
%s AttrValueStr
%s Prolog
%s Comment
%%
<Initial>{
  {SP}             { /* whitespace before the root element only affects the line count */
                     countLines(yyscanner,yytext,yyleng); }
  {DOCTYPE}        { /* only the start of the declaration is matched here */
                     countLines(yyscanner,yytext,yyleng); }
  {OPENSPECIAL}    { /* start of a processing instruction */
                     countLines(yyscanner,yytext,yyleng); BEGIN(Prolog); }
  {OPEN}           { /* start of the first tag */
                     countLines(yyscanner,yytext,yyleng);
                     initElement(yyscanner);
                     BEGIN(Element); }
  {COMMENT}        { /* the match may include leading whitespace with line breaks,
                        so the lines have to be counted here as well */
                     yyextra->commentContext = YY_START;
                     countLines(yyscanner,yytext,yyleng);
                     BEGIN(Comment);
                   }
}
<Content>{
  {CDATA}          { /* remember the current state so the end marker can return to it */
                     countLines(yyscanner,yytext,yyleng);
                     yyextra->cdataContext = YY_START;
                     BEGIN(CDataSection);
                   }
  {PCDATA}         { /* character data may span several lines: decode the entities first,
                        so entity errors are reported at the line where the text starts,
                        then advance the line counter */
                     yyextra->data += processData(yyscanner,yytext,yyleng);
                     countLines(yyscanner,yytext,yyleng);
                   }
  {OPEN}           { /* a new tag starts: hand over the collected text first */
                     countLines(yyscanner,yytext,yyleng);
                     addCharacters(yyscanner);
                     initElement(yyscanner);
                     BEGIN(Element);
                   }
  {COMMENT}        { /* remember the current state so the end of the comment can return to it */
                     yyextra->commentContext = YY_START;
                     countLines(yyscanner,yytext,yyleng);
                     BEGIN(Comment);
                   }
}
<Element>{
"/" { /* slash before the tag name: this is an end tag */ yyextra->isEnd = true; }
{NAME} { /* store the tag name and continue with its attributes */ yyextra->name = yytext;
BEGIN(Attributes); }
{CLOSE} { /* closing bracket reached while still in this state: report the element and start collecting content */ addElement(yyscanner);
countLines(yyscanner,yytext,yyleng);
yyextra->data = "";
BEGIN(Content);
}
{SP} { /* whitespace only affects the line count */ countLines(yyscanner,yytext,yyleng); }
}
<Attributes>{
"/" { /* slash after the tag name: flag the tag as self closing */ yyextra->selfClose = true; }
{NAME} { /* attribute name, kept until its value has been read */ yyextra->attrName = yytext; }
"=" { /* the attribute value is expected next */ BEGIN(AttributeValue); }
{CLOSE} { /* end of the tag: report the element and reset the character buffer */ addElement(yyscanner);
countLines(yyscanner,yytext,yyleng);
yyextra->data = "";
BEGIN(Content);
}
{SP} { /* whitespace only affects the line count */ countLines(yyscanner,yytext,yyleng); }
}
<AttributeValue>{
{SP} { /* whitespace before the value only affects the line count */ countLines(yyscanner,yytext,yyleng); }
['"] { /* opening quote: remember which quote character has to end the value */ yyextra->stringChar = *yytext;
yyextra->attrValue = "";
BEGIN(AttrValueStr);
}
. { /* anything else: report it, push the character back and resume attribute parsing */ std::string msg = std::string("Missing attribute value. Unexpected character `")+yytext+"` found";
reportError(yyscanner,msg);
unput(*yytext);
BEGIN(Attributes);
}
}
<AttrValueStr>{
[^'"\n]+ { /* ordinary characters: append them after entity decoding */ yyextra->attrValue += processData(yyscanner,yytext,yyleng); }
['"] { /* a quote ends the value only when it equals the opening quote */ if (*yytext==yyextra->stringChar)
{
addAttribute(yyscanner);
BEGIN(Attributes);
}
else
{
/* the other quote character is ordinary content of the value */
yyextra->attrValue += processData(yyscanner,yytext,yyleng);
}
}
\n { /* newline inside a value: count the line and store a space in its place */ yyextra->lineNr++; yyextra->attrValue+=' '; }
}
<CDataSection>{
{ENDCDATA} { /* end marker: return to the state that was active when the section started */ BEGIN(yyextra->cdataContext); }
[^]\n]+ { /* section text is appended as is, without entity decoding */ yyextra->data += yytext; }
\n { /* newline: keep it in the data and count the line */ yyextra->data += yytext;
yyextra->lineNr++;
}
. { /* any remaining single character is data as well */ yyextra->data += yytext; }
}
<Prolog>{
{CLOSESPECIAL} { /* end of the processing instruction: back to the Initial state */ countLines(yyscanner,yytext,yyleng);
BEGIN(Initial);
}
[^?\n]+ { /* the content of the processing instruction is skipped */ }
\n { /* only the line count is updated */ yyextra->lineNr++; }
. { /* skipped as well */ }
}
<Comment>{
{COMMENTEND} { /* end of the comment: return to the state that was active when it started */ countLines(yyscanner,yytext,yyleng);
BEGIN(yyextra->commentContext);
}
[^\n-]+ { /* the comment text is skipped */ }
\n { /* only the line count is updated */ yyextra->lineNr++; }
. { /* skipped as well */ }
}
\n { /* rule without a start condition: a newline that reaches it only updates the line count */ yyextra->lineNr++; }
. { /* rule without a start condition: any other character that reaches it is reported as an error */ std::string msg = "Unexpected character `";
msg+=yytext;
msg+="` found";
reportError(yyscanner,msg);
}
%%
//----------------------------------------------------------------------------------------
/** Input callback behind YY_INPUT: copies the next part of the in-memory input to the scanner.
 *  @param yyscanner handle of the reentrant scanner
 *  @param buf       destination buffer provided by the scanner
 *  @param max_size  maximum number of bytes that fit in \a buf
 *  @return the number of bytes copied; 0 once the terminating zero of the input is reached
 *
 *  The parameter type matches the forward declaration (yy_size_t) so that declaration and
 *  definition always name the same function, also when yy_size_t is not size_t.
 */
static yy_size_t yyread(yyscan_t yyscanner,char *buf,yy_size_t max_size)
{
  struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
  if (yyextra->inputString==nullptr) return 0; // no input attached: behave like end of input
  const char *src = yyextra->inputString + yyextra->inputPosition;
  yy_size_t copied = 0;
  while (copied<max_size && *src)
  {
    *buf++ = *src++;
    copied++;
  }
  yyextra->inputPosition += copied; // continue from here on the next call
  return copied;
}
/** Advances the line counter by one for every newline found in the first \a len bytes of \a txt. */
static void countLines(yyscan_t yyscanner, const char *txt,yy_size_t len)
{
  struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
  const char *end = txt+len;
  for (const char *cur = txt; cur!=end; ++cur)
  {
    if (*cur=='\n')
    {
      yyextra->lineNr++;
    }
  }
}
/** Resets the per-tag state before a new tag is scanned. */
static void initElement(yyscan_t yyscanner)
{
  struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
  yyextra->name.clear();
  yyextra->attrs.clear();
  yyextra->selfClose = false; // set when the tag turns out to be <tag/>
  yyextra->isEnd     = false; // set when the tag turns out to be </tag>
}
/** Checks the closing tag held in the scanner state against the innermost open element.
 *  A matching element is removed from the stack of open elements; a missing or different
 *  element is reported as an error and leaves the stack untouched.
 */
static void checkAndUpdatePath(yyscan_t yyscanner)
{
  struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
  const std::string &closingName = yyextra->name;

  if (yyextra->xpath.empty()) // nothing is open at all
  {
    reportError(yyscanner,"found closing tag '"+closingName+"' without matching opening tag");
    return;
  }

  const std::string &openName = yyextra->xpath.back();
  if (openName==closingName) // matching end tag
  {
    yyextra->xpath.pop_back();
    return;
  }

  reportError(yyscanner,"Found closing tag '"+closingName+"' that does not match the opening tag '"+openName+"' at the same level");
}
/** Reports the tag that was just scanned to the handlers.
 *  A start tag is pushed on the stack of open elements and passed to startElement;
 *  an end tag or a self closing tag is checked against that stack and passed to endElement.
 *  With flex debugging enabled each event is also traced on stderr.
 */
static void addElement(yyscan_t yyscanner)
{
  struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
  const bool isOpeningTag = !yyextra->isEnd;

  if (isOpeningTag)
  {
    yyextra->xpath.push_back(yyextra->name);
    if (yyextra->handlers.startElement) yyextra->handlers.startElement(yyextra->name,yyextra->attrs);
    if (yy_flex_debug)
    {
      fprintf(stderr,"%d: startElement(%s,attr=[",yyextra->lineNr,yyextra->name.c_str());
      for (const auto &attr : yyextra->attrs)
      {
        fprintf(stderr,"%s='%s' ",attr.first.c_str(),attr.second.c_str());
      }
      fprintf(stderr,"])\n");
    }
  }

  // a plain start tag has nothing to close
  if (isOpeningTag && !yyextra->selfClose) return;

  if (yy_flex_debug) fprintf(stderr,"%d: endElement(%s)\n",yyextra->lineNr,yyextra->name.c_str());
  checkAndUpdatePath(yyscanner);
  if (yyextra->handlers.endElement) yyextra->handlers.endElement(yyextra->name);
}
/** Returns a copy of \a str without leading and trailing whitespace.
 *  @param str the text to trim
 *  @return the trimmed text; empty when \a str is empty or consists of whitespace only
 *
 *  Each character is converted to unsigned char before it is passed to isspace(), because
 *  calling isspace() with a negative value (any byte above 0x7F where char is signed, e.g.
 *  UTF-8 encoded text) is undefined behavior.
 */
static std::string trimSpaces(const std::string &str)
{
  size_t first = 0;
  size_t last  = str.length(); // one past the last character that is kept
  while (first<last && isspace(static_cast<unsigned char>(str[first])))  first++;
  while (last>first && isspace(static_cast<unsigned char>(str[last-1]))) last--;
  return str.substr(first,last-first);
}
/** Passes the character data collected so far, stripped of surrounding whitespace, to the
 *  characters handler. The handler is called for empty text as well; the debug trace on
 *  stderr is only written for non-empty text.
 */
static void addCharacters(yyscan_t yyscanner)
{
  struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
  std::string text = trimSpaces(yyextra->data);
  if (yyextra->handlers.characters) yyextra->handlers.characters(text);
  if (yy_flex_debug && !text.empty())
  {
    fprintf(stderr,"characters(%s)\n",text.c_str());
  }
}
/** Stores the attribute name/value pair that was just scanned in the attribute list of the current tag. */
static void addAttribute(yyscan_t yyscanner)
{
  struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
  const std::string &key   = yyextra->attrName;
  const std::string &value = yyextra->attrValue;
  yyextra->attrs.insert(std::make_pair(key,value));
}
/** Reports a problem found at the current input position.
 *  The message goes to stderr when flex debugging is enabled and, independently of that,
 *  to the error handler when one is set.
 */
static void reportError(yyscan_t yyscanner,const std::string &msg)
{
  struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;
  if (yy_flex_debug) fprintf(stderr,"%s:%d: Error '%s'\n",yyextra->fileName.c_str(),yyextra->lineNr,msg.c_str());
  if (!yyextra->handlers.error) return; // nobody is listening
  yyextra->handlers.error(yyextra->fileName,yyextra->lineNr,msg);
}
// names of the supported character entities and, at the same index, the character each one stands for
static const char *entities_enc[] = { "amp", "quot", "gt", "lt", "apos" };
static const char entities_dec[] = { '&', '"', '>', '<', '\'' };
static const int num_entities = 5;

/** Replaces character entities such as &amp; in \a txt and returns the resulting text.
 *  @param yyscanner handle of the reentrant scanner, used for error reporting
 *  @param txt       the text to process; \a len bytes are read
 *  @param len       number of bytes of \a txt to process
 *  @return \a txt with every supported entity replaced by its character
 *
 *  An ampersand that does not start a supported, properly terminated entity is reported
 *  through reportError() and dropped; the characters after it are copied as ordinary text.
 *  The entity name is collected in a std::string, so an ampersand with no semicolon before
 *  the end of the text no longer reads uninitialized buffer contents.
 */
static std::string processData(yyscan_t yyscanner,const char *txt,yy_size_t len)
{
  const yy_size_t maxEntityLen = 10; // upper bound on the name length that is examined
  std::string result;
  result.reserve(len);
  for (yy_size_t i=0; i<len; i++)
  {
    const char c = txt[i];
    if (c!='&')
    {
      result+=c;
      continue;
    }

    // collect the entity name: the characters between the ampersand and the semicolon
    std::string entity;
    bool terminated = false;
    for (yy_size_t j=i+1; j<len && entity.length()<maxEntityLen; j++)
    {
      if (txt[j]==';')
      {
        terminated = true;
        break;
      }
      entity+=txt[j];
    }

    bool found = false;
    for (int e=0; terminated && !found && e<num_entities; e++)
    {
      if (entity==entities_enc[e])
      {
        result+=entities_dec[e];
        i+=entity.length()+1; // skip the name and the semicolon
        found = true;
      }
    }
    if (!found)
    {
      std::string msg = std::string("Invalid character entity '&") + entity + ";' found\n";
      reportError(yyscanner,msg);
    }
  }
  return result;
}
//--------------------------------------------------------------
/** Implementation data of XMLParser: the scanner handle plus the state attached to it. */
struct XMLParser::Private
{
yyscan_t yyscanner; //!< handle of the reentrant scanner, set up in the XMLParser constructor
struct xmlYY_state xmlYY_extra; //!< state registered with the scanner as its extra data
};
/** Creates a parser that reports what it finds through \a handlers. */
XMLParser::XMLParser(const XMLHandlers &handlers) : p(new Private)
{
  Private &impl = *p;
  // create the scanner with the private state registered as its extra data
  xmlYYlex_init_extra(&impl.xmlYY_extra,&impl.yyscanner);
  impl.xmlYY_extra.handlers = handlers;
}
/** Releases the scanner that was created in the constructor. */
XMLParser::~XMLParser()
{
  yyscan_t scanner = p->yyscanner;
  xmlYYlex_destroy(scanner);
}
/** Parses an XML document held in memory and reports its content through the handlers.
 *  @param fileName     name of the input, used in error reports and debug output;
 *                      a null pointer is treated as an empty name
 *  @param inputStr     zero terminated XML text; nothing happens when it is null or empty
 *  @param debugEnabled when true, and flex debugging is off, enter/finish messages are
 *                      written to stdout
 *
 *  startDocument is called before scanning and endDocument after it. Elements that are
 *  still open at the end of the input are reported as an error afterwards.
 */
void XMLParser::parse(const char *fileName,const char *inputStr, bool debugEnabled)
{
  yyscan_t yyscanner = p->yyscanner;
  struct yyguts_t *yyg = (struct yyguts_t*)yyscanner;

#ifdef FLEX_DEBUG
  xmlYYset_debug(1,p->yyscanner);
#endif

  if (inputStr==nullptr || inputStr[0]=='\0') return; // empty input

  // a null name is undefined behavior for std::string assignment and for printf's %s
  const char *shownName = fileName ? fileName : "";

  FILE *output = nullptr;
  const char *enter_txt = nullptr;
  const char *finished_txt = nullptr;
  const char *pre_txt = nullptr;
  if (yy_flex_debug) { output=stderr; pre_txt="--"; enter_txt="entering"; finished_txt="finished"; }
  else if (debugEnabled) { output=stdout; pre_txt=""; enter_txt="Entering"; finished_txt="Finished"; }

  if (output)
  {
    fprintf(output,"%s%s lexical analyzer: %s (for: %s)\n",pre_txt,enter_txt, __FILE__, shownName);
  }

  BEGIN(Initial);
  yyextra->fileName = shownName;
  yyextra->lineNr = 1;
  yyextra->inputString = inputStr;
  yyextra->inputPosition = 0;
  // elements left open by a previous document must not be matched against this one
  yyextra->xpath.clear();
  xmlYYrestart( 0, yyscanner );

  if (yyextra->handlers.startDocument)
  {
    yyextra->handlers.startDocument();
  }
  xmlYYlex(yyscanner);
  if (yyextra->handlers.endDocument)
  {
    yyextra->handlers.endDocument();
  }

  if (!yyextra->xpath.empty())
  {
    std::string tagName = yyextra->xpath.back();
    std::string msg = "End of file reached while expecting closing tag '"+tagName+"'";
    reportError(yyscanner,msg);
  }

  if (output)
  {
    fprintf(output,"%s%s lexical analyzer: %s (for: %s)\n",pre_txt,finished_txt, __FILE__, shownName);
  }
}
/** Returns the line number the scanner has currently reached. */
int XMLParser::lineNr() const
{
  struct yyguts_t *yyg = static_cast<struct yyguts_t*>(p->yyscanner);
  const int currentLine = yyextra->lineNr;
  return currentLine;
}
/** Returns the file name stored in the scanner state. */
std::string XMLParser::fileName() const
{
  struct yyguts_t *yyg = static_cast<struct yyguts_t*>(p->yyscanner);
  const std::string &currentName = yyextra->fileName;
  return currentName;
}
#if USE_STATE2STRING
#include "xml.l.h"
#endif