535 lines
15 KiB
C++
535 lines
15 KiB
C++
/****************************************************************************
|
|
**
|
|
** Copyright (C) 2016 The Qt Company Ltd.
|
|
** Contact: https://www.qt.io/licensing/
|
|
**
|
|
** This file is part of the utils of the Qt Toolkit.
|
|
**
|
|
** $QT_BEGIN_LICENSE:GPL-EXCEPT$
|
|
** Commercial License Usage
|
|
** Licensees holding valid commercial Qt licenses may use this file in
|
|
** accordance with the commercial license agreement provided with the
|
|
** Software or, alternatively, in accordance with the terms contained in
|
|
** a written agreement between you and The Qt Company. For licensing terms
|
|
** and conditions see https://www.qt.io/terms-conditions. For further
|
|
** information use the contact form at https://www.qt.io/contact-us.
|
|
**
|
|
** GNU General Public License Usage
|
|
** Alternatively, this file may be used under the terms of the GNU
|
|
** General Public License version 3 as published by the Free Software
|
|
** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
|
|
** included in the packaging of this file. Please review the following
|
|
** information to ensure the GNU General Public License requirements will
|
|
** be met: https://www.gnu.org/licenses/gpl-3.0.html.
|
|
**
|
|
** $QT_END_LICENSE$
|
|
**
|
|
****************************************************************************/
|
|
|
|
#include "re2nfa.h"
|
|
#include "tokenizer.cpp"
|
|
|
|
RE2NFA::RE2NFA(const QMap<QString, NFA> ¯os, const QSet<InputType> &maxInputSet, Qt::CaseSensitivity cs)
|
|
: macros(macros), index(0), errorColumn(-1), maxInputSet(maxInputSet), caseSensitivity(cs)
|
|
{
|
|
}
|
|
|
|
NFA RE2NFA::parse(const QString &expression, int *errCol)
|
|
{
|
|
tokenize(expression);
|
|
|
|
if (symbols.isEmpty())
|
|
return NFA();
|
|
|
|
index = 0;
|
|
|
|
NFA result = parseExpr();
|
|
if (result.isEmpty()) {
|
|
if (errCol)
|
|
*errCol = errorColumn;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
NFA RE2NFA::parseExpr()
|
|
{
|
|
NFA value = parseBranch();
|
|
while (test(TOK_OR)) {
|
|
NFA rhs = parseBranch();
|
|
value = NFA::createAlternatingNFA(value, rhs);
|
|
}
|
|
return value;
|
|
}
|
|
|
|
NFA RE2NFA::parseBranch()
|
|
{
|
|
NFA value = parsePiece();
|
|
if (!hasNext())
|
|
return value;
|
|
NFA next;
|
|
do {
|
|
next = parsePiece();
|
|
if (!next.isEmpty())
|
|
value = NFA::createConcatenatingNFA(value, next);
|
|
} while (!next.isEmpty() && hasNext());
|
|
return value;
|
|
}
|
|
|
|
NFA RE2NFA::parsePiece()
|
|
{
|
|
NFA atom = parseAtom();
|
|
if (atom.isEmpty() || !hasNext())
|
|
return atom;
|
|
return parseMaybeQuantifier(atom);
|
|
}
|
|
|
|
NFA RE2NFA::parseAtom()
|
|
{
|
|
// ####
|
|
switch (next()) {
|
|
case TOK_STRING:
|
|
return createCharNFA();
|
|
case TOK_LPAREN: {
|
|
NFA subExpr = parseExpr();
|
|
next(TOK_RPAREN);
|
|
return subExpr;
|
|
}
|
|
case TOK_LBRACE: {
|
|
QString macroName = lexemUntil(TOK_RBRACE);
|
|
QMap<QString, NFA>::ConstIterator macro = macros.find(macroName);
|
|
if (macro == macros.end()) {
|
|
qWarning("Unknown macro '%s' - probably used before defined", qPrintable(macroName));
|
|
return NFA();
|
|
}
|
|
return *macro;
|
|
}
|
|
case TOK_LBRACKET: {
|
|
NFA set = parseSet();
|
|
next(TOK_RBRACKET);
|
|
return set;
|
|
}
|
|
case TOK_SEQUENCE:
|
|
return parseSet2();
|
|
case TOK_DOT:
|
|
return NFA::createSetNFA(maxInputSet);
|
|
default:
|
|
prev();
|
|
return NFA();
|
|
}
|
|
}
|
|
|
|
NFA RE2NFA::parseMaybeQuantifier(const NFA &nfa)
|
|
{
|
|
// ####
|
|
switch (next()) {
|
|
case TOK_STAR:
|
|
return NFA::createOptionalNFA(nfa);
|
|
case TOK_QUESTION:
|
|
return NFA::createZeroOrOneNFA(nfa);
|
|
case TOK_PLUS:
|
|
return NFA::createConcatenatingNFA(nfa, NFA::createOptionalNFA(nfa));
|
|
case TOK_LBRACE: {
|
|
const int rewind = index - 1;
|
|
|
|
QString lexemBeforeComma;
|
|
QString lexemAfterComma;
|
|
bool seenComma = false;
|
|
forever {
|
|
if (test(TOK_COMMA)) {
|
|
if (seenComma) {
|
|
errorColumn = symbol().column;
|
|
return NFA();
|
|
}
|
|
seenComma = true;
|
|
} else if (test(TOK_RBRACE)) {
|
|
break;
|
|
} else {
|
|
next(TOK_STRING);
|
|
if (seenComma)
|
|
lexemAfterComma += symbol().lexem;
|
|
else
|
|
lexemBeforeComma += symbol().lexem;
|
|
}
|
|
}
|
|
bool isNumber = false;
|
|
int min = lexemBeforeComma.toInt(&isNumber);
|
|
if (!isNumber) {
|
|
index = rewind;
|
|
return nfa;
|
|
}
|
|
int max = min;
|
|
if (seenComma) {
|
|
max = lexemAfterComma.toInt(&isNumber);
|
|
if (!isNumber) {
|
|
errorColumn = symbol().column;
|
|
return NFA();
|
|
}
|
|
}
|
|
return NFA::applyQuantity(nfa, min, max);
|
|
}
|
|
default:
|
|
prev();
|
|
return nfa;
|
|
}
|
|
}
|
|
|
|
NFA RE2NFA::parseSet()
|
|
{
|
|
QSet<InputType> set;
|
|
bool negate = false;
|
|
|
|
next(TOK_STRING);
|
|
|
|
do {
|
|
Q_ASSERT(symbol().lexem.length() == 1);
|
|
// ###
|
|
QChar ch = symbol().lexem.at(0);
|
|
if (set.isEmpty() && ch == QLatin1Char('^')) {
|
|
negate = true;
|
|
continue;
|
|
}
|
|
|
|
// look ahead for ranges like a-z
|
|
bool rangeFound = false;
|
|
if (test(TOK_STRING)) {
|
|
if (symbol().lexem.length() == 1
|
|
&& symbol().lexem.at(0) == QLatin1Char('-')) {
|
|
next(TOK_STRING);
|
|
Q_ASSERT(symbol().lexem.length() == 1);
|
|
QChar last = symbol().lexem.at(0);
|
|
|
|
if (ch.unicode() > last.unicode())
|
|
qSwap(ch, last);
|
|
|
|
for (ushort i = ch.unicode(); i <= last.unicode(); ++i) {
|
|
if (caseSensitivity == Qt::CaseInsensitive) {
|
|
set.insert(QChar(i).toLower().unicode());
|
|
} else {
|
|
set.insert(i);
|
|
}
|
|
}
|
|
|
|
rangeFound = true;
|
|
} else {
|
|
prev();
|
|
}
|
|
}
|
|
|
|
if (!rangeFound) {
|
|
if (caseSensitivity == Qt::CaseInsensitive) {
|
|
set.insert(ch.toLower().unicode());
|
|
} else {
|
|
set.insert(ch.unicode());
|
|
}
|
|
}
|
|
} while (test(TOK_STRING));
|
|
|
|
if (negate) {
|
|
QSet<InputType> negatedSet = maxInputSet;
|
|
negatedSet.subtract(set);
|
|
set = negatedSet;
|
|
}
|
|
|
|
return NFA::createSetNFA(set);
|
|
}
|
|
|
|
NFA RE2NFA::parseSet2()
|
|
{
|
|
QSet<InputType> set;
|
|
bool negate = false;
|
|
|
|
QString str = symbol().lexem;
|
|
// strip off brackets
|
|
str.chop(1);
|
|
str.remove(0, 1);
|
|
|
|
int i = 0;
|
|
while (i < str.length()) {
|
|
// ###
|
|
QChar ch = str.at(i++);
|
|
if (set.isEmpty() && ch == QLatin1Char('^')) {
|
|
negate = true;
|
|
continue;
|
|
}
|
|
|
|
// look ahead for ranges like a-z
|
|
bool rangeFound = false;
|
|
if (i < str.length() - 1 && str.at(i) == QLatin1Char('-')) {
|
|
++i;
|
|
QChar last = str.at(i++);
|
|
|
|
if (ch.unicode() > last.unicode())
|
|
qSwap(ch, last);
|
|
|
|
for (ushort i = ch.unicode(); i <= last.unicode(); ++i) {
|
|
if (caseSensitivity == Qt::CaseInsensitive) {
|
|
set.insert(QChar(i).toLower().unicode());
|
|
} else {
|
|
set.insert(i);
|
|
}
|
|
}
|
|
|
|
rangeFound = true;
|
|
}
|
|
|
|
if (!rangeFound) {
|
|
if (caseSensitivity == Qt::CaseInsensitive) {
|
|
set.insert(ch.toLower().unicode());
|
|
} else {
|
|
set.insert(ch.unicode());
|
|
}
|
|
}
|
|
}
|
|
|
|
if (negate) {
|
|
QSet<InputType> negatedSet = maxInputSet;
|
|
negatedSet.subtract(set);
|
|
set = negatedSet;
|
|
}
|
|
|
|
return NFA::createSetNFA(set);
|
|
}
|
|
NFA RE2NFA::createCharNFA()
|
|
{
|
|
NFA nfa;
|
|
// ####
|
|
if (caseSensitivity == Qt::CaseInsensitive) {
|
|
nfa = NFA::createStringNFA(symbol().lexem.toLower().toLatin1());
|
|
} else {
|
|
nfa = NFA::createStringNFA(symbol().lexem.toLatin1());
|
|
}
|
|
return nfa;
|
|
}
|
|
|
|
static inline int skipQuote(const QString &str, int pos)
|
|
{
|
|
while (pos < str.length()
|
|
&& str.at(pos) != QLatin1Char('"')) {
|
|
if (str.at(pos) == QLatin1Char('\\')) {
|
|
++pos;
|
|
if (pos >= str.length())
|
|
break;
|
|
}
|
|
++pos;
|
|
}
|
|
if (pos < str.length())
|
|
++pos;
|
|
return pos;
|
|
}
|
|
|
|
#if 0
|
|
static const char*tokStr(Token t)
|
|
{
|
|
switch (t) {
|
|
case TOK_INVALID: return "TOK_INVALID";
|
|
case TOK_STRING: return "TOK_STRING";
|
|
case TOK_LBRACE: return "TOK_LBRACE";
|
|
case TOK_RBRACE: return "TOK_RBRACE";
|
|
case TOK_LBRACKET: return "TOK_LBRACKET";
|
|
case TOK_RBRACKET: return "TOK_RBRACKET";
|
|
case TOK_LPAREN: return "TOK_LPAREN";
|
|
case TOK_RPAREN: return "TOK_RPAREN";
|
|
case TOK_COMMA: return "TOK_COMMA";
|
|
case TOK_STAR: return "TOK_STAR";
|
|
case TOK_OR: return "TOK_OR";
|
|
case TOK_QUESTION: return "TOK_QUESTION";
|
|
case TOK_DOT: return "TOK_DOT";
|
|
case TOK_PLUS: return "TOK_PLUS";
|
|
case TOK_SEQUENCE: return "TOK_SEQUENCE";
|
|
case TOK_QUOTED_STRING: return "TOK_QUOTED_STRING";
|
|
}
|
|
return "";
|
|
}
|
|
#endif
|
|
|
|
void RE2NFA::tokenize(const QString &input)
|
|
{
|
|
symbols.clear();
|
|
#if 1
|
|
RegExpTokenizer tokenizer(input);
|
|
Symbol sym;
|
|
int tok = tokenizer.lex();
|
|
while (tok != -1) {
|
|
Symbol sym;
|
|
sym.token = static_cast<Token>(tok);
|
|
sym.lexem = input.mid(tokenizer.lexemStart, tokenizer.lexemLength);
|
|
|
|
if (sym.token == TOK_QUOTED_STRING) {
|
|
sym.lexem.chop(1);
|
|
sym.lexem.remove(0, 1);
|
|
sym.token = TOK_STRING;
|
|
}
|
|
|
|
if (sym.token == TOK_STRING || sym.token == TOK_SEQUENCE) {
|
|
for (int i = 0; i < sym.lexem.length(); ++i) {
|
|
if (sym.lexem.at(i) == '\\') {
|
|
if (i >= sym.lexem.length() - 1)
|
|
break;
|
|
QChar ch = sym.lexem.at(i + 1);
|
|
if (ch == QLatin1Char('n')) {
|
|
ch = '\n';
|
|
} else if (ch == QLatin1Char('r')) {
|
|
ch = '\r';
|
|
} else if (ch == QLatin1Char('t')) {
|
|
ch = '\t';
|
|
} else if (ch == QLatin1Char('f')) {
|
|
ch = '\f';
|
|
}
|
|
sym.lexem.replace(i, 2, ch);
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
if (sym.token == TOK_SEQUENCE) {
|
|
Symbol s;
|
|
s.token = TOK_LBRACKET;
|
|
s.lexem = "[";
|
|
symbols.append(s);
|
|
|
|
for (int i = 1; i < sym.lexem.length() - 1; ++i) {
|
|
s.token = TOK_STRING;
|
|
s.lexem = sym.lexem.at(i);
|
|
symbols.append(s);
|
|
}
|
|
|
|
s.token = TOK_RBRACKET;
|
|
s.lexem = "]";
|
|
symbols.append(s);
|
|
|
|
tok = tokenizer.lex();
|
|
continue;
|
|
}
|
|
*/
|
|
|
|
symbols.append(sym);
|
|
tok = tokenizer.lex();
|
|
}
|
|
#else
|
|
int pos = 0;
|
|
bool insideSet = false;
|
|
while (pos < input.length()) {
|
|
QChar ch = input.at(pos);
|
|
|
|
Symbol sym;
|
|
sym.column = pos;
|
|
sym.token = TOK_INVALID;
|
|
sym.lexem = QString(ch);
|
|
switch (ch.toLatin1()) {
|
|
case '"': {
|
|
if (insideSet) {
|
|
sym.token = TOK_STRING;
|
|
sym.lexem = QString(ch);
|
|
symbols += sym;
|
|
++pos;
|
|
continue;
|
|
}
|
|
if (pos + 1 >= input.length())
|
|
return;
|
|
int quoteEnd = skipQuote(input, pos + 1);
|
|
sym.token = TOK_STRING;
|
|
sym.lexem = input.mid(pos + 1, quoteEnd - pos - 2);
|
|
symbols += sym;
|
|
pos = quoteEnd;
|
|
continue;
|
|
}
|
|
case '{':
|
|
sym.token = (insideSet ? TOK_STRING : TOK_LBRACE);
|
|
break;
|
|
case '}':
|
|
sym.token = (insideSet ? TOK_STRING : TOK_RBRACE);
|
|
break;
|
|
case '[':
|
|
insideSet = true;
|
|
sym.token = TOK_LBRACKET;
|
|
break;
|
|
case ']':
|
|
insideSet = false;
|
|
sym.token = TOK_RBRACKET;
|
|
break;
|
|
case '(':
|
|
sym.token = (insideSet ? TOK_STRING : TOK_LPAREN);
|
|
break;
|
|
case ')':
|
|
sym.token = (insideSet ? TOK_STRING : TOK_RPAREN);
|
|
break;
|
|
case ',':
|
|
sym.token = (insideSet ? TOK_STRING : TOK_COMMA);
|
|
break;
|
|
case '*':
|
|
sym.token = (insideSet ? TOK_STRING : TOK_STAR);
|
|
break;
|
|
case '|':
|
|
sym.token = (insideSet ? TOK_STRING : TOK_OR);
|
|
break;
|
|
case '?':
|
|
sym.token = (insideSet ? TOK_STRING : TOK_QUESTION);
|
|
break;
|
|
case '.':
|
|
sym.token = (insideSet ? TOK_STRING : TOK_DOT);
|
|
break;
|
|
case '+':
|
|
sym.token = (insideSet ? TOK_STRING : TOK_PLUS);
|
|
break;
|
|
case '\\':
|
|
++pos;
|
|
if (pos >= input.length())
|
|
return;
|
|
ch = input.at(pos);
|
|
if (ch == QLatin1Char('n')) {
|
|
ch = '\n';
|
|
} else if (ch == QLatin1Char('r')) {
|
|
ch = '\r';
|
|
} else if (ch == QLatin1Char('t')) {
|
|
ch = '\t';
|
|
} else if (ch == QLatin1Char('f')) {
|
|
ch = '\f';
|
|
}
|
|
// fall through
|
|
default:
|
|
sym.token = TOK_STRING;
|
|
sym.lexem = QString(ch);
|
|
symbols += sym;
|
|
++pos;
|
|
continue;
|
|
}
|
|
symbols += sym;
|
|
++pos;
|
|
}
|
|
#endif
|
|
#if 0
|
|
foreach (Symbol s, symbols) {
|
|
qDebug() << "Tok" << tokStr(s.token) << "lexem" << s.lexem;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
bool RE2NFA::next(Token t)
|
|
{
|
|
if (hasNext() && next() == t)
|
|
return true;
|
|
errorColumn = symbol().column;
|
|
Q_ASSERT(false);
|
|
return false;
|
|
}
|
|
|
|
bool RE2NFA::test(Token t)
|
|
{
|
|
if (index >= symbols.count())
|
|
return false;
|
|
if (symbols.at(index).token == t) {
|
|
++index;
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
QString RE2NFA::lexemUntil(Token t)
|
|
{
|
|
QString lexem;
|
|
while (hasNext() && next() != t)
|
|
lexem += symbol().lexem;
|
|
return lexem;
|
|
}
|
|
|