311 lines
10 KiB
C++
311 lines
10 KiB
C++
/****************************************************************************
|
|
**
|
|
** Copyright (C) 2016 The Qt Company Ltd.
|
|
** Contact: https://www.qt.io/licensing/
|
|
**
|
|
** This file is part of the utils of the Qt Toolkit.
|
|
**
|
|
** $QT_BEGIN_LICENSE:GPL-EXCEPT$
|
|
** Commercial License Usage
|
|
** Licensees holding valid commercial Qt licenses may use this file in
|
|
** accordance with the commercial license agreement provided with the
|
|
** Software or, alternatively, in accordance with the terms contained in
|
|
** a written agreement between you and The Qt Company. For licensing terms
|
|
** and conditions see https://www.qt.io/terms-conditions. For further
|
|
** information use the contact form at https://www.qt.io/contact-us.
|
|
**
|
|
** GNU General Public License Usage
|
|
** Alternatively, this file may be used under the terms of the GNU
|
|
** General Public License version 3 as published by the Free Software
|
|
** Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
|
|
** included in the packaging of this file. Please review the following
|
|
** information to ensure the GNU General Public License requirements will
|
|
** be met: https://www.gnu.org/licenses/gpl-3.0.html.
|
|
**
|
|
** $QT_END_LICENSE$
|
|
**
|
|
****************************************************************************/
|
|
|
|
#include "nfa.h"
|
|
#include "re2nfa.h"
|
|
#include "configfile.h"
|
|
#include "generator.h"
|
|
|
|
#include <QFile>
|
|
#include <QCoreApplication>
|
|
#include <QFileInfo>
|
|
#include <QDateTime>
|
|
|
|
struct Symbol
|
|
{
|
|
QString token;
|
|
QString lexem;
|
|
};
|
|
|
|
static QList<Symbol> tokenize(const DFA &dfa, const QString &input, Config *cfg, bool *ok = 0)
|
|
{
|
|
QList<Symbol> symbols;
|
|
Symbol lastSymbol;
|
|
int state = 0;
|
|
int lastAcceptingState = -1;
|
|
QString lastAcceptingLexem;
|
|
int lastAcceptingPos = -1;
|
|
for (int i = 0; i < input.length(); ++i) {
|
|
QChar ch = input.at(i);
|
|
QChar chForInput = ch;
|
|
if (cfg->caseSensitivity == Qt::CaseInsensitive)
|
|
chForInput = chForInput.toLower();
|
|
int next = dfa.at(state).transitions.value(chForInput.unicode());
|
|
if (cfg->debug)
|
|
qDebug() << "input" << input.at(i) << "leads to state" << next;
|
|
if (next) {
|
|
lastSymbol.lexem.append(input.at(i));
|
|
lastSymbol.token = dfa.at(next).symbol;
|
|
if (!lastSymbol.token.isEmpty()) {
|
|
lastAcceptingState = next;
|
|
lastAcceptingLexem = lastSymbol.lexem;
|
|
lastAcceptingPos = i;
|
|
}
|
|
state = next;
|
|
} else {
|
|
if (lastAcceptingState != -1) {
|
|
if (cfg->debug)
|
|
qDebug() << "adding" << dfa.at(lastAcceptingState).symbol << "and backtracking to" << lastAcceptingPos;
|
|
Symbol s;
|
|
s.token = dfa.at(lastAcceptingState).symbol;
|
|
s.lexem = lastAcceptingLexem;
|
|
symbols << s;
|
|
lastSymbol = Symbol();
|
|
state = 0;
|
|
i = lastAcceptingPos;
|
|
lastAcceptingPos = -1;
|
|
lastAcceptingState = -1;
|
|
continue;
|
|
}
|
|
if (state == 0 || lastSymbol.token.isEmpty()) {
|
|
if (cfg->debug)
|
|
qDebug() << "invalid input";
|
|
if (ok)
|
|
*ok = false;
|
|
return symbols;
|
|
}
|
|
if (cfg->debug)
|
|
qDebug() << "appending symbol with token" << lastSymbol.token;
|
|
symbols << lastSymbol;
|
|
lastSymbol = Symbol();
|
|
state = 0;
|
|
lastAcceptingState = -1;
|
|
--i;
|
|
}
|
|
}
|
|
if (!lastSymbol.token.isEmpty()) {
|
|
if (cfg->debug)
|
|
qDebug() << "appending (last) symbol with token" << lastSymbol.token;
|
|
symbols << lastSymbol;
|
|
} else if (lastAcceptingState != -1) {
|
|
if (cfg->debug)
|
|
qDebug() << "appending last accepting state with token" << dfa.at(lastAcceptingState).symbol;
|
|
Symbol s;
|
|
s.lexem = lastAcceptingLexem;
|
|
s.token = dfa.at(lastAcceptingState).symbol;
|
|
symbols << s;
|
|
}
|
|
if (ok)
|
|
*ok = true;
|
|
return symbols;
|
|
}
|
|
|
|
static QSet<InputType> determineMaxInputSet(const ConfigFile::Section §ion)
|
|
{
|
|
QSet<InputType> set;
|
|
|
|
QString inputTypeName;
|
|
|
|
foreach (const ConfigFile::Entry &entry, section)
|
|
if (entry.key == QLatin1String("InputType")) {
|
|
if (!inputTypeName.isEmpty()) {
|
|
qWarning("Error: InputType field specified multiple times in config file");
|
|
return QSet<InputType>();
|
|
}
|
|
inputTypeName = entry.value;
|
|
}
|
|
|
|
if (inputTypeName.isEmpty())
|
|
inputTypeName = "quint8";
|
|
|
|
if (inputTypeName == "quint8") {
|
|
for (int i = 1; i < 256; ++i)
|
|
set.insert(i);
|
|
} /* else if ### */
|
|
else {
|
|
qWarning("Error: Unknown input type '%s'", qPrintable(inputTypeName));
|
|
return QSet<InputType>();
|
|
}
|
|
|
|
return set;
|
|
}
|
|
|
|
static bool loadConfig(const QString &ruleFile, Config *cfg)
|
|
{
|
|
ConfigFile::SectionMap sections = ConfigFile::parse(ruleFile);
|
|
if (sections.isEmpty()) {
|
|
qWarning("Error parsing %s", qPrintable(ruleFile));
|
|
return false;
|
|
}
|
|
|
|
QSet<InputType> maxInputSet = determineMaxInputSet(sections.value("Options"));
|
|
if (maxInputSet.isEmpty())
|
|
return false;
|
|
|
|
Qt::CaseSensitivity cs = Qt::CaseInsensitive;
|
|
if (sections.value("Options").contains("case-sensitive"))
|
|
cs = Qt::CaseSensitive;
|
|
|
|
cfg->configSections = sections;
|
|
cfg->caseSensitivity = cs;
|
|
cfg->className = sections.value("Options").value("classname", "Scanner");
|
|
cfg->maxInputSet = maxInputSet;
|
|
cfg->ruleFile = ruleFile;
|
|
return true;
|
|
}
|
|
|
|
static DFA generateMachine(const Config &cfg)
|
|
{
|
|
if (cfg.cache) {
|
|
QFileInfo ruleInfo(cfg.ruleFile);
|
|
QFileInfo cacheInfo(ruleInfo.baseName() + ".dfa");
|
|
if (cacheInfo.exists()
|
|
&& cacheInfo.lastModified() > ruleInfo.lastModified()) {
|
|
QFile f(cacheInfo.absoluteFilePath());
|
|
f.open(QIODevice::ReadOnly);
|
|
QDataStream stream(&f);
|
|
DFA machine;
|
|
stream >> machine;
|
|
return machine;
|
|
}
|
|
}
|
|
|
|
QMap<QString, NFA> macros;
|
|
foreach (ConfigFile::Entry e, cfg.configSections.value("Macros")) {
|
|
int errCol = 0;
|
|
if (cfg.debug)
|
|
qDebug() << "parsing" << e.value;
|
|
NFA nfa = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol);
|
|
if (nfa.isEmpty()) {
|
|
qWarning("Parse error in line %d column %d", e.lineNumber, errCol);
|
|
return DFA();
|
|
}
|
|
macros.insert(e.key, nfa);
|
|
}
|
|
|
|
if (!cfg.configSections.contains("Tokens")) {
|
|
qWarning("Rule file does not contain a [Tokens] section!");
|
|
return DFA();
|
|
}
|
|
|
|
QVector<NFA> tokens;
|
|
|
|
foreach (ConfigFile::Entry e, cfg.configSections.value("Tokens")) {
|
|
int errCol = 0;
|
|
if (cfg.debug)
|
|
qDebug() << "parsing" << e.value;
|
|
NFA tok = RE2NFA(macros, cfg.maxInputSet, cfg.caseSensitivity).parse(e.value, &errCol);
|
|
if (tok.isEmpty()) {
|
|
qWarning("Parse error in line %d column %d while parsing token %s", e.lineNumber, errCol, e.key.toLocal8Bit().constData());
|
|
return DFA();
|
|
}
|
|
tok.setTerminationSymbol(e.key);
|
|
tokens.append(tok);
|
|
}
|
|
|
|
NFA giganticStateMachine;
|
|
foreach (NFA nfa, tokens)
|
|
if (giganticStateMachine.isEmpty())
|
|
giganticStateMachine = nfa;
|
|
else
|
|
giganticStateMachine = NFA::createAlternatingNFA(giganticStateMachine, nfa);
|
|
|
|
DFA result = giganticStateMachine.toDFA().minimize();
|
|
if (cfg.cache) {
|
|
QFileInfo ruleInfo(cfg.ruleFile);
|
|
QFileInfo cacheInfo(ruleInfo.baseName() + ".dfa");
|
|
QFile f(cacheInfo.absoluteFilePath());
|
|
f.open(QIODevice::WriteOnly | QIODevice::Truncate);
|
|
QDataStream stream(&f);
|
|
stream << result;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
#if !defined(AUTOTEST)
|
|
int main(int argc, char **argv)
|
|
{
|
|
QCoreApplication app(argc, argv);
|
|
QString ruleFile;
|
|
Config cfg;
|
|
|
|
const QStringList arguments = app.arguments().mid(1);
|
|
cfg.debug = arguments.contains("-debug");
|
|
const bool testRules = arguments.contains("-test");
|
|
cfg.cache = arguments.contains("-cache");
|
|
|
|
foreach (const QString &arg, arguments)
|
|
if (!arg.startsWith(QLatin1Char('-'))) {
|
|
ruleFile = arg;
|
|
break;
|
|
}
|
|
|
|
if (ruleFile.isEmpty()) {
|
|
qWarning("usage: lexgen [-debug] [-cache] [-test] rulefile");
|
|
qWarning(" ");
|
|
qWarning(" the -test option will cause lexgen to interpret standard input");
|
|
qWarning(" according to the specified rules and print out pairs of token and");
|
|
qWarning(" lexical element");
|
|
return 1;
|
|
}
|
|
|
|
if (!loadConfig(ruleFile, &cfg))
|
|
return 1;
|
|
|
|
DFA machine = generateMachine(cfg);
|
|
if (machine.isEmpty())
|
|
return 1;
|
|
|
|
if (testRules) {
|
|
qWarning("Testing:");
|
|
QString input = QTextStream(stdin).readAll();
|
|
/*
|
|
qDebug() << "NFA has" << machine.stateCount() << "states";
|
|
qDebug() << "Converting to DFA... (this may take a while)";
|
|
DFA dfa = machine.toDFA();
|
|
qDebug() << "DFA has" << dfa.count() << "states";
|
|
qDebug() << "Minimizing...";
|
|
dfa = dfa.minimize();
|
|
qDebug() << "Minimized DFA has" << dfa.count() << "states";
|
|
*/
|
|
DFA dfa = machine;
|
|
if (cfg.debug)
|
|
qDebug() << "tokenizing" << input;
|
|
bool ok = false;
|
|
QList<Symbol> symbols = tokenize(dfa, input, &cfg, &ok);
|
|
if (symbols.isEmpty()) {
|
|
qWarning("No tokens produced!");
|
|
} else {
|
|
foreach (Symbol s, symbols)
|
|
qDebug() << s.token << ":" << s.lexem;
|
|
}
|
|
if (ok)
|
|
qDebug() << symbols.count() << "tokens produced.";
|
|
else
|
|
qDebug() << "Error while tokenizing!";
|
|
} else {
|
|
Generator gen(machine, cfg);
|
|
QTextStream(stdout)
|
|
<< gen.generate();
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
#endif
|
|
|