WingHexExplorer2/3rdparty/qcodeedit2/lib/qnfa/qnfa.cpp

942 lines
26 KiB
C++

/****************************************************************************
**
** Copyright (C) 2006-2009 fullmetalcoder <fullmetalcoder@hotmail.fr>
**
** This file is part of the Edyuk project <http://edyuk.org>
**
** This file may be used under the terms of the GNU General Public License
** version 3 as published by the Free Software Foundation and appearing in the
** file GPL.txt included in the packaging of this file.
**
** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
**
****************************************************************************/
#include "qnfa.h"
/*!
\file qnfa.cpp
\brief Implementation of the core QNFA syntax engine
*/
#include <QHash>
#include <QList>
// number of live QNFA nodes: incremented by the constructor, decremented by
// the destructor
quint32 QNFA::_count = 0;
// every node whose destructor has started; the QNFA and QNFABranch destructors
// consult it before deleting a linked node, so that a node reachable through
// several links is deleted only once
// NOTE(review): nothing in the visible part of this file ever empties this
// list - confirm whether a recycled address can be mistaken for a deleted node
static QList<QNFA *> _deleted;
/*!
    \brief Creates a plain character node without assertion, action or successor

    The global instance counter is bumped for every node created.
*/
QNFA::QNFA() : type(Char), assertion(0), actionid(0) {
    _count += 1;
    out.next = nullptr;
}
/*!
    \brief Destroys the node together with what it links to

    The node registers itself in the list of deleted nodes before touching
    its links, so that any destructor reached from here skips it.

    NOTE(review): the branch of a context-begin node is deleted without any
    "already deleted" check, while sharedContext() makes two nodes point to
    the same branch - confirm that such a branch cannot be freed twice.
*/
QNFA::~QNFA() {
    // some nodes are shared: record this one first so it is freed only once
    _deleted.append(this);
    --_count;

    tree.clear();

    if (type & CxtBeg) {
        if (out.branch) {
            delete out.branch;
            out.branch = nullptr;
        }
    }

    // follow the chain unless the successor is already gone (or being deleted)
    QNFA *successor = out.next;
    if (!successor)
        return;
    if (_deleted.contains(successor))
        return;

    delete successor;
    out.next = nullptr;
}
/*!
    \brief Deletes every node of the branch that has not been deleted yet
*/
QNFABranch::~QNFABranch() {
    for (int pos = 0; pos < count(); ++pos) {
        QNFA *child = at(pos);

        // skip empty slots and nodes already recorded as deleted
        if (!child || _deleted.contains(child))
            continue;

        delete child;
        (*this)[pos] = nullptr;
    }
}
/*!
    \return true if \a c is a letter, a number or an underscore
*/
inline bool isWord(QChar c) {
    if (c.unicode() == '_')
        return true;

    return c.isLetterOrNumber();
}
/*!
    \brief Tests a single character against a single NFA node

    \param cc character to test
    \param chain node providing the character set and the class assertions
    \return whether the node accepts \a cc

    A character is a "hit" when it is listed in the node's set or when it
    belongs to one of the classes requested through the assertion flags
    (Digit, NonDigit, Space, NonSpace, Word, NonWord). A set whose first
    element is NUL is negated (sequence() stores NUL for a leading '^'): the
    result is then inverted, i.e. hits are rejected and everything else is
    accepted.

    Each class is tested on its own. The previous nesting treated digit,
    space and word as mutually exclusive, so a digit was never tested against
    Word or NonSpace and a space was never tested against NonWord.
*/
static bool match(QChar cc, QNFA *chain) {
    const quint16 cu = cc.unicode();
    const bool hasSet = chain->c.count();

    // value returned on a hit; flipped for negated sets
    const bool hit = !(hasSet && (chain->c.at(0) == '\0'));

    if (hasSet && chain->c.contains(cu))
        return hit;

    const int ass = chain->assertion;

    if (ass) {
        const bool digit = cc.isDigit();
        const bool space = cc.isSpace();
        const bool word = cc.isLetterOrNumber() || (cu == '_');

        if (ass & (digit ? Digit : NonDigit))
            return hit;

        if (ass & (space ? Space : NonSpace))
            return hit;

        if (ass & (word ? Word : NonWord))
            return hit;
    }

    return !hit;
}
/*!
\brief Runs the lexer over one chunk of text and reports matches through \a notify

\param lexer match context; its current context and its stack of parent
contexts are updated in place whenever a context is entered or left. Nothing
is done when it, or its current context, is null.
\param d first character of the text to scan
\param length number of characters to scan
\param notify notifier, taken by value and cleared on entry

At each position the character tree of the current context is tried first
(this is skipped when both wPrev and wCur are set, i.e. inside a word), then
every sub-NFA of the context's branch, in order. A tree match whose action has
bit 0x40000000 set is held back and only reported when no sub-NFA match is
accepted. The content of a context is reported with the context's action id
OR'ed with 0x80000000.

Once the text is consumed, and unless the last context end was escaped,
contexts flagged StayOnLine are removed from the context stack.
*/
void match(QNFAMatchContext *lexer, const QChar *d, int length,
QNFAMatchNotifier notify) {
if (!lexer || !lexer->context) {
// qWarning("get off you scum!");
return;
}
// restore message buffering
notify.clear();
// olvls : number of parent contexts on entry
// nlvls : contexts entered during this call and not left yet
// lvls  : parent contexts present on entry that are still open
int olvls = lexer->parents.count(), nlvls = 0, lvls = olvls;
if (lvls)
notify.startBuffering();
//
quint16 c = 0;
const QChar *di = d;
QNFA *chain = 0, *start = 0;
int index = 0, lastCxt = 0, len, idx;
bool bFound, bEscape = false, bEscaped = false;
bool wPrev = false, wCur = false;
while (index < length) {
bFound = false;
bEscaped = false;
// bEscape &= !lexer->meaningless.contains(d[index].unicode());
// while ( lexer->meaningless.contains(d[index].unicode()) && ((index +
// 1) < length) )
// ++index;
if (index >= length)
break;
c = di->unicode();
wCur = isWord(*di);
// plainIndex stays at -1 unless a plain match is held back; plainMatch and
// plainLength are only meaningful when plainIndex != -1
int plainIndex = -1, plainMatch, plainLength;
// try fast plain matching
if (!(wPrev && wCur)) {
// qDebug("trying plain...");
// len = 0;
idx = index;
QCharTree::const_iterator it, match, end;
it = lexer->context->tree.constFind(c);
if (it != lexer->context->tree.constEnd()) {
// qDebug("plain on %c", c);
do {
++di;
++idx;
end = it->next.constEnd();
match = it->next.constFind(0);
if (idx < length) {
c = di->unicode();
it = it->next.constFind(c);
} else {
it = end;
}
if (it == end) {
if ((match != end) && !isWord(*di)) {
// word boundary found
// corresponding token end found
wPrev = isWord(*(di - 1));
bFound = true;
if (match->value.action & 0x40000000) {
// try regexps before notifying
plainIndex = index;
plainLength = idx - index;
plainMatch = match->value.action;
// qDebug("ambiguity.");
} else {
notify(index, idx - index, match->value.action);
index = idx;
}
// qDebug("next step : %c", d[index].toLatin1());
// bMonitor = true;
}
break;
}
} while (idx < length);
if (bFound) {
bEscape = false;
if (plainIndex == -1)
continue;
bFound = false;
}
// nothing was notified: move di back to the character at index (a held-back
// plain match, if any, is kept in plainIndex/plainLength/plainMatch)
di -= idx - index;
}
}
// fallback on regexp-like NFA-based matching
QNFABranch *children = lexer->context->out.branch;
if (children) {
// qDebug("trying %i sub nfas on %c", children->count(),
// d[index].toLatin1());
auto max = children->count();
for (decltype(max) i = 0; i < max; ++i) {
len = 0;
idx = index;
start = chain = children->at(i);
// qDebug("%ith attempt on %c", i, d[index + len].toLatin1());
while ((idx < length) || (chain->type & Match)) {
bEscaped = false;
if (chain->type & Match) {
if ((chain->assertion & WordEnd) && (idx < length) &&
isWord(*di) && isWord(*(di - 1))) {
// qDebug("end assertion failed...");
break;
}
// qDebug("matched to end");
if (chain->type & CxtBeg) {
// qDebug("entering context : 0x%x", chain);
++nlvls;
bool notifySub = notify.bufferLevel();
if (notifySub) {
// pop one message buffer
notify.stopBuffering();
}
// notify content of previous context until nest
notify(lastCxt, index - lastCxt,
lexer->context->actionid | 0x80000000);
if (notifySub) {
// notify sub matches so far to avoid tricky
// handling later on
notify.flush();
// notify.startBuffering();
}
// notify begin marker
notify(index, len,
start->actionid ? start->actionid
: chain->actionid);
// update context stack
lexer->parents.push(lexer->context);
lexer->context = chain;
// update nest index
lastCxt = idx;
// push a message buffer
notify.startBuffering();
} else if (chain->type & CxtEnd) {
// qDebug("leaving context :");
if (lexer->parents.isEmpty())
qFatal("context nesting problem");
if (bEscape) {
// not really end : escape found...
bEscape = false;
bEscaped = true;
} else {
if (nlvls)
--nlvls;
else
--lvls;
// pop one message buffer
notify.stopBuffering();
// notify context content from last nest
notify(lastCxt, index - lastCxt,
lexer->context->actionid | 0x80000000);
// flush sub matches
notify.flush();
// update context stack
lexer->context = lexer->parents.pop();
if (lexer->parents.count())
notify.startBuffering();
// update nest index
lastCxt = idx;
// notify end marker
notify(index, len, chain->actionid);
// qDebug("cxt notif...");
if (chain->type & Exclusive)
index = idx;
--index;
--di;
bFound = true;
break;
}
} else if (chain->type & CxtEsc) {
// qDebug("matched %s", qPrintable(QString(index,
// len)));
// notify(index, len, chain->actionid);
bEscape = !bEscape;
} else {
// qDebug("matched %s", qPrintable(QString(d +
// index, len)));
// a held-back plain match that is at least as long wins over this one
if (plainIndex != -1 && plainLength >= len) {
break;
}
notify(index, len, chain->actionid);
bEscape = false;
}
bFound = true;
index = idx;
--index;
--di;
// qDebug("next step : %c", d[index + 1].toLatin1());
// bMonitor = true;
break;
} else {
// "regular" nfa match (no match yet...)
if ((chain->assertion & WordStart) && (idx >= 1) &&
(isWord(*(di - 1)) && isWord(*di))) {
// qDebug("beg assertion failed...");
break;
}
QChar cc = *di;
bool found = match(cc, chain);
if (!(chain->assertion & ZeroOrOne) &&
!(chain->assertion & ZeroOrMore) && !found) {
// if ( cc.toLatin1() == ')' )
// qDebug("mismatch : %c != %c", cc.toLatin1(),
// chain->c.at(0));
break;
}
if (found) {
// qDebug("%c", d[index + len].toLatin1());
if ((chain->assertion & OneOrMore) ||
(chain->assertion & ZeroOrMore)) {
do {
++di;
++len;
++idx;
} while ((idx < length) && match(*di, chain));
} else {
++len;
++idx;
++di;
}
} else {
// qDebug("! %c", d[index + len].toLatin1());
}
chain = chain->out.next;
}
}
if (bFound)
break;
// this sub-NFA did not match: give back the len characters it consumed
di -= len;
}
}
if (!bFound) {
// no sub-NFA took over: report the held-back plain match, if there is one
if (plainIndex != -1) {
notify(plainIndex, plainLength, plainMatch);
index = plainIndex + plainLength;
di += plainLength;
continue;
}
bEscape = false;
//++index;
wPrev = wCur;
} else {
wPrev = isWord(*di);
}
++index;
++di;
}
// flush messages
if (!notify.bufferLevel())
return;
// qDebug("%i context nests", notify.bufferLevel());
// qDebug("[%i;+00[ : 0x%x", lastCxt, lexer->context->actionid |
// 0x80000000);
// pop down one buffer
notify.stopBuffering();
// notify overlapping context so far
notify(lastCxt, length - lastCxt, lexer->context->actionid | 0x80000000);
// notify sub matches
notify.flush();
// make sure we leave a blank notifier...
notify.clear();
// preserve escape power...
if (bEscaped)
return;
// some existing left AND new one(s)
if ((olvls == lvls) && nlvls)
++lvls;
// close stay-on-line contexts, if any
QStack<QNFA *>::iterator it = lexer->parents.begin() + lvls;
while (it != lexer->parents.end()) {
if ((*it)->type & StayOnLine) {
// qDebug("staid...");
it = lexer->parents.erase(it);
} else {
++it;
}
}
if ((lexer->context->type & StayOnLine) && nlvls && lexer->parents.count())
lexer->context = lexer->parents.pop();
}
/*!
    \brief Creates a root context node
    \return a new context-begin node owning a new, empty branch
*/
QNFA *lexer() {
    QNFA *root = new QNFA;

    root->out.branch = new QNFABranch;
    root->type = ContextBegin;

    return root;
}
/*!
    \brief Creates a context that reuses the branch of another context

    \param start pattern of the begin marker (see sequence())
    \param other context whose branch is shared, not copied
    \param cs whether the begin marker is case sensitive
    \return first node of the begin marker chain, which ends with the new
    context-begin node
*/
QNFA *sharedContext(const QString &start, QNFA *other, bool cs) {
    QNFA *tail = 0;
    QNFA *head = sequence(start.constData(), start.length(), &tail, cs);

    QNFA *cxt = new QNFA;
    cxt->type = ContextBegin;
    // same branch object as the other context
    cxt->out.branch = other->out.branch;

    tail->out.next = cxt;

    return head;
}
/*!
    \brief Creates a context delimited by a begin marker and an end marker

    \param start pattern of the begin marker (see sequence())
    \param stop pattern of the end marker
    \param action action id given to both the context-begin and the
    context-end node
    \param handler if not null, receives the context-begin node
    \param cs whether both markers are case sensitive
    \return first node of the begin marker chain

    The third parameter is unused.
*/
QNFA *context(const QString &start, const QString &stop, const QString &,
              int action, QNFA **handler, bool cs) {
    // begin marker, terminated by the context-begin node
    QNFA *startTail = 0;
    QNFA *startHead =
        sequence(start.constData(), start.length(), &startTail, cs);

    QNFA *opening = new QNFA;
    opening->type = ContextBegin;
    opening->actionid = action;
    opening->out.branch = new QNFABranch;

    if (handler)
        *handler = opening;

    startTail->out.next = opening;

    // end marker, terminated by the context-end node
    QNFA *stopTail = 0;
    QNFA *stopHead = sequence(stop.constData(), stop.length(), &stopTail, cs);

    QNFA *closing = new QNFA;
    closing->type = ContextEnd;
    closing->actionid = action;

    stopTail->out.next = closing;

    // the end marker is registered inside the new context
    addNFA(opening, stopHead);

    return startHead;
}
/*!
    \brief Adds a word (pattern delimited by word boundaries) to a context

    \param lexer context receiving the word; must be a context-begin node with
    a branch, otherwise nothing is done
    \param w word pattern (see sequence() for the syntax)
    \param action action id notified when the word is matched
    \param cs whether matching is case sensitive

    Case sensitive patterns without special characters go to the character
    tree of the context; everything else becomes an NFA chain guarded by
    WordStart / WordEnd assertions. Patterns that produce nothing (empty
    string, lone backslash, lone quantifier...) are ignored.
*/
void addWord(QNFA *lexer, const QString &w, int action, bool cs) {
    if (!lexer || !(lexer->type & CxtBeg) || !lexer->out.branch)
        return;

    // try using the fastest way if possible
    QString pt;

    if (plain(w, &pt) && cs) {
        // the tree overload reads the first character: skip empty words
        if (!pt.isEmpty())
            addWord(lexer->tree, pt, action, cs);

        return;
    }

    // fallback on (fast) regexp-like NFA-based semi-compiled parsing
    QNFA *end = 0;
    QNFA *word = sequence(w.constData(), w.length(), &end, cs);

    // sequence() yields no node for such patterns: nothing to register
    if (!word || !end)
        return;

    word->assertion |= WordStart;

    QNFA *nfa = new QNFA;
    nfa->type = Match;
    nfa->assertion = WordEnd;
    nfa->actionid = action;

    end->out.next = nfa;

    addNFA(lexer, word);
}
/*!
    \brief Adds a sequence (pattern without word boundary constraint) to a
    context

    \param lexer context receiving the sequence; must be a context-begin node
    with a branch, otherwise nothing is done
    \param w sequence pattern (see sequence() for the syntax)
    \param action action id notified when the sequence is matched
    \param cs whether matching is case sensitive

    Patterns that produce no node (e.g. an empty string) are ignored.
*/
void addSequence(QNFA *lexer, const QString &w, int action, bool cs) {
    if (!lexer || !(lexer->type & CxtBeg) || !lexer->out.branch) {
        return;
    }

    QNFA *end = 0;
    QNFA *seq = sequence(w.constData(), w.length(), &end, cs);

    // nothing was built: there is no tail to attach the match node to
    if (!seq || !end) {
        return;
    }

    QNFA *nfa = new QNFA;
    nfa->type = Match;
    nfa->actionid = action;

    end->out.next = nfa;

    addNFA(lexer, seq);
}
QNFA *sequence(const QChar *d, int length, QNFA **end, bool cs) {
QNFA *nfa, *set = 0, *prev = 0, *first = 0;
for (int i = 0; i < length; ++i) {
QChar c = d[i];
if (c == QLatin1Char('\\')) {
c = d[++i];
if (c == QLatin1Char('n')) {
c = '\n';
} else if (c == QLatin1Char('t')) {
c = '\t';
} else if (c == QLatin1Char('r')) {
c = '\r';
}
if (set) {
set->c << c.unicode();
} else {
nfa = new QNFA;
nfa->c << c.unicode();
if (prev)
prev->out.next = nfa;
prev = nfa;
}
} else if (c == QLatin1Char('$')) {
// char classes
c = d[++i];
if (set) {
if (c == QLatin1Char('s'))
set->assertion |= Space;
else if (c == QLatin1Char('S'))
set->assertion |= NonSpace;
else if (c == QLatin1Char('d'))
set->assertion |= Digit;
else if (c == QLatin1Char('D'))
set->assertion |= NonDigit;
else if (c == QLatin1Char('w'))
set->assertion |= Word;
else if (c == QLatin1Char('W'))
set->assertion |= NonWord;
else
set->c << QLatin1Char('$').unicode() << c.unicode();
} else {
nfa = new QNFA;
if (c == QLatin1Char('s'))
nfa->assertion |= Space;
else if (c == QLatin1Char('S'))
nfa->assertion |= NonSpace;
else if (c == QLatin1Char('d'))
nfa->assertion |= Digit;
else if (c == QLatin1Char('D'))
nfa->assertion |= NonDigit;
else if (c == QLatin1Char('w'))
nfa->assertion |= Word;
else if (c == QLatin1Char('W'))
nfa->assertion |= NonWord;
else {
nfa->c << QLatin1Char('$').unicode();
--i;
}
if (prev)
prev->out.next = nfa;
prev = nfa;
}
} else if (c == QLatin1Char('[')) {
if (set) {
set->c << c.unicode();
// qWarning("Nested sets are not supported (and useless
// BTW)...");
continue;
}
// enter set...
set = new QNFA;
// qDebug("set start");
} else if (c == QLatin1Char(']')) {
if (!set) {
qWarning("Unmatched set closing marker");
continue;
}
// leave set...
if (prev)
prev->out.next = set;
prev = set;
set = 0;
// qDebug("set end");
/*
} else if ( c == QLatin1Char('(') ) {
// allow trivial groups
QList<int> cuts;
int idx = i, nest = 1;
while ( nest && (++idx < length) )
{
if ( d[idx] == '\\' )
{
++idx;
continue;
} else if ( d[idx] == '(' ) {
++nest;
} else if ( d[idx] == ')' ) {
--nest;
} else if ( (nest == 1) && (d[idx] == '|') ) {
cuts << idx;
} else if ( d[idx] == '[' ) {
while ( ++idx < length )
{
if ( d[idx] == '\\' )
{
++idx;
continue;
} else if ( d[idx] == ']' ) {
break;
}
}
}
}
*/
} else if (set) {
if ((c == QLatin1Char('^')) && !set->c.count()) {
set->c << '\0';
continue;
}
quint16 prev =
set->c.count() ? set->c.at(set->c.length() - 1) : '\0';
if ((c == '-') && (prev != '\0') && ((i + 1) < length)) {
quint16 cse = d[++i].unicode();
for (quint16 csi = prev + 1; csi <= cse; ++csi) {
QChar csc(csi);
if (c.isLetter() && !cs)
set->c << c.toLower().unicode()
<< c.toUpper().unicode();
else
set->c << csi;
}
} else {
if (c.isLetter() && !cs)
set->c << c.toLower().unicode() << c.toUpper().unicode();
else
set->c << c.unicode();
}
// qDebug("set << %c", c.toLatin1());
} else if (c == QLatin1Char('+')) {
if (prev)
prev->assertion |= OneOrMore;
} else if (c == QLatin1Char('*')) {
if (prev)
prev->assertion |= ZeroOrMore;
} else if (c == QLatin1Char('?')) {
if (prev)
prev->assertion |= ZeroOrOne;
} else {
nfa = new QNFA;
if (c.isLetter() && !cs) {
nfa->c << c.toLower().unicode() << c.toUpper().unicode();
} else {
nfa->c << c.unicode();
}
if (prev)
prev->out.next = nfa;
prev = nfa;
}
if (!first)
first = prev;
}
if (end) {
*end = prev;
}
return first;
}
/*!
    \brief Checks whether a pattern is plain text, i.e. free of unescaped
    special characters ([ ] + * ? $)

    \param word pattern to examine
    \param dest if not null, receives the pattern with its escape sequences
    resolved (\\n, \\t, \\r, any other escaped character taken literally); it
    is left empty when the pattern is not plain
    \return true if the pattern is plain text

    An escaped character is always skipped, so the result does not depend on
    whether \a dest is provided. A trailing backslash is dropped.
*/
bool plain(const QString &word, QString *dest) {
    if (dest)
        dest->clear();

    for (int i = 0; i < word.length(); i++) {
        QChar c = word.at(i);

        if (c == QLatin1Char('\\')) {
            // trailing backslash: nothing to escape
            if ((i + 1) >= word.length())
                continue;

            // consume the escaped character even when there is no output
            c = word.at(++i);

            if (!dest)
                continue;

            if (c == QLatin1Char('n'))
                dest->append(QLatin1Char('\n'));
            else if (c == QLatin1Char('t'))
                dest->append(QLatin1Char('\t'));
            else if (c == QLatin1Char('r'))
                dest->append(QLatin1Char('\r'));
            else
                dest->append(c);
        } else if (c == QLatin1Char('[') || c == QLatin1Char(']') ||
                   c == QLatin1Char('+') || c == QLatin1Char('*') ||
                   c == QLatin1Char('?') || c == QLatin1Char('$')) {
            if (dest)
                dest->clear();

            return false;
        } else {
            if (dest)
                dest->append(c);
        }
    }

    return true;
}
/*!
    \brief Adds a plain word to a character tree

    \param tree tree receiving the word
    \param w word to add, one tree level per character
    \param action action id stored under key 0 of the node reached by the last
    character
    \param cs must be true; nothing is added otherwise (the case insensitive
    variant of this function was disabled and never ran)

    Empty words are ignored.
*/
void addWord(QCharTree &tree, const QString &w, int action, bool cs) {
    // the first character is read below: an empty word has none
    if (!cs || w.isEmpty())
        return;

    quint16 u = w.at(0).unicode();
    QCharTree::iterator it = tree.find(u);

    if (it == tree.end())
        it = tree.insert(u, QCharTreeNode(u));

    // walk down the tree, creating the missing levels
    for (int i = 1; i < w.length(); ++i) {
        u = w.at(i).unicode();

        QCharTree::iterator child = it->next.find(u);

        if (child == it->next.end())
            child = it->next.insert(u, QCharTreeNode(u));

        it = child;
    }

    // add action handler
    QCharTreeNode node;
    node.value.action = action;

    it->next[0] = node;
}
/*!
    \brief Squeezes the character tree of a node and of the nodes it links to

    Nodes with the Match bit are followed through their branch, all others
    through their successor.
*/
void squeeze(QNFA *nfa) {
    squeeze(nfa->tree);

    if (!(nfa->type & Match)) {
        if (nfa->out.next)
            squeeze(nfa->out.next);

        return;
    }

    QNFABranch *kids = nfa->out.branch;

    if (!kids)
        return;

    for (int k = 0; k < kids->count(); ++k)
        squeeze(kids->at(k));
}
/*!
    \brief Squeezes a tree level and, recursively, every level below it
*/
void squeeze(QCharTreeLevel &lvl) {
    lvl.squeeze();

    for (QCharTreeLevel::iterator node = lvl.begin(); node != lvl.end();
         ++node)
        squeeze(node->next);
}