#ifndef __CREGEXP__
#define __CREGEXP__
#include<unicode/String.h>
#include<unicode/CharacterClass.h>
/**
@addtogroup cregexp Regular Expressions
Colorer Regular Expressions (cregexp) class implementation.
*/
/// with this define class uses extended command set for
/// colorer compatibility mode
/// if you undef it, it will compile stantard set for
/// regexp compatibility mode
#define COLORERMODE
/// use hashes for saving named brackets
//#define NAMED_MATCHES_IN_HASH
/// check duplicate brackets
//#define CHECKNAMES
#if defined COLORERMODE && defined NAMED_MATCHES_IN_HASH
#error COLORERMODE && NAMED_MATCHES_IN_HASH not realyzed yet
#endif
/// numeric matches num
#define MATCHES_NUM 0x10
#if !defined NAMED_MATCHES_IN_HASH
// number of named brackets (access through SMatches.ns)
#define NAMED_MATCHES_NUM 0x10
#endif
#ifdef NAMED_MATCHES_IN_HASH
struct SMatch{
int s,e;
};
// you can redefine this class
typedef class SMatchHash{
public:
SMatch *setItem(const String *name, SMatch &smatch){return null;};
SMatch *getItem(const String *name){return null;};
}*PMatchHash;
#endif
enum EOps
{
ReBlockOps,
ReMul, // *
RePlus, // +
ReQuest, // ?
ReNGMul, // *?
ReNGPlus, // +?
ReNGQuest, // ??
ReRangeN, // {n,}
ReRangeNM, // {n,m}
ReNGRangeN, // {n,}?
ReNGRangeNM, // {n,m}?
ReOr, // |
ReBehind, // ?#n
ReNBehind, // ?~n
ReAhead, // ?=
ReNAhead, // ?!
ReSymbolOps,
ReEmpty,
ReMetaSymb, // \W \s \d ...
ReSymb, // a b c ...
ReWord, // word...
ReEnum, // []
ReNEnum, // [^]
ReBrackets, // (...)
ReNamedBrackets, // (?{name} ...)
#ifdef COLORERMODE
ReBkTrace, // \yN
ReBkTraceN, // \YN
ReBkTraceName, // \y{name}
ReBkTraceNName, // \Y{name}
#endif
ReBkBrack, // \N
ReBkBrackName // \p{name}
};
enum EMetaSymbols
{
ReBadMeta,
ReAnyChr, // .
ReSoL, // ^
#ifdef COLORERMODE
ReSoScheme, // ~
#endif
ReEoL, // $
ReDigit, // \d
ReNDigit, // \D
ReWordSymb, // \w
ReNWordSymb, // \W
ReWSpace, // \s isWhiteSpace()
ReNWSpace, // \S
ReUCase, // \u
ReNUCase, // \l
ReWBound, // \b
ReNWBound, // \B
RePreNW, // \c
#ifdef COLORERMODE
ReStart, // \m
ReEnd, // \M
#endif
ReChrLast,
};
enum EError
{
EOK = 0, EERROR, ESYNTAX, EBRACKETS, EENUM, EOP
};
/// @ingroup cregexp
struct SMatches
{
int s[MATCHES_NUM];
int e[MATCHES_NUM];
int cMatch;
#if !defined NAMED_MATCHES_IN_HASH
int ns[NAMED_MATCHES_NUM];
int ne[NAMED_MATCHES_NUM];
int cnMatch;
#endif
};
/** Regular expressions internal tree node.
@ingroup cregexp
*/
class SRegInfo
{
public:
SRegInfo();
~SRegInfo();
#include<common/MemoryOperator.h>
EOps op;
union{
EMetaSymbols metaSymbol;
wchar symbol;
String *word;
CharacterClass *charclass;
SRegInfo *param;
}un;
#if defined NAMED_MATCHES_IN_HASH
String *namedata;
#endif
int oldParse;
int param0, param1;
int s, e;
SRegInfo *parent;
SRegInfo *next;
SRegInfo *prev;
};
/** Regular Expression compiler and matcher.
Colorer regular expressions library cregexp.
\par 1. Features.
\par 1.1. Colorer Unicode classes.
- Unicode Consortium regexp level 1 support.
All characters are treated as independent 16-bit units.
The result of RE is independent of current locale.
- Unicode syntax extensions:
- Unicode general category char class:
- [{L}{Nd}] - all letters and decimal digits,
- [{ALL}] - as '.',
- [{ASSIGNED}] - all assigned unicode characters,
- [{UNASSIGNED}] - all unassigned unicode characters.
- Char classes substraction unicode extension:
- [{ASSIGNED}-[{Lu}]-[{Ll}]] - all assigned characters except,
- upper and lower case characters.
- Char classes connection syntax:
- [{Lu}[{Ll}]] - upper and lower case characters.
- Char classes intersection syntax:
- [{ALL}&&[{L}]] - only Letter characters.
- Character reference syntax: \\x{2028} \\x0A as in Perl.
- Unicode form \\u2028 is unused (\\u - upper case char).
\par 1.2. Extensions.
- Bracket extensions:
- (?{name} pattern ) - named bracket,
- \\p{name} - named bracket reference.
- (?{} pattern ) - no capturing bracket as (?: pattern ) in Perl.
- Look Ahead/Backward:
- pattern?= as Perl's (?=pattern)
- pattern?! as Perl's (?!pattern)
- pattern?#N - N symbols backward look for pattern
- pattern?~N - N symbols backward look for no pattern
- Colorer library extensions:
- \\m \\M - sets new start and end of zero(default) bracket.
- \\yN \\YN \\y{name} \\Y{name} - back reference into another RE's bracket.
\par 1.3. Perl compatibility.
- Modifiers //ismx
- \\ p{name} - back reference to named bracket (but not named property as in Perl!)
- No POSIX character classes support.
\par 2. Dislikes:
\par 2.1. According to Unicode RE level 1 support:
- No surrogate symbols support,
- No string length changes on case mappings (only 1 <-> 1 mappings),
\par 2.2. Algorithmic problems:
- Stack recursion implementation.
@ingroup cregexp
*/
class CRegExp
{
public:
/**
Empty constructor. No RE tree is builded with this constructor.
Use #setRE method to change pattern.
*/
CRegExp();
/**
Constructs regular expression and compile it with @c text pattern.
*/
CRegExp(const String *text);
~CRegExp();
/**
Is compilied RE well-formed.
*/
bool isOk();
/**
Returns information about RE compilation error.
*/
EError getError();
/**
Tells RE parser, that it must make moves on tested string while RE matching.
*/
bool setPositionMoves(bool moves);
/**
Returns count of named brackets.
*/
int getBracketNo(const String *brname);
/**
Returns named bracked name by it's index.
*/
String *getBracketName(int no);
#ifdef COLORERMODE
bool setBackRE(CRegExp *bkre);
/**
Changes RE object, used for backreferences with named \y{} \Y{} operators.
*/
bool setBackTrace(const String *str, SMatches *trace);
/**
Returns current RE object, used for backreferences with \y \Y operators.
*/
bool getBackTrace(const String **str, SMatches **trace);
#endif
/**
Compiles specified regular expression and drops all
previous structures.
*/
bool setRE(const String *re);
#ifdef NAMED_MATCHES_IN_HASH
/** Runs RE parser against input string @c str
*/
bool parse(const String *str, SMatches *mtch, SMatchHash *nmtch = null);
/** Runs RE parser against input string @c str
*/
bool parse(const String *str, int pos, int eol, SMatches *mtch, SMatchHash *nmtch = null, int soscheme = 0, int moves = -1);
#else
/** Runs RE parser against input string @c str
*/
bool parse(const String *str, SMatches *mtch);
/** Runs RE parser against input string @c str
*/
bool parse(const String *str, int pos, int eol, SMatches *mtch, int soscheme = 0, int moves = -1);
#endif
private:
bool ignoreCase, extend, positionMoves, singleLine, multiLine;
SRegInfo *tree_root;
EError error;
wchar firstChar;
EMetaSymbols firstMetaChar;
#ifdef COLORERMODE
CRegExp *backRE;
const String *backStr;
SMatches *backTrace;
int schemeStart;
#endif
bool startChange, endChange;
const String *global_pattern;
int end;
SMatches *matches;
int cMatch;
#if !defined NAMED_MATCHES_IN_HASH
String *(brnames[NAMED_MATCHES_NUM]);
int cnMatch;
#else
SMatchHash *namedMatches;
#endif
void init();
EError setRELow(const String &re);
EError setStructs(SRegInfo *&, const String &expr, int &endPos);
void optimize();
bool quickCheck(int toParse);
bool isWordBoundary(int &toParse);
bool isNWordBoundary(int &toParse);
bool checkMetaSymbol(EMetaSymbols metaSymbol, int &toParse);
bool lowParse(SRegInfo *re, SRegInfo *prev, int toParse);
bool parseRE(int toParse);
};
#endif
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is the Colorer Library.
*
* The Initial Developer of the Original Code is
* Cail Lomecb <cail@nm.ru>.
* Portions created by the Initial Developer are Copyright (C) 1999-2003
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */