haiku/src/kits/shared/RegExp.cpp
Adrien Destugues 831819980e RegExp: fix match count computation
* There actually is a way to count the matches, so use it instead of
attempting to guess
 * In some cases (when using optional groups (xxx)?, for example), there
may be a non-matching group (with offsets set to -1) and matching groups
after it, so the binary search wasn't quite working
 * Instead, we always return the number of capturing groups in
the given expression, which is the maximal number of matches. Some (or
all) of these may not have any content. We do return 0 matches on any
error, including when the regular expression didn't match anything.
2013-10-16 21:01:42 +02:00

372 lines
6.4 KiB
C++

/*
* Copyright 2013, Ingo Weinhold, ingo_weinhold@gmx.de.
* Copyright 2013, Rene Gollent, rene@gollent.com.
* Distributed under the terms of the MIT License.
*/
#include <RegExp.h>
#include <new>
#include <regex.h>
#include <String.h>
#include <Referenceable.h>
// #pragma mark - RegExp::Data
// Holder for one compiled POSIX regular expression, derived from
// BReferenceable so that RegExp objects can share it through
// AcquireReference()/ReleaseReference().
// fError records the outcome of construction: 0 means fCompiledExpression
// was successfully compiled by regcomp(); any other value means the object
// holds no usable expression.
struct RegExp::Data : public BReferenceable {
// Compiles `pattern`.
// - pattern: NUL-terminated pattern text; it is dereferenced without a
//   NULL check.
// - patternType: when PATTERN_TYPE_WILDCARD, the pattern is first translated
//   from shell wildcard syntax into an extended regular expression; any
//   other value passes the pattern to regcomp() unchanged.
// - caseSensitive: when false, REG_ICASE is added to the compile flags.
// On a translation error the constructor returns early with fError set to a
// REG_* code and regcomp() is never called.
Data(const char* pattern, PatternType patternType, bool caseSensitive)
:
BReferenceable()
{
// convert the shell pattern to a regular expression
// patternString lives for the whole constructor body, so the pointer
// taken from it below is still valid when regcomp() runs.
BString patternString;
if (patternType == PATTERN_TYPE_WILDCARD) {
while (*pattern != '\0') {
char c = *pattern++;
switch (c) {
case '?':
// shell "any single character" -> regex '.'
patternString += '.';
continue;
case '*':
// shell "any sequence" -> regex ".*"
patternString += ".*";
continue;
case '[':
{
// find the matching ']' first
// Hitting the terminating NUL before a ']' means the bracket
// expression is unterminated.
const char* end = pattern;
while (*end != ']') {
if (*end++ == '\0') {
fError = REG_EBRACK;
return;
}
}
if (pattern == end) {
// Empty bracket expression. It will never match
// anything. Strictly speaking this is not
// considered an error, but we handle it like one.
fError = REG_EBRACK;
return;
}
patternString += '[';
// We need to avoid "[." ... ".]", "[=" ... "=]", and
// "[:" ... ":]" sequences, since those have special
// meaning in regular expressions. If we encounter
// a '[' followed by either of '.', '=', or ':', we
// replace the '[' by "[.[.]".
while (pattern < end) {
c = *pattern++;
if (c == '[' && pattern < end) {
switch (*pattern) {
case '.':
case '=':
case ':':
patternString += "[.[.]";
// This continue targets the enclosing
// while loop: the '[' has been replaced, the
// following character is copied on the next
// iteration.
continue;
}
}
patternString += c;
}
// skip the closing ']' in the input and emit it
pattern++;
patternString += ']';
break;
}
case '\\':
{
// Quotes the next character. Works the same way for
// regular expressions.
// A backslash at the very end has nothing to quote.
if (*pattern == '\0') {
fError = REG_EESCAPE;
return;
}
patternString += '\\';
patternString += *pattern++;
break;
}
case '^':
case '.':
case '$':
case '(':
case ')':
case '|':
case '+':
case '{':
// need to be quoted
// These are ordinary characters in the wildcard pattern, so a
// backslash is emitted before them in the regular expression.
patternString += '\\';
// fall through
default:
patternString += c;
break;
}
}
// From here on compile the translated expression instead of the
// caller's wildcard pattern.
pattern = patternString.String();
}
int flags = REG_EXTENDED;
if (!caseSensitive)
flags |= REG_ICASE;
fError = regcomp(&fCompiledExpression, pattern, flags);
}
// Frees the compiled expression, but only if regcomp() succeeded; on the
// error paths fCompiledExpression does not hold a compiled expression.
~Data()
{
if (fError == 0)
regfree(&fCompiledExpression);
}
// Returns true when the pattern was translated and compiled without error.
bool IsValid() const
{
return fError == 0;
}
// Returns a pointer to the compiled expression. The destructor only treats
// it as compiled when IsValid() is true, so check that first.
const regex_t* CompiledExpression() const
{
return &fCompiledExpression;
}
private:
// 0 on success, otherwise a REG_* error code from the wildcard translation
// or from regcomp().
int fError;
// Storage handed to regcomp() and later to regfree().
regex_t fCompiledExpression;
};
// #pragma mark - RegExp::MatchResultData
// Holds the outcome of running one compiled expression against one string.
// Derived from BReferenceable so MatchResult copies can share it.
//
// MatchCount() is either 0 (no match, or an error such as running out of
// memory) or the number of capturing groups in the expression plus one: slot 0
// describes the whole match, slots 1..n the groups. Individual group slots
// may still hold the (-1, -1) range if that group did not take part in the
// match.
struct RegExp::MatchResultData : public BReferenceable {
	// Runs `compiledExpression` against `string`.
	// - compiledExpression: expression previously compiled with regcomp();
	//   must not be NULL.
	// - string: NUL-terminated subject, passed to regexec() as is.
	MatchResultData(const regex_t* compiledExpression, const char* string)
		:
		BReferenceable(),
		fMatchCount(0),
		fMatches(NULL)
	{
		// One slot per capturing group plus one for the whole match.
		const size_t slotCount = compiledExpression->re_nsub + 1;

		// Allocate without throwing, like the other allocations in this
		// file; an allocation failure is reported as "no match".
		regmatch_t* slots = new(std::nothrow) regmatch_t[slotCount];
		if (slots == NULL)
			return;

		if (regexec(compiledExpression, string, slotCount, slots, 0) != 0) {
			// No match (or an execution error): keep the empty state.
			delete[] slots;
			return;
		}

		// Publish the result only once it is known to be valid, so the
		// members never describe a half-initialized array.
		fMatchCount = slotCount;
		fMatches = slots;
	}

	~MatchResultData()
	{
		delete[] fMatches;
	}

	// Number of valid entries in Matches(); 0 if nothing matched.
	size_t MatchCount() const
	{
		return fMatchCount;
	}

	// Array of MatchCount() entries, or NULL if nothing matched.
	const regmatch_t* Matches() const
	{
		return fMatches;
	}

private:
	size_t		fMatchCount;
	regmatch_t*	fMatches;
};
// #pragma mark - RegExp
// Creates a RegExp with no compiled pattern attached (fData is NULL).
RegExp::RegExp()
	: fData(NULL)
{
}
// Creates a RegExp and immediately tries to compile `pattern` through
// SetPattern(). The result of SetPattern() is not reported from here; if it
// fails, fData simply stays NULL.
RegExp::RegExp(const char* pattern, PatternType patternType,
	bool caseSensitive)
	: fData(NULL)
{
	SetPattern(pattern, patternType, caseSensitive);
}
// Copy constructor: shares the compiled pattern of `other` (if it has one)
// and takes an additional reference to it.
RegExp::RegExp(const RegExp& other)
	: fData(other.fData)
{
	if (fData == NULL)
		return;

	fData->AcquireReference();
}
// Gives up this object's reference to the shared pattern data, if any.
RegExp::~RegExp()
{
	if (fData == NULL)
		return;

	fData->ReleaseReference();
}
// Replaces the current pattern with a newly compiled one.
// - pattern: NUL-terminated pattern text; NULL is rejected.
// - patternType: how to interpret the pattern (see RegExp::Data).
// - caseSensitive: whether matching distinguishes letter case.
// Returns true if the pattern was compiled and installed. On any failure
// (NULL pattern, out of memory, invalid pattern) false is returned and the
// object is left without a pattern; the previous pattern is always dropped.
bool
RegExp::SetPattern(const char* pattern, PatternType patternType,
	bool caseSensitive)
{
	// The old pattern is released up front, so a failed call leaves the
	// object empty rather than silently keeping the previous pattern.
	if (fData != NULL) {
		fData->ReleaseReference();
		fData = NULL;
	}

	// The Data constructor dereferences the pattern (wildcard conversion) or
	// passes it straight to regcomp(); neither tolerates NULL.
	if (pattern == NULL)
		return false;

	Data* newData = new(std::nothrow) Data(pattern, patternType, caseSensitive);
	if (newData == NULL)
		return false;

	// The BReference guard drops newData again if we bail out below.
	// NOTE(review): the `true` argument presumably means "adopt the existing
	// initial reference" - confirm against BReference.
	BReference<Data> dataReference(newData, true);
	if (!newData->IsValid())
		return false;

	fData = dataReference.Detach();
	return true;
}
// Matches `string` against the current pattern.
// Returns a MatchResult for which HasMatched() is false when no valid pattern
// is set, when `string` is NULL, when the result data cannot be allocated, or
// when the expression does not match.
RegExp::MatchResult
RegExp::Match(const char* string) const
{
	// regexec() requires a valid NUL-terminated string, so a NULL subject is
	// treated like a non-match instead of being passed through.
	if (!IsValid() || string == NULL)
		return MatchResult();

	// A NULL result from the nothrow allocation yields an empty MatchResult.
	MatchResultData* resultData = new(std::nothrow) MatchResultData(
		fData->CompiledExpression(), string);
	return MatchResult(resultData);
}
// Makes this object share the compiled pattern of `other`.
// Returns *this.
RegExp&
RegExp::operator=(const RegExp& other)
{
	// Both objects already reference the same data (this includes
	// self-assignment). Releasing first could drop the last reference and
	// destroy the data before it is re-acquired, so leave the reference count
	// untouched in this case.
	if (fData == other.fData)
		return *this;

	if (fData != NULL)
		fData->ReleaseReference();

	fData = other.fData;

	if (fData != NULL)
		fData->AcquireReference();

	return *this;
}
// #pragma mark - RegExp::MatchResult
// Creates an empty result: no data attached, so HasMatched() is false.
RegExp::MatchResult::MatchResult()
	: fData(NULL)
{
}
// Wraps `data` (may be NULL). No reference is acquired here; the pointer is
// stored as is.
RegExp::MatchResult::MatchResult(MatchResultData* data)
	: fData(data)
{
}
// Copy constructor: shares the result data of `other` (if any) and takes an
// additional reference to it.
RegExp::MatchResult::MatchResult(const MatchResult& other)
	: fData(other.fData)
{
	if (fData == NULL)
		return;

	fData->AcquireReference();
}
// Gives up this object's reference to the shared result data, if any.
RegExp::MatchResult::~MatchResult()
{
	if (fData == NULL)
		return;

	fData->ReleaseReference();
}
// Returns true if result data is attached and it recorded at least one match
// entry.
bool
RegExp::MatchResult::HasMatched() const
{
	if (fData == NULL)
		return false;

	return fData->MatchCount() != 0;
}
// Returns the start offset (rm_so) of the whole match, i.e. of match entry 0,
// or 0 if there is no match.
size_t
RegExp::MatchResult::StartOffset() const
{
	if (fData == NULL || fData->MatchCount() == 0)
		return 0;

	return fData->Matches()[0].rm_so;
}
// Returns the end offset (rm_eo) of the whole match, i.e. of match entry 0,
// or 0 if there is no match.
size_t
RegExp::MatchResult::EndOffset() const
{
	if (fData == NULL || fData->MatchCount() == 0)
		return 0;

	return fData->Matches()[0].rm_eo;
}
// Returns the number of group entries, which is the match count minus the
// entry for the whole match, or 0 if there is no match.
size_t
RegExp::MatchResult::GroupCount() const
{
	if (fData == NULL)
		return 0;

	const size_t entryCount = fData->MatchCount();
	if (entryCount == 0)
		return 0;

	// Entry 0 is the whole match and does not count as a group.
	return entryCount - 1;
}
// Returns the start offset (rm_so) of the group at `index` (0-based, so it
// maps to match entry index + 1), or 0 if there is no match or `index` is out
// of range.
// NOTE(review): a group that did not take part in the match presumably has
// rm_so == -1, which converts to a very large size_t here - callers should be
// prepared for that.
size_t
RegExp::MatchResult::GroupStartOffsetAt(size_t index) const
{
	if (fData == NULL)
		return 0;

	// Compare against the group count instead of computing index + 1 first:
	// index + 1 wraps to 0 for the largest size_t value, which would pass the
	// range check and return the whole-match entry.
	const size_t entryCount = fData->MatchCount();
	if (entryCount == 0 || index >= entryCount - 1)
		return 0;

	return fData->Matches()[index + 1].rm_so;
}
// Returns the end offset (rm_eo) of the group at `index` (0-based, so it maps
// to match entry index + 1), or 0 if there is no match or `index` is out of
// range.
// NOTE(review): a group that did not take part in the match presumably has
// rm_eo == -1, which converts to a very large size_t here - callers should be
// prepared for that.
size_t
RegExp::MatchResult::GroupEndOffsetAt(size_t index) const
{
	if (fData == NULL)
		return 0;

	// Compare against the group count instead of computing index + 1 first:
	// index + 1 wraps to 0 for the largest size_t value, which would pass the
	// range check and return the whole-match entry.
	const size_t entryCount = fData->MatchCount();
	if (entryCount == 0 || index >= entryCount - 1)
		return 0;

	return fData->Matches()[index + 1].rm_eo;
}
// Makes this object share the result data of `other`.
// Returns *this.
RegExp::MatchResult&
RegExp::MatchResult::operator=(const MatchResult& other)
{
	// Both objects already reference the same data (this includes
	// self-assignment). Releasing first could drop the last reference and
	// destroy the data before it is re-acquired, so leave the reference count
	// untouched in this case.
	if (fData == other.fData)
		return *this;

	if (fData != NULL)
		fData->ReleaseReference();

	fData = other.fData;

	if (fData != NULL)
		fData->AcquireReference();

	return *this;
}