+ Added support for "-i" flags and case insensitive matching

+ Renamed Expr class to DisjList, which is more descriptive
  as to its function
+ Added ' and " to the list of allowed characters in an
  unquoted string.
+ Added updated sniffer rule grammar to doxygen for the
  Parser class


git-svn-id: file:///srv/svn/repos/haiku/trunk/current@685 a95241bf-73f2-0310-859d-f6bbb57e9c96
This commit is contained in:
Tyler Dauwalder 2002-08-10 09:36:00 +00:00
parent 0322e71475
commit 93d145bb01
14 changed files with 257 additions and 61 deletions

View File

@ -0,0 +1,31 @@
//----------------------------------------------------------------------
// This software is part of the OpenBeOS distribution and is covered
// by the OpenBeOS license.
//---------------------------------------------------------------------
/*!
\file sniffer/DisjList.h
Mime Sniffer Disjunction List declarations
*/
#ifndef _sk_sniffer_disj_list_h_
#define _sk_sniffer_disj_list_h_
class BPositionIO;

namespace Sniffer {

/*! \brief Abstract base class for a list of ORed (disjoined) patterns.

	The class itself only carries a case-insensitivity flag; the actual
	matching is left to subclasses through the pure virtual Sniff().
*/
class DisjList {
public:
	//! Creates the list with the case-insensitivity flag cleared.
	DisjList();
	virtual ~DisjList();

	//! Sniffs the given data stream; must be implemented by subclasses.
	virtual bool Sniff(BPositionIO *data) const = 0;

	//! Sets the case-insensitivity flag to \a how.
	void SetCaseInsensitive(bool how);

	//! Returns the current value of the case-insensitivity flag.
	bool IsCaseInsensitive();

protected:
	bool fCaseInsensitive;	//!< Flag readable by subclasses while sniffing
};

}	// namespace Sniffer
#endif // _sk_sniffer_disj_list_h_

View File

@ -23,7 +23,7 @@ class BString;
namespace Sniffer {
class Rule;
class Expr;
class DisjList;
class RPattern;
class Pattern;
@ -77,6 +77,7 @@ typedef enum TokenType {
Colon,
Divider,
Ampersand,
CaseInsensitiveFlag,
CharacterString,
Integer,
FloatingPoint
@ -192,6 +193,106 @@ private:
};
//! Handles parsing a sniffer rule, yielding either a parsed rule or a descriptive error message.
/*! A MIME sniffer rule is valid if it is well-formed with respect to the
following grammar and fulfills some further conditions listed thereafter:
<code>
Rule ::= LWS Priority LWS ConjList LWS
ConjList ::= DisjList (LWS DisjList)*
DisjList ::= "(" LWS PatternList LWS ")"
| "(" LWS RPatternList LWS ")"
| Range LWS "(" LWS PatternList LWS ")"
RPatternList ::= [Flag LWS] RPattern (LWS "|" LWS [Flag LWS] RPattern)*
PatternList ::= [Flag LWS] Pattern (LWS "|" LWS [Flag LWS] Pattern)*
RPattern ::= LWS Range LWS Pattern
Pattern ::= PString [ LWS "&" LWS Mask ]
Range ::= "[" LWS SDecimal [LWS ":" LWS SDecimal] LWS "]"
Priority ::= Float
Mask ::= PString
PString ::= HexLiteral | QuotedString | UnquotedString
HexLiteral ::= "0x" HexPair HexPair*
HexPair ::= HexChar HexChar
QuotedString ::= SingleQuotedString | DoubleQuotedString
SingleQuotedString ::= "'" SQChar+ "'"
DoubleQuotedString ::= '"' DQChar+ '"'
UnquotedString ::= EscapedChar UChar*
EscapedChar ::= OctalEscape | HexEscape | "\" Char
OctalEscape ::= "\" [[OctHiChar] OctChar] OctChar
HexEscape ::= "\x" HexPair
Flag ::= "-i"
SDecimal ::= [Sign] Decimal
Decimal ::= DecChar DecChar*
Float ::= Fixed [("E" | "e") SDecimal]
Fixed ::= SDecimal ["." [Decimal]] | [Sign] "." Decimal
Sign ::= "+" | "-"
PunctuationChar ::= "(" | ")" | "[" | "]" | "|" | "&" | ":"
OctHiChar ::= "0" | "1" | "2" | "3"
OctChar ::= OctHiChar | "4" | "5" | "6" | "7"
DecChar ::= OctChar | "8" | "9"
HexChar ::= DecChar | "a" | "b" | "c" | "d" | "e" | "f" | "A" | "B" | "C"
| "D" | "E" | "F"
Char ::= <any character>
SQChar ::= <Char except "\", "'"> | EscapedChar
DQChar ::= <Char except "\", '"'> | EscapedChar
UChar ::= <Char except "\", LWSChar, and PunctuationChar> | EscapedChar
LWS ::= LWSChar*
LWSChar ::= " " | TAB | LF
</code>
Conditions:
- If a mask is specified for a pattern, this mask must have the same
length as the pattern string.
- 0.0 <= Priority <= 1.0
- 0 <= Range begin <= Range end
Notes:
- If a case-insensitive flag ("-i") appears in front of any Pattern or RPattern
in a DisjList, case-insensitivity is applied to the entire DisjList.
Examples:
- 1.0 ('ABCD')
The file must start with the string "ABCD". The priority of the rule
is 1.0 (maximal).
- 0.8 [0:3] ('ABCD' | 'abcd')
The file must contain the string "ABCD" or "abcd" starting somewhere in
the first four bytes. The rule priority is 0.8.
- 0.5 ([0:3] 'ABCD' | [0:3] 'abcd' | [13] 'EFGH')
The file must contain the string "ABCD" or "abcd" starting somewhere in
the first four bytes or the string "EFGH" at position 13. The rule
priority is 0.5.
- 0.8 [0:3] ('ABCD' & 0xff00ffff | 'abcd' & 0xffff00ff)
The file must contain the string "A.CD" or "ab.d" (where "." is an
arbitrary character) starting somewhere in the first four bytes. The
rule priority is 0.8.
- 0.3 [10] ('mnop') ('abc') [20] ('xyz')
The file must contain the string 'abc' at the beginning of the file,
the string 'mnop' starting at position 10, and the string 'xyz'
starting at position 20. The rule priority is 0.3.
- 200e-3 (-i 'ab')
The file must contain the string 'ab', 'aB', 'Ab', or 'AB' at the
beginning of the file. The rule priority is 0.2.
Real examples:
- 0.20 ([0]"//" | [0]"/\*" | [0:32]"#include" | [0:32]"#ifndef"
| [0:32]"#ifdef")
text/x-source-code
- 0.70 ("8BPS \000\000\000\000" & 0xffffffff0000ffffffff )
image/x-photoshop
- 0.40 [0:64]( -i "&lt;HTML" | "&lt;HEAD" | "&lt;TITLE" | "&lt;BODY"
| "&lt;TABLE" | "&lt;!--" | "&lt;META" | "&lt;CENTER")
text/html
*/
class Parser {
public:
Parser();
@ -209,11 +310,11 @@ private:
// Parsing functions
void ParseRule(Rule *result);
double ParsePriority();
std::vector<Expr*>* ParseExprList();
Expr* ParseExpr();
std::vector<DisjList*>* ParseConjList();
DisjList* ParseDisjList();
Range ParseRange();
Expr* ParsePatternList(Range range);
Expr* ParseRPatternList();
DisjList* ParsePatternList(Range range);
DisjList* ParseRPatternList();
RPattern* ParseRPattern();
Pattern* ParsePattern();

View File

@ -30,11 +30,11 @@ public:
status_t InitCheck() const;
Err* GetErr() const;
bool Sniff(Range range, BPositionIO *data) const;
bool Sniff(Range range, BPositionIO *data, bool caseInsensitive) const;
status_t SetTo(const std::string &string, const std::string &mask);
private:
bool Sniff(off_t start, off_t size, BPositionIO *data) const;
bool Sniff(off_t start, off_t size, BPositionIO *data, bool caseInsensitive) const;
void SetStatus(status_t status, const char *msg = NULL);
void SetErrorMessage(const char *msg);

View File

@ -9,7 +9,7 @@
#ifndef _sk_sniffer_pattern_list_h_
#define _sk_sniffer_pattern_list_h_
#include <sniffer/Expr.h>
#include <sniffer/DisjList.h>
#include <sniffer/Range.h>
#include <vector>
@ -20,8 +20,10 @@ namespace Sniffer {
class Err;
class Pattern;
//! A list of patterns, all of which are to be searched over the same range.
class PatternList : public Expr {
/*! \brief A list of patterns, one of which must match for the list to match, all
of which are to be searched over the same range.
*/
class PatternList : public DisjList {
public:
PatternList(Range range);
virtual ~PatternList();

View File

@ -27,7 +27,7 @@ public:
status_t InitCheck() const;
Err* GetErr() const;
bool Sniff(BPositionIO *data) const;
bool Sniff(BPositionIO *data, bool caseInsensitive) const;
private:
Range fRange;
Pattern *fPattern;

View File

@ -9,7 +9,7 @@
#ifndef _sk_sniffer_r_pattern_list_h_
#define _sk_sniffer_r_pattern_list_h_
#include <sniffer/Expr.h>
#include <sniffer/DisjList.h>
#include <vector>
class BPositionIO;
@ -20,7 +20,7 @@ class Err;
class RPattern;
//! A list of patterns, each of which is to be searched over its own specified range.
class RPatternList : public Expr {
class RPatternList : public DisjList {
public:
RPatternList();
virtual ~RPatternList();

View File

@ -16,7 +16,7 @@ class BPositionIO;
namespace Sniffer {
class Expr;
class DisjList;
/*! \brief A priority and a list of expressions to be used for sniffing out the
type of an untyped file.
@ -34,10 +34,10 @@ private:
friend class Parser;
void Unset();
void SetTo(double priority, std::vector<Expr*>* list);
void SetTo(double priority, std::vector<DisjList*>* list);
double fPriority;
std::vector<Expr*> *fExprList;
std::vector<DisjList*> *fConjList; // A list of DisjLists to be ANDed
};
}

View File

@ -0,0 +1,31 @@
//----------------------------------------------------------------------
// This software is part of the OpenBeOS distribution and is covered
// by the OpenBeOS license.
//---------------------------------------------------------------------
/*!
\file DisjList.cpp
MIME sniffer Disjunction List class implementation
*/
#include <sniffer/DisjList.h>
using namespace Sniffer;
//! Creates a disjunction list whose case-insensitivity flag starts out cleared.
DisjList::DisjList() : fCaseInsensitive(false) {}
//! Destroys the list; the base class holds nothing that needs releasing.
DisjList::~DisjList()
{
}
void
DisjList::SetCaseInsensitive(bool how) {
fCaseInsensitive = how;
}
bool
DisjList::IsCaseInsensitive() {
return fCaseInsensitive;
}

View File

@ -7,7 +7,6 @@
MIME sniffer rule parser implementation
*/
//#include <sniffer/Expr.h>
#include <sniffer/Parser.h>
#include <sniffer/Pattern.h>
#include <sniffer/PatternList.h>
@ -374,6 +373,7 @@ TokenStream::SetTo(const std::string &string) {
case '+':
case '-':
charStr = ch;
lastChar = ch;
state = tsssLonelyMinusOrPlus;
break;
@ -570,6 +570,9 @@ TokenStream::SetTo(const std::string &string) {
} else if (ch == '.') {
charStr += ch;
state = tsssLonelyDecimalPoint;
} else if (ch == 'i' && lastChar == '-') {
AddToken(CaseInsensitiveFlag, startPos);
state = tsssStart;
} else
throw new Err(std::string("Sniffer pattern error: incomplete signed number"), pos);
break;
@ -615,8 +618,6 @@ TokenStream::SetTo(const std::string &string) {
AddString(charStr, startPos);
stream.Unget(); // In case it's punctuation, let tsssStart handle it
state = tsssStart;
} else if (ch == '\'' || ch == '"') {
throw new Err(std::string("Sniffer pattern error: illegal unquoted character '") + ch + "'", pos);
} else if (ch == 0x3 && stream.IsEmpty()) {
AddString(charStr, startPos);
keepLooping = false;
@ -976,6 +977,9 @@ Sniffer::tokenTypeToString(TokenType type) {
case Ampersand:
return "Ampersand";
break;
case CaseInsensitiveFlag:
return "CaseInsensitiveFlag";
break;
case CharacterString:
return "CharacterString";
break;
@ -1048,8 +1052,8 @@ Parser::ParseRule(Rule *result) {
// Priority
double priority = ParsePriority();
// Expression List
std::vector<Expr*>* list = ParseExprList();
// Conjunction List
std::vector<DisjList*>* list = ParseConjList();
result->SetTo(priority, list);
}
@ -1069,16 +1073,16 @@ Parser::ParsePriority() {
throw new Err("Sniffer pattern error: match level expected", t->Pos()); // Same as R5
}
std::vector<Expr*>*
Parser::ParseExprList() {
std::vector<Expr*> *list = new(nothrow) std::vector<Expr*>;
std::vector<DisjList*>*
Parser::ParseConjList() {
std::vector<DisjList*> *list = new(nothrow) std::vector<DisjList*>;
if (!list)
ThrowOutOfMemError(stream.Pos());
try {
// Expr+
// DisjList+
int count = 0;
while (true) {
Expr* expr = ParseExpr();
DisjList* expr = ParseDisjList();
if (!expr)
break;
else {
@ -1095,10 +1099,10 @@ Parser::ParseExprList() {
return list;
}
Expr*
Parser::ParseExpr() {
DisjList*
Parser::ParseDisjList() {
// If we've run out of tokens right now, it's okay, but
// we need to let ParseExprList() know what's up
// we need to let ParseConjList() know what's up
if (stream.IsEmpty())
return NULL;
@ -1109,10 +1113,14 @@ Parser::ParseExpr() {
// PatternList | RangeList
if (t1->Type() == LeftParen) {
const Token *t2 = stream.Get();
// Skip the case-insensitive flag, if there is one
const Token *tokenOfInterest = (t2->Type() == CaseInsensitiveFlag) ? stream.Get() : t2;
if (t2 != tokenOfInterest)
stream.Unget(); // We called Get() three times
stream.Unget();
stream.Unget();
// RangeList
if (t2->Type() == LeftBracket) {
if (tokenOfInterest->Type() == LeftBracket) {
return ParseRPatternList();
// PatternList
} else {
@ -1175,7 +1183,7 @@ Parser::ParseRange() {
throw range.GetErr();
}
Expr*
DisjList*
Parser::ParsePatternList(Range range) {
PatternList *list = new(nothrow) PatternList(range);
if (!list)
@ -1183,9 +1191,12 @@ Parser::ParsePatternList(Range range) {
try {
// LeftParen
stream.Read(LeftParen);
// Pattern, (Divider, Pattern)*
// [Flag] Pattern, (Divider, [Flag] Pattern)*
bool keepLooping = true;
while (true) {
// [Flag]
if (stream.CondRead(CaseInsensitiveFlag))
list->SetCaseInsensitive(true);
// Pattern
list->Add(ParsePattern());
// [Divider]
@ -1203,7 +1214,7 @@ Parser::ParsePatternList(Range range) {
return list;
}
Expr*
DisjList*
Parser::ParseRPatternList() {
RPatternList *list = new(nothrow) RPatternList();
if (!list)
@ -1211,9 +1222,12 @@ Parser::ParseRPatternList() {
try {
// LeftParen
stream.Read(LeftParen);
// RPattern, (Divider, RPattern)*
// [Flag] RPattern, (Divider, [Flag] RPattern)*
bool keepLooping = true;
while (true) {
// [Flag]
if (stream.CondRead(CaseInsensitiveFlag))
list->SetCaseInsensitive(true);
// RPattern
list->Add(ParseRPattern());
// [Divider]

View File

@ -80,17 +80,14 @@ Pattern::SetTo(const std::string &string, const std::string &mask) {
false if not.
*/
bool
Pattern::Sniff(Range range, BPositionIO *data) const {
// If our range contains negative values relative to the end of
// the file, convert them to positive values relative to the
// beginning of the file.
Pattern::Sniff(Range range, BPositionIO *data, bool caseInsensitive) const {
int32 start = range.Start();
int32 end = range.End();
off_t size = data->Seek(0, SEEK_END);
if (end >= size)
end = size-1;
end = size-1; // Don't bother searching beyond the end of the stream
for (int i = start; i <= end; i++) {
if (Sniff(i, size, data))
if (Sniff(i, size, data, caseInsensitive))
return true;
}
return false;
@ -99,12 +96,12 @@ Pattern::Sniff(Range range, BPositionIO *data) const {
// Assumes the BPositionIO object is in the correct
// position from which to sniff
bool
Pattern::Sniff(off_t start, off_t size, BPositionIO *data) const {
Pattern::Sniff(off_t start, off_t size, BPositionIO *data, bool caseInsensitive) const {
off_t len = fString.length();
char *buffer = new(nothrow) char[len+1];
if (buffer) {
ssize_t bytesRead = data->ReadAt(start, buffer, len);
// /todo If there are fewer bytes left in the data stream
// \todo If there are fewer bytes left in the data stream
// from the given position than the length of our data
// string, should we just return false (which is what we're
// doing now), or should we compare as many bytes as we
@ -113,10 +110,28 @@ Pattern::Sniff(off_t start, off_t size, BPositionIO *data) const {
return false;
else {
bool result = true;
for (int i = 0; i < len; i++) {
if ((fString[i] & fMask[i]) != (buffer[i] & fMask[i])) {
result = false;
break;
if (caseInsensitive) {
for (int i = 0; i < len; i++) {
char secondChar;
if ('A' <= fString[i] && fString[i] <= 'Z')
secondChar = 'a' + (fString[i] - 'A'); // Also check lowercase
else if ('a' <= fString[i] && fString[i] <= 'z')
secondChar = 'A' + (fString[i] - 'a'); // Also check uppercase
else
secondChar = fString[i]; // Check the same char twice as punishment for doing a case insensitive search ;-)
if (((fString[i] & fMask[i]) != (buffer[i] & fMask[i]))
&& ((secondChar & fMask[i]) != (buffer[i] & fMask[i])))
{
result = false;
break;
}
}
} else {
for (int i = 0; i < len; i++) {
if ((fString[i] & fMask[i]) != (buffer[i] & fMask[i])) {
result = false;
break;
}
}
}
return result;

View File

@ -16,7 +16,8 @@
using namespace Sniffer;
PatternList::PatternList(Range range)
: fRange(range)
: DisjList()
, fRange(range)
{
}
@ -49,7 +50,7 @@ PatternList::Sniff(BPositionIO *data) const {
std::vector<Pattern*>::const_iterator i;
for (i = fList.begin(); i != fList.end(); i++) {
if (*i)
result |= (*i)->Sniff(fRange, data);
result |= (*i)->Sniff(fRange, data, fCaseInsensitive);
}
return result;
}

View File

@ -53,9 +53,9 @@ RPattern::~RPattern() {
//! Sniffs the given data stream over the object's range for the object's pattern
bool
RPattern::Sniff(BPositionIO *data) const {
RPattern::Sniff(BPositionIO *data, bool caseInsensitive) const {
if (!data || InitCheck() != B_OK)
return false;
else
return fPattern->Sniff(fRange, data);
return fPattern->Sniff(fRange, data, caseInsensitive);
}

View File

@ -16,6 +16,7 @@
using namespace Sniffer;
RPatternList::RPatternList()
: DisjList()
{
}
@ -49,7 +50,7 @@ RPatternList::Sniff(BPositionIO *data) const {
std::vector<RPattern*>::const_iterator i;
for (i = fList.begin(); i != fList.end(); i++) {
if (*i)
result |= (*i)->Sniff(data);
result |= (*i)->Sniff(data, fCaseInsensitive);
}
return result;
}

View File

@ -8,7 +8,7 @@
*/
#include <sniffer/Err.h>
#include <sniffer/Expr.h>
#include <sniffer/DisjList.h>
#include <sniffer/Rule.h>
#include <DataIO.h>
@ -16,7 +16,7 @@ using namespace Sniffer;
Rule::Rule()
: fPriority(0.0)
, fExprList(NULL)
, fConjList(NULL)
{
}
@ -26,7 +26,7 @@ Rule::~Rule() {
status_t
Rule::InitCheck() const {
return fExprList ? B_OK : B_NO_INIT;
return fConjList ? B_OK : B_NO_INIT;
}
//! Returns the priority of the rule. 0.0 <= priority <= 1.0.
@ -42,8 +42,8 @@ Rule::Sniff(BPositionIO *data) const {
return false;
else {
bool result = true;
std::vector<Expr*>::const_iterator i;
for (i = fExprList->begin(); i != fExprList->end(); i++) {
std::vector<DisjList*>::const_iterator i;
for (i = fConjList->begin(); i != fConjList->end(); i++) {
if (*i)
result &= (*i)->Sniff(data);
}
@ -53,21 +53,21 @@ Rule::Sniff(BPositionIO *data) const {
void
Rule::Unset() {
if (fExprList){
delete fExprList;
fExprList = NULL;
if (fConjList){
delete fConjList;
fConjList = NULL;
}
}
//! Called by Parser::Parse() after successfully parsing a sniffer rule.
void
Rule::SetTo(double priority, std::vector<Expr*>* list) {
Rule::SetTo(double priority, std::vector<DisjList*>* list) {
Unset();
if (0.0 <= priority && priority <= 1.0)
fPriority = priority;
else
throw new Err("Sniffer pattern error: invalid priority", -1);
fExprList = list;
fConjList = list;
}