//---------------------------------------------------------------------- // This software is part of the OpenBeOS distribution and is covered // by the MIT License. //--------------------------------------------------------------------- /*! \file sniffer/Parser.h MIME sniffer rule parser declarations */ #ifndef _SNIFFER_PARSER_H #define _SNIFFER_PARSER_H #include #include #include #include #include #include #include #include class BString; //! MIME Sniffer related classes namespace BPrivate { namespace Storage { namespace Sniffer { class Rule; class DisjList; class RPattern; class Pattern; //------------------------------------------------------------------------------ // The mighty parsing function ;-) //------------------------------------------------------------------------------ status_t parse(const char *rule, Rule *result, BString *parseError = NULL); //------------------------------------------------------------------------------ // Classes used internally by the parser //------------------------------------------------------------------------------ //! Types of tokens typedef enum TokenType { EmptyToken, LeftParen, RightParen, LeftBracket, RightBracket, Colon, Divider, Ampersand, CaseInsensitiveFlag, CharacterString, Integer, FloatingPoint } TokenType; /*! \brief Returns a NULL-terminated string contating the name of the given token type */ const char* tokenTypeToString(TokenType type); //! Base token class returned by TokenStream /*! Each token represents a single chunk of relevant information in a given rule. For example, the floating point number "1.2e-35", originally represented as a 7-character string, is added to the token stream as a single FloatToken object. */ class Token { public: Token(TokenType type = EmptyToken, const ssize_t pos = -1); virtual ~Token(); TokenType Type() const; virtual const std::string& String() const; virtual int32 Int() const; virtual double Float() const; ssize_t Pos() const; bool operator==(Token &ref) const; protected: TokenType fType; ssize_t fPos; }; //! String token class /*! Single-quoted strings, double-quoted strings, unquoted strings, and hex literals are all converted to StringToken objects by the scanner and from then on treated uniformly. */ class StringToken : public Token { public: StringToken(const std::string &str, const ssize_t pos); virtual ~StringToken(); virtual const std::string& String() const; protected: std::string fString; }; //! Integer token class /*! Signed or unsigned integer literals are coverted to IntToken objects, which may then be treated as either ints or floats (since a priority of "1" would be valid, but scanned as an int instead of a float). */ class IntToken : public Token { public: IntToken(const int32 value, const ssize_t pos); virtual ~IntToken(); virtual int32 Int() const; virtual double Float() const; protected: int32 fValue; }; //! Floating point token class /*! Signed or unsigned, extended or non-extended notation floating point numbers are converted to FloatToken objects by the scanner. */ class FloatToken : public Token { public: FloatToken(const double value, const ssize_t pos); virtual ~FloatToken(); virtual double Float() const; protected: double fValue; }; //! Manages a stream of Token objects /*! Provides Get() and Unget() operations, some handy shortcut operations (Read() and CondRead()), and handles memory management with respect to all the Token objects in the stream (i.e. never delete a Token object returned by Get()). Also, the scanner portion of the parser is implemented in the TokenStream's SetTo() function. */ class TokenStream { public: TokenStream(const std::string &string); TokenStream(); ~TokenStream(); status_t SetTo(const std::string &string); void Unset(); status_t InitCheck() const; const Token* Get(); void Unget(); void Read(TokenType type); bool CondRead(TokenType type); ssize_t Pos() const; ssize_t EndPos() const; bool IsEmpty() const; private: void AddToken(TokenType type, ssize_t pos); void AddString(const std::string &str, ssize_t pos); void AddInt(const char *str, ssize_t pos); void AddFloat(const char *str, ssize_t pos); std::vector fTokenList; status_t fCStatus; int fPos; int fStrLen; TokenStream(const TokenStream &ref); TokenStream& operator=(const TokenStream &ref); }; //! Handles parsing a sniffer rule, yielding either a parsed rule or a descriptive error message. /*! A MIME sniffer rule is valid if it is well-formed with respect to the following grammar and fulfills some further conditions listed thereafter:


	Rule			::= LWS Priority LWS ConjList LWS	
	ConjList		::= DisjList (LWS DisjList)*	
	DisjList		::= "(" LWS PatternList LWS ")"										
						| "(" LWS RPatternList LWS ")"									
						| Range LWS "(" LWS PatternList LWS ")"						
	RPatternList	::= [Flag LWS] RPattern (LWS "|" LWS [Flag LWS] RPattern)*	
	PatternList		::= [Flag LWS] Pattern (LWS "|" LWS [Flag LWS] Pattern)*
	
	RPattern		::= LWS Range LWS Pattern
	Pattern			::= PString [ LWS "&" LWS Mask ]
	Range			::=	"[" LWS SDecimal [LWS ":" LWS SDecimal] LWS "]"

	Priority		::= Float
	Mask			::= PString
	PString			::= HexLiteral | QuotedString | UnquotedString

	HexLiteral		::= "0x" HexPair HexPair*
	HexPair			::= HexChar HexChar
	
	QuotedString	::= SingleQuotedString | DoubleQuotedString
	SQuotedString	:= "'" SQChar+ "'"
	DQuotedString	:= '"' DQChar+ '"'

	UnquotedString	::= EscapedChar UChar*
	EscapedChar		::= OctalEscape | HexEscape | "\" Char
	OctalEscape		::= "\" [[OctHiChar] OctChar] OctChar
	HexEscape		::= "\x" HexPair

	Flag			::= "-i"

	SDecimal		::= [Sign] Decimal
	Decimal			::= DecChar DecChar*
	Float			::= Fixed [("E" | "e") SDecimal]
	Fixed			::= SDecimal ["." [Decimal]] | [Sign] "." Decimal
	Sign			::= "+" | "-"

	PunctuationChar	::= "(" | ")" | "[" | "]" | "|" | "&" | ":"
	OctHiChar		::= "0" | "1" | "2" | "3" 
	OctChar			::= OctHiChar | "4" | "5" | "6" | "7"
	DecChar			::= OctChar | "8" | "9"
	HexChar			::= DecChar | "a" | "b" | "c" | "d" | "e" | "f" | "A" | "B" | "C"
						| "D" | "E" | "F"

	Char			:: 
	SQChar			::=  | EscapedChar
	DQChar			::=  | EscapedChar
	UChar			::=  | EscapedChar

	LWS				::= LWSChar*
	LWSChar			::= " " | TAB | LF

Conditions: - If a mask is specified for a pattern, this mask must have the same length as the pattern string. - 0.0 <= Priority <= 1.0 - 0 <= Range begin <= Range end Notes: - If a case-insensitive flag ("-i") appears in front of any Pattern or RPattern in a DisjList, case-insensitivity is applied to the entire DisjList. Examples: - 1.0 ('ABCD') The file must start with the string "ABCD". The priority of the rule is 1.0 (maximal). - 0.8 [0:3] ('ABCD' | 'abcd') The file must contain the string "ABCD" or "abcd" starting somewhere in the first four bytes. The rule priority is 0.8. - 0.5 ([0:3] 'ABCD' | [0:3] 'abcd' | [13] 'EFGH') The file must contain the string "ABCD" or "abcd" starting somewhere in the first four bytes or the string "EFGH" at position 13. The rule priority is 0.5. - 0.8 [0:3] ('ABCD' & 0xff00ffff | 'abcd' & 0xffff00ff) The file must contain the string "A.CD" or "ab.d" (whereas "." is an arbitrary character) starting somewhere in the first four bytes. The rule priority is 0.8. - 0.3 [10] ('mnop') ('abc') [20] ('xyz') The file must contain the string 'abc' at the beginning of the file, the string 'mnop' starting at position 10, and the string 'xyz' starting at position 20. The rule priority is 0.3. - 200e-3 (-i 'ab') The file must contain the string 'ab', 'aB', 'Ab', or 'AB' at the beginning of the file. The rule priority is 0.2. Real examples: - 0.20 ([0]"//" | [0]"/\*" | [0:32]"#include" | [0:32]"#ifndef" | [0:32]"#ifdef") text/x-source-code - 0.70 ("8BPS \000\000\000\000" & 0xffffffff0000ffffffff ) image/x-photoshop - 0.40 [0:64]( -i "<HTML" | "<HEAD" | "<TITLE" | "<BODY" | "<TABLE" | "<!--" | "<META" | "<CENTER") text/html */ class Parser { public: Parser(); ~Parser(); status_t Parse(const char *rule, Rule *result, BString *parseError = NULL); private: std::string ErrorMessage(Err *err, const char *rule); // Things that get done a lot :-) void ThrowEndOfStreamError(); inline void ThrowOutOfMemError(ssize_t pos); void ThrowUnexpectedTokenError(TokenType expected, const Token *found); void ThrowUnexpectedTokenError(TokenType expected1, TokenType expected2, const Token *found); // Parsing functions void ParseRule(Rule *result); double ParsePriority(); std::vector* ParseConjList(); DisjList* ParseDisjList(); Range ParseRange(); DisjList* ParsePatternList(Range range); DisjList* ParseRPatternList(); RPattern* ParseRPattern(); Pattern* ParsePattern(); TokenStream stream; Err *fOutOfMemErr; }; }; // namespace Sniffer }; // namespace Storage }; // namespace BPrivate #endif // _SNIFFER_PARSER_H