haiku/headers/private/storage/sniffer/Parser.h

//----------------------------------------------------------------------
//  This software is part of the OpenBeOS distribution and is covered
//  by the OpenBeOS license.
//---------------------------------------------------------------------
/*!
	\file sniffer/Parser.h
	MIME sniffer rule parser declarations
*/
#ifndef _SNIFFER_PARSER_H
#define _SNIFFER_PARSER_H

#include <SupportDefs.h>
#include <sniffer/Err.h>
#include <sniffer/Range.h>
#include <sniffer/Rule.h>
#include <List.h>
#include <string>
#include <vector>

class BString;

//! MIME Sniffer related classes
namespace BPrivate {
namespace Storage {
namespace Sniffer {

// Forward declarations. This header refers to these types only through
// pointers (Rule* in parse() and Parser::Parse(), DisjList* / RPattern* /
// Pattern* in the Parser's private parsing functions), so incomplete types
// are sufficient here.
class Rule;
class DisjList;
class RPattern;
class Pattern;

//------------------------------------------------------------------------------
// The mighty parsing function ;-)
//------------------------------------------------------------------------------

/*! \brief Parses the given sniffer rule string.

	NOTE(review): the definition is not part of this header; the parameter
	descriptions below are inferred from the signature and from the
	documentation of the Parser class -- confirm against the implementation.

	\param rule The rule text to parse.
	\param result Presumably receives the parsed rule on success.
	\param parseError Optional (defaults to \c NULL); presumably receives a
		   descriptive error message when parsing fails.
	\return A \c status_t result code.
*/
status_t parse(const char *rule, Rule *result, BString *parseError = NULL);

//------------------------------------------------------------------------------
// Classes used internally by the parser
//------------------------------------------------------------------------------

//! Manages a stream of characters
/*! CharStream is used by the scanner portion of the parser, which is implemented
	in TokenStream::SetTo().

	Only the interface is declared here; the member functions are defined
	elsewhere. Notes below that go beyond what the signatures show are marked
	as assumptions.
*/
class CharStream {
public:
	//! Constructs a stream from \a string.
	CharStream(const std::string &string);
	//! Constructs a stream without an initial string.
	CharStream();
	~CharStream();

	//! Sets the stream's string to \a string and returns a status code.
	status_t SetTo(const std::string &string);
	//! Counterpart of SetTo(); presumably returns the stream to an
	//! uninitialized state -- TODO confirm.
	void Unset();
	//! Returns the stream's status (presumably the value kept in fCStatus).
	status_t InitCheck() const;
	//! Presumably true when no characters are left to read -- TODO confirm.
	bool IsEmpty() const;
	//! Returns a position in the stream (presumably the current read
	//! position kept in fPos).
	ssize_t Pos() const;
	//! Returns a reference to a string (presumably fString).
	const std::string& String() const;

	//! Returns a character from the stream; presumably the next one,
	//! advancing the read position -- TODO confirm.
	char Get();
	//! Counterpart of Get(); presumably steps the read position back by one
	//! character -- TODO confirm.
	void Unget();

private:
	std::string fString;	// the characters being streamed
	ssize_t fPos;			// position within fString
//	ssize_t fLen;
	status_t fCStatus;		// status code (see InitCheck())

	// Copy constructor and assignment operator are declared private, so
	// code outside the class cannot copy a CharStream.
	CharStream(const CharStream &ref);
	CharStream& operator=(const CharStream &ref);
};

//! Types of tokens
/*! Declared as a plain named enum. In C++ the enum name is already a type
	name, so no typedef is needed (a typedef without a declarator is ignored
	by the compiler and only produces a warning).

	The punctuation noted next to the enumerators is taken from the rule
	grammar documented with the Parser class; the mapping itself is done by
	the scanner, which is not part of this header.
*/
enum TokenType {
	EmptyToken,
	LeftParen,				// "("
	RightParen,				// ")"
	LeftBracket,			// "["
	RightBracket,			// "]"
	Colon,					// ":"
	Divider,				// presumably "|", separating patterns in a list
	Ampersand,				// "&"
	CaseInsensitiveFlag,	// "-i"
	CharacterString,
	Integer,
	FloatingPoint
};

/*! \brief Returns a NULL-terminated string containing the
		   name of the given token type
*/
const char* tokenTypeToString(TokenType type);

//! Base token class returned by TokenStream
/*! Each token represents a single chunk of relevant information
    in a given rule. For example, the floating point number "1.2e-35",
    originally represented as a 7-character string, is added to the
    token stream as a single FloatToken object.

    Token is a polymorphic base class: StringToken, IntToken and FloatToken
    derive from it, and TokenStream keeps its tokens as Token pointers. The
    destructor is therefore virtual, so that destroying a derived token
    through a Token pointer also runs the derived class's destructor (e.g.
    releasing the std::string held by a StringToken).
*/
class Token {
public:
	/*! \param type Kind of token (defaults to EmptyToken).
		\param pos Position associated with the token (defaults to -1);
			   presumably its offset in the rule string -- TODO confirm.
	*/
	Token(TokenType type = EmptyToken, const ssize_t pos = -1);
	//! Virtual so derived tokens are destroyed correctly through a Token*.
	//! Defined inline; no out-of-line definition is required.
	virtual ~Token() {}
	//! Returns the token's type (presumably fType).
	TokenType Type() const;
	// Value accessors, overridden by the derived token classes that carry
	// the corresponding kind of value. What the base implementations do for
	// a token without such a value is defined outside this header.
	virtual const std::string& String() const;
	virtual int32 Int() const;
	virtual double Float() const;
	//! Returns the token's position (presumably fPos).
	ssize_t Pos() const;
	// NOTE(review): takes a non-const reference, so a const Token (such as
	// one obtained through TokenStream::Get()) cannot be the right-hand
	// operand. Changing this requires touching the out-of-line definition.
	bool operator==(Token &ref) const;
protected:
	TokenType fType;
	ssize_t fPos;
};

//! String token class
/*! Single-quoted strings, double-quoted strings, unquoted strings, and
	hex literals are all converted to StringToken objects by the scanner
	and from then on treated uniformly.
*/
class StringToken : public Token {
public:
	/*! \param str The token's string value.
		\param pos Position associated with the token (passed in by the
			   scanner; presumably an offset in the rule string).
	*/
	StringToken(const std::string &str, const ssize_t pos);
	//! Overrides Token::String(); presumably returns fString.
	virtual const std::string& String() const;
protected:
	std::string fString;	// string value carried by this token
};

//! Integer token class
/*! Signed or unsigned integer literals are converted to IntToken objects,
    which may then be treated as either ints or floats (since a priority
    of "1" would be valid, but scanned as an int instead of a float).
*/
class IntToken : public Token {
public:
	/*! \param value The token's integer value.
		\param pos Position associated with the token (passed in by the
			   scanner; presumably an offset in the rule string).
	*/
	IntToken(const int32 value, const ssize_t pos);
	//! Overrides Token::Int(); presumably returns fValue.
	virtual int32 Int() const;
	//! Overrides Token::Float() so an integer token can also be read as a
	//! floating point value (see the class description).
	virtual double Float() const;
protected:
	int32 fValue;	// integer value carried by this token
};

//! Floating point token class
/*! Signed or unsigned, extended or non-extended notation floating point
    numbers are converted to FloatToken objects by the scanner.
*/
class FloatToken : public Token {
public:
	/*! \param value The token's floating point value.
		\param pos Position associated with the token (passed in by the
			   scanner; presumably an offset in the rule string).
	*/
	FloatToken(const double value, const ssize_t pos);
	//! Overrides Token::Float(); presumably returns fValue.
	virtual double Float() const;
protected:
	double fValue;	// floating point value carried by this token
};

//! Manages a stream of Token objects
/*! Provides Get() and Unget() operations, some handy shortcut operations (Read()
    and CondRead()), and handles memory management with respect to all the
    Token objects in the stream (i.e. never delete a Token object returned by Get()).

    Also, the scanner portion of the parser is implemented in the TokenStream's
    SetTo() function.

    Only the interface is declared here; notes below that go beyond what the
    signatures and the description above show are marked as assumptions.
*/
class TokenStream {
public:
	//! Constructs a token stream from \a string.
	TokenStream(const std::string &string);
	//! Constructs a token stream without an initial string.
	TokenStream();
	~TokenStream();

	//! Scans \a string into tokens (the scanner is implemented here, see the
	//! class description) and returns a status code.
	status_t SetTo(const std::string &string);
	//! Counterpart of SetTo(); presumably discards the current tokens and
	//! returns the stream to an uninitialized state -- TODO confirm.
	void Unset();
	//! Returns the stream's status (presumably the value kept in fCStatus).
	status_t InitCheck() const;

	//! Returns a token from the stream; the stream retains ownership, so
	//! the caller must not delete it.
	const Token* Get();
	//! Counterpart of Get(); presumably steps back by one token -- TODO confirm.
	void Unget();

	//! Shortcut operation; presumably consumes the next token and requires
	//! it to be of the given \a type. How a mismatch is reported is defined
	//! in the implementation -- TODO confirm.
	void Read(TokenType type);
	//! Shortcut operation; presumably consumes the next token only if it is
	//! of the given \a type and returns whether it did -- TODO confirm.
	bool CondRead(TokenType type);

	//! Returns a position; presumably the position in the rule string of
	//! the next token -- TODO confirm.
	ssize_t Pos() const;
	//! Returns an end position; presumably related to fStrLen -- TODO confirm.
	ssize_t EndPos() const;

	//! Presumably true when no tokens are left to read -- TODO confirm.
	bool IsEmpty() const;

private:
	// Helpers that append a token of the respective kind to the stream;
	// \a pos is the position recorded with the token. AddInt() and
	// AddFloat() take the literal's text (presumably converting it to a
	// number themselves -- TODO confirm).
	void AddToken(TokenType type, ssize_t pos);
	void AddString(const std::string &str, ssize_t pos);
	void AddInt(const char *str, ssize_t pos);
	void AddFloat(const char *str, ssize_t pos);

	std::vector<Token*> fTokenList;	// tokens owned by the stream
	// NOTE(review): the two members below are plain ints, although Pos()
	// and EndPos() return ssize_t.
	int fPos;
	int fStrLen;
	status_t fCStatus;				// status code (see InitCheck())


	// Copy constructor and assignment operator are declared private, so
	// code outside the class cannot copy a TokenStream.
	TokenStream(const TokenStream &ref);
	TokenStream& operator=(const TokenStream &ref);
};

//! Handles parsing a sniffer rule, yielding either a parsed rule or a descriptive error message.
/*! A MIME sniffer rule is valid if it is well-formed with respect to the
	following grammar and fulfills some further conditions listed thereafter:

	<code>
	Rule			::= LWS Priority LWS ConjList LWS
	ConjList		::= DisjList (LWS DisjList)*
	DisjList		::= "(" LWS PatternList LWS ")"
						| "(" LWS RPatternList LWS ")"
						| Range LWS "(" LWS PatternList LWS ")"
	RPatternList	::= [Flag LWS] RPattern (LWS "|" LWS [Flag LWS] RPattern)*
	PatternList		::= [Flag LWS] Pattern (LWS "|" LWS [Flag LWS] Pattern)*

	RPattern		::= LWS Range LWS Pattern
	Pattern			::= PString [ LWS "&" LWS Mask ]
	Range			::=	"[" LWS SDecimal [LWS ":" LWS SDecimal] LWS "]"

	Priority		::= Float
	Mask			::= PString
	PString			::= HexLiteral | QuotedString | UnquotedString

	HexLiteral		::= "0x" HexPair HexPair*
	HexPair			::= HexChar HexChar

	QuotedString	::= SQuotedString | DQuotedString
	SQuotedString	::= "'" SQChar+ "'"
	DQuotedString	::= '"' DQChar+ '"'

	UnquotedString	::= EscapedChar UChar*
	EscapedChar		::= OctalEscape | HexEscape | "\" Char
	OctalEscape		::= "\" [[OctHiChar] OctChar] OctChar
	HexEscape		::= "\x" HexPair

	Flag			::= "-i"

	SDecimal		::= [Sign] Decimal
	Decimal			::= DecChar DecChar*
	Float			::= Fixed [("E" | "e") SDecimal]
	Fixed			::= SDecimal ["." [Decimal]] | [Sign] "." Decimal
	Sign			::= "+" | "-"

	PunctuationChar	::= "(" | ")" | "[" | "]" | "|" | "&" | ":"
	OctHiChar		::= "0" | "1" | "2" | "3"
	OctChar			::= OctHiChar | "4" | "5" | "6" | "7"
	DecChar			::= OctChar | "8" | "9"
	HexChar			::= DecChar | "a" | "b" | "c" | "d" | "e" | "f" | "A" | "B" | "C"
						| "D" | "E" | "F"

	Char			::= <any character>
	SQChar			::= <Char except "\", "'"> | EscapedChar
	DQChar			::= <Char except "\", '"'> | EscapedChar
	UChar			::= <Char except "\", LWSChar,  and PunctuationChar> | EscapedChar

	LWS				::= LWSChar*
	LWSChar			::= " " | TAB | LF
	</code>

	Conditions:
	- If a mask is specified for a pattern, this mask must have the same
	  length as the pattern string.
	- 0.0 <= Priority <= 1.0
	- 0 <= Range begin <= Range end

	Notes:
	- If a case-insensitive flag ("-i") appears in front of any Pattern or RPattern
	  in a DisjList, case-insensitivity is applied to the entire DisjList.

	Examples:
	- 1.0 ('ABCD')
	  The file must start with the string "ABCD". The priority of the rule
	  is 1.0 (maximal).
	- 0.8 [0:3] ('ABCD' | 'abcd')
	  The file must contain the string "ABCD" or "abcd" starting somewhere in
	  the first four bytes. The rule priority is 0.8.
	- 0.5 ([0:3] 'ABCD' | [0:3] 'abcd' | [13] 'EFGH')
	  The file must contain the string "ABCD" or "abcd" starting somewhere in
	  the first four bytes or the string "EFGH" at position 13. The rule
	  priority is 0.5.
	- 0.8 [0:3] ('ABCD' & 0xff00ffff | 'abcd' & 0xffff00ff)
	  The file must contain the string "A.CD" or "ab.d" (whereas "." is an
	  arbitrary character) starting somewhere in the first four bytes. The
	  rule priority is 0.8.
	- 0.3 [10] ('mnop') ('abc') [20] ('xyz')
	  The file must contain the string 'abc' at the beginning of the file,
	  the string 'mnop' starting at position 10, and the string 'xyz'
	  starting at position 20. The rule priority is 0.3.
	- 200e-3 (-i 'ab')
	  The file must contain the string 'ab', 'aB', 'Ab', or 'AB' at the
	  beginning of the file. The rule priority is 0.2.

	Real examples:
	- 0.20 ([0]"//" | [0]"/\*" | [0:32]"#include" | [0:32]"#ifndef"
	        | [0:32]"#ifdef")
	  text/x-source-code
	- 0.70 ("8BPS  \000\000\000\000" & 0xffffffff0000ffffffff )
	  image/x-photoshop
	- 0.40 [0:64]( -i "&lt;HTML" | "&lt;HEAD" | "&lt;TITLE" | "&lt;BODY"
			| "&lt;TABLE" | "&lt;!--" | "&lt;META" | "&lt;CENTER")
	  text/html

*/
class Parser {
public:
	Parser();
	~Parser();
	/*! \brief Parses the given sniffer rule.

		NOTE(review): the definition is not part of this header; the
		parameter roles are inferred from the signature -- confirm against
		the implementation.

		\param rule The rule text to parse.
		\param result Presumably receives the parsed rule on success.
		\param parseError Optional (defaults to \c NULL); presumably
			   receives a descriptive error message when parsing fails.
		\return A \c status_t result code.
	*/
	status_t Parse(const char *rule, Rule *result, BString *parseError = NULL);
private:
	// Builds an error message string from the given error and rule text.
	std::string ErrorMessage(Err *err, const char *rule);

	// Things that get done a lot :-)
	// Error helpers; judging by their names they raise the respective
	// error (presumably by throwing an Err -- TODO confirm).
	void ThrowEndOfStreamError();
	inline void ThrowOutOfMemError(ssize_t pos);
	void ThrowUnexpectedTokenError(TokenType expected, const Token *found);
	void ThrowUnexpectedTokenError(TokenType expected1, TokenType expected2, const Token *found);

	// Parsing functions
	// Named after the nonterminals of the rule grammar (Rule, Priority,
	// ConjList, DisjList, Range, PatternList, RPatternList, RPattern,
	// Pattern). Ownership of the returned pointers is not visible from this
	// header -- TODO confirm against the implementation.
	void ParseRule(Rule *result);
	double ParsePriority();
	std::vector<DisjList*>* ParseConjList();
	DisjList* ParseDisjList();
	Range ParseRange();
	DisjList* ParsePatternList(Range range);
	DisjList* ParseRPatternList();
	RPattern* ParseRPattern();
	Pattern* ParsePattern();

	// Token stream the parsing functions read from.
	TokenStream stream;

	// NOTE(review): presumably an error object allocated ahead of time so
	// that an out-of-memory condition can be reported without a further
	// allocation -- confirm.
	Err *fOutOfMemErr;
};

};	// namespace Sniffer
};	// namespace Storage
};	// namespace BPrivate

#endif	// _SNIFFER_PARSER_H