// Include guard for the Parser base-class header.
// NOTE(review): identifiers that begin with an underscore followed by an
// uppercase letter are reserved for the implementation in C++; consider
// renaming the guard (e.g. PARSER_H) in a separate change.
#ifndef _PARSER_H
#define _PARSER_H

// Forward declarations: within this header these types are only used through pointers.
class Domain;
class DBObject;
class TIPsDatabase;
class InternetResource;

#include "define.h"
#include "StringMap.h"
#include "Streams.h"
#include "DBObjects.h"
#include "FilterGroup.h"
// NOTE(review): the two directives below have no header name in this copy of
// the file. The names look like they were lost in extraction (angle-bracket
// content stripped). The class uses `vector` and `pthread_mutex_t`, so
// presumably these were standard/system headers - restore from version control.
#include
#include
// NOTE(review): a using-directive at file scope in a header is injected into
// every translation unit that includes it. Removing it would affect includers,
// so it is left untouched here.
using namespace std;

// Abstract base class for Parsers.
// See HTMLParser for the HTMLParser super class and services.
// Parsers can be shared: re-entrant and thread safe (original author's
// statement; not verifiable from this header alone).
// Stored on a Domain for custom Domain behaviour for each IR (InternetResource).
//
// The class is abstract because parsertype() is pure virtual, and the
// constructor is protected so only subclasses can construct one. It inherits
// *protected* from Phrase, so the Phrase interface is not visible to outside
// callers. Phrase is not declared in the visible part of this file; it
// presumably comes from one of the project headers included above.
//
// NOTE(review): no destructor is declared here although the class has virtual
// functions. Whether deleting through a Parser* is safe depends on whether
// Phrase declares a virtual destructor - confirm.
// NOTE(review): every `vector` parameter below is missing its template
// argument list in this copy of the file (apparently stripped with the
// include names). The element types must be restored before this compiles.
class Parser: protected Phrase {
protected:
  // ---- state shared by all Parser instances (static) ----
  static TIPsDatabase *m_db;             // passed to created objects that can save themselves to the DB
  static bool m_init;                    // presumably set once the static data has been initialised - confirm in the .cpp
  static StringMultiMapCI m_entities;    // exposed to callers through entities()
  static StringMap m_md5s;
  static pthread_mutex_t m_hFirst_mutex; // MUTEX to let only the first init the static vars
  static StringMap m_salutations;
  static FilterGroup m_tagfilters;

  // ---- per-instance state (const: fixed once set) ----
  const int m_pagegroupid;               // returned by pagegroupid()
  const Domain *m_domain;                // a Parser is specific to a domain (e.g. for storing Summary objects for later)
  const int m_type;                      // news, blog, forum post, company, article, etc.

  // Protected constructor: only subclasses can create a Parser.
  // Parameter names mirror the members above; the actual initialisation is
  // defined outside this header.
  Parser(const int _pagegroupid, TIPsDatabase *_db, const Domain *_domain, const int _type);

  // Report_DomainSummary is granted access to the protected/private members.
  friend class Report_DomainSummary;

public:
  // Initialisation hook; overridable. Defined outside this header, so the
  // meaning of the returned size_t is not visible here.
  virtual size_t init();

  // If the Parser can parse various types of resource,
  // use dynamic_cast<> to see if the IR is appropriate.
  // Default implementation: accepts nothing (returns false).
  virtual bool accepts(const InternetResource *ir) const {return false;}

  // Parses `ir`, with results delivered through `objects`.
  // Default implementation does nothing and returns 0.
  // NOTE(review): presumably the return value is the number of objects
  // produced - confirm against overriding subclasses.
  virtual const size_t parse(const InternetResource *ir, vector *objects) const {return 0;}

  // Searches `newbody` for tag phrases, reporting them through `tags`.
  // NOTE(review): `malloc` (default true) presumably selects whether the
  // reported strings are heap-allocated copies owned by the caller - confirm
  // in the implementation. The parameter name also hides ::malloc in the body.
  virtual const size_t searchForTagPhrases(const char *newbody, vector *tags, const bool malloc = true) const;

  // Searches `newbody` for known entities, reporting them through `entityids`.
  virtual const size_t searchForEntities(const char *newbody, vector *entityids) const;

  // accessors
  // NOTE(review): top-level `const` on a by-value return (here and on the
  // size_t returns) has no effect for callers. It is kept because overriding
  // declarations and out-of-line definitions must match exactly.
  const int pagegroupid() const {return m_pagegroupid;}
  // Pure virtual: each concrete parser reports its own type code.
  virtual const int parsertype() const = 0;
  // Returns a pointer to the shared static entity map (never null: it is the
  // address of a static member).
  static StringMultiMapCI *entities() {return &m_entities;}

  // exceptions
  // Empty tag type. Presumably thrown when a Parser is used with a Domain
  // other than its own - confirm at the throw sites.
  class DomainMismatch {};

  // utilities
  // Splits `string` on `delimiter` (default ',') into `v`.
  // NOTE(review): `malloc` presumably has the same meaning as in
  // searchForTagPhrases - confirm in the implementation.
  const size_t split(const char *string, vector *v, char delimiter = ',', const bool malloc = true) const;
};

#endif